From 400a55dcac1ba397076361c419f6caf38f0f2626 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 08:24:05 +0200 Subject: [PATCH 001/130] feat: add anonymize core crate --- .cargo/config.toml | 8 + .github/workflows/ci.yml | 8 + .gitignore | 1 + Cargo.lock | 7 + Cargo.toml | 122 +++++++++ clippy.toml | 26 ++ crates/anonymize-core/Cargo.toml | 15 ++ crates/anonymize-core/src/lib.rs | 14 ++ crates/anonymize-core/src/normalize.rs | 292 ++++++++++++++++++++++ crates/anonymize-core/src/placeholders.rs | 156 ++++++++++++ crates/anonymize-core/src/redact.rs | 158 ++++++++++++ crates/anonymize-core/src/types.rs | 166 ++++++++++++ crates/anonymize-core/src/utf16.rs | 66 +++++ crates/anonymize-core/tests/redaction.rs | 252 +++++++++++++++++++ package.json | 4 + rustfmt.toml | 4 + 16 files changed, 1299 insertions(+) create mode 100644 .cargo/config.toml create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 clippy.toml create mode 100644 crates/anonymize-core/Cargo.toml create mode 100644 crates/anonymize-core/src/lib.rs create mode 100644 crates/anonymize-core/src/normalize.rs create mode 100644 crates/anonymize-core/src/placeholders.rs create mode 100644 crates/anonymize-core/src/redact.rs create mode 100644 crates/anonymize-core/src/types.rs create mode 100644 crates/anonymize-core/src/utf16.rs create mode 100644 crates/anonymize-core/tests/redaction.rs create mode 100644 rustfmt.toml diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 00000000..20687cf9 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,8 @@ +# Use sparse index for faster resolution. 
+[registries.crates-io] +protocol = "sparse" + +[alias] +ci-fmt = "fmt --all -- --check" +ci-clippy = "clippy --workspace --all-targets --all-features --locked -- -D warnings" +ci-test = "test --workspace --all-features --locked" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dd2c72ef..1320484a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,6 +43,14 @@ jobs: # the real npm token to dependency lifecycle scripts. NPM_TOKEN: "" + - name: Setup Rust + run: | + rustup toolchain install stable --profile minimal --component rustfmt,clippy + rustup default stable + + - name: Rust checks + run: bun run rust:check + - name: Check runtime package versions # Verify, do not mutate: a PR that bumps VERSION without bumping # the package versions (or vice versa) must fail CI, not be diff --git a/.gitignore b/.gitignore index 8eed3f1c..7f14bb34 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ node_modules *.tsbuildinfo dist .turbo +target/ # Claude Code local worktrees. .claude/worktrees/ diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..2552aebd --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "stella-anonymize-core" +version = "1.5.0" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..2bc63b08 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,122 @@ +[workspace] +members = ["crates/anonymize-core"] +resolver = "3" + +[workspace.package] +version = "1.5.0" +edition = "2024" +license = "MIT" +publish = false +repository = "https://github.com/stella/anonymize" + +[workspace.lints.rust] +warnings = { level = "deny", priority = -1 } +dead_code = "deny" +future_incompatible = { level = "deny", priority = -1 } +nonstandard_style = { level = "deny", priority = -1 } +rust_2018_idioms = { level = "deny", priority = -1 } +unreachable_code = "deny" +unreachable_patterns = "deny" +unreachable_pub = "deny" +unsafe_code = "deny" +unused_imports = "deny" +unused_macros = "deny" +unused_mut = "deny" +unused_variables = "deny" + +[workspace.lints.rustdoc] +broken_intra_doc_links = "deny" +bare_urls = "deny" + +[workspace.lints.clippy] +all = { level = "deny", priority = -1 } +pedantic = { level = "warn", priority = -1 } +cargo = { level = "warn", priority = -1 } +nursery = { level = "warn", priority = -1 } + +dbg_macro = "deny" +todo = "deny" +unimplemented = "deny" +panic = "deny" +unwrap_used = "deny" +expect_used = "deny" +indexing_slicing = "warn" +integer_division = "warn" +arithmetic_side_effects = "warn" +as_conversions = "warn" +cast_possible_truncation = "warn" +cast_possible_wrap = "warn" +cast_precision_loss = "warn" +cast_sign_loss = "warn" +clone_on_ref_ptr = "deny" +create_dir = "deny" +decimal_literal_representation = "warn" +derive_partial_eq_without_eq = "deny" +disallowed_macros = "deny" +disallowed_methods = "deny" +disallowed_types = "deny" +empty_enum_variants_with_brackets = "deny" +empty_structs_with_brackets = "deny" +enum_glob_use = "deny" +exit = "deny" +filetype_is_file = "deny" +float_cmp = "warn" +fn_to_numeric_cast_any = "deny" +format_collect = "deny" +if_then_some_else_none = "deny" 
+implicit_clone = "deny" +inefficient_to_string = "deny" +large_enum_variant = "deny" +large_stack_arrays = "deny" +large_stack_frames = "deny" +manual_let_else = "deny" +manual_memcpy = "deny" +map_unwrap_or = "deny" +mem_forget = "deny" +missing_assert_message = "warn" +missing_errors_doc = "allow" +missing_panics_doc = "allow" +module_name_repetitions = "allow" +multiple_crate_versions = "allow" +needless_collect = "deny" +needless_continue = "deny" +needless_pass_by_ref_mut = "deny" +needless_pass_by_value = "deny" +or_fun_call = "deny" +print_stderr = "deny" +print_stdout = "deny" +redundant_clone = "deny" +same_name_method = "deny" +self_named_module_files = "allow" +semicolon_if_nothing_returned = "deny" +shadow_reuse = "allow" +shadow_same = "warn" +shadow_unrelated = "warn" +std_instead_of_alloc = "allow" +std_instead_of_core = "allow" +string_slice = "warn" +tests_outside_test_module = "allow" +trivially_copy_pass_by_ref = "deny" +unnecessary_wraps = "deny" +unneeded_field_pattern = "deny" +unreachable = "deny" +unused_async = "deny" +unused_self = "deny" +use_self = "warn" +used_underscore_binding = "deny" +verbose_file_reads = "deny" +wildcard_imports = "deny" + +[profile.release] +lto = "fat" +codegen-units = 1 +panic = "unwind" +strip = "symbols" + +[profile.dev] +debug = "line-tables-only" + +[profile.ci] +inherits = "dev" +debug-assertions = true +overflow-checks = true diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 00000000..251b36f0 --- /dev/null +++ b/clippy.toml @@ -0,0 +1,26 @@ +# Portable stella Rust lint configuration. Keep repo-specific wrappers and +# exceptions in consuming repositories. 
+ +disallowed-macros = [ + { path = "std::dbg", reason = "remove debug output before commit" }, + { path = "std::print", reason = "library crates should return data or use structured logging" }, + { path = "std::println", reason = "library crates should return data or use structured logging" }, + { path = "std::eprint", reason = "library crates should return data or use structured logging" }, + { path = "std::eprintln", reason = "library crates should return data or use structured logging" }, +] + +disallowed-methods = [ + { path = "std::mem::forget", reason = "leaks ownership; use a narrow ownership type or ManuallyDrop with justification" }, + { path = "std::string::String::from_utf8_lossy", reason = "lossy decoding can corrupt offsets and user data; validate or keep bytes" }, +] + +disallowed-types = [ + { path = "std::collections::LinkedList", reason = "prefer Vec, VecDeque, or an explicit intrusive structure" }, +] + +pass-by-value-size-limit = 64 +stack-size-threshold = 131072 +enum-variant-size-threshold = 128 +too-large-for-stack = 4096 +avoid-breaking-exported-api = false +accept-comment-above-attributes = true diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml new file mode 100644 index 00000000..4d424ef2 --- /dev/null +++ b/crates/anonymize-core/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "stella-anonymize-core" +version.workspace = true +edition.workspace = true +description = "Core anonymization semantics" +license.workspace = true +publish.workspace = true +repository.workspace = true +keywords = ["anonymization", "pii", "redaction", "text"] +categories = ["text-processing"] + +[dependencies] + +[lints] +workspace = true diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs new file mode 100644 index 00000000..09eb4dcc --- /dev/null +++ b/crates/anonymize-core/src/lib.rs @@ -0,0 +1,14 @@ +#![allow(clippy::redundant_pub_crate)] + +pub(crate) mod normalize; +mod placeholders; +mod redact; +mod 
types; +pub(crate) mod utf16; + +pub use placeholders::build_placeholder_map; +pub use redact::{deanonymise, redact_text}; +pub use types::{ + Entity, EntityKind, Error, OperatorConfig, OperatorEntry, OperatorType, + PlaceholderEntry, PlaceholderMap, RedactionEntry, RedactionResult, Result, +}; diff --git a/crates/anonymize-core/src/normalize.rs b/crates/anonymize-core/src/normalize.rs new file mode 100644 index 00000000..fec3d403 --- /dev/null +++ b/crates/anonymize-core/src/normalize.rs @@ -0,0 +1,292 @@ +const PHONE_NOISE: [char; 3] = ['(', ')', '-']; +const ID_SEPARATORS: [char; 3] = ['-', '/', '.']; + +pub(crate) fn label_key(label: &str) -> String { + let uppercase = uppercase(label); + collapse_whitespace(&uppercase, "_", false) +} + +pub(crate) fn placeholder_fallback(label: &str) -> String { + format!("[{}]", label_key(label)) +} + +pub(crate) fn normalize_entity_text(label: &str, text: &str) -> String { + let upper = label_key(label); + + if upper == "EMAIL_ADDRESS" || upper == "EMAIL" { + return text.to_lowercase().trim().to_owned(); + } + if upper == "PHONE_NUMBER" || upper == "PHONE" { + return text + .chars() + .filter(|ch| !ch.is_whitespace() && !PHONE_NOISE.contains(ch)) + .collect(); + } + if upper == "CRYPTO" { + return normalize_crypto_text(text); + } + if upper == "NATIONAL_IDENTIFICATION_NUMBER" && contains_nhs_cue(text) { + return text.chars().filter(char::is_ascii_digit).collect(); + } + if is_identifier_label(&upper) { + return strip_id_separators(text).to_uppercase(); + } + if upper == "PASSPORT_NUMBER" { + return normalize_passport_text(text); + } + if is_collapsible_text_label(&upper) { + return collapse_whitespace(text, " ", false) + .to_lowercase() + .trim() + .to_owned(); + } + + text.trim().to_owned() +} + +fn uppercase(text: &str) -> String { + let mut output = String::new(); + for ch in text.chars() { + output.extend(ch.to_uppercase()); + } + output +} + +fn collapse_whitespace(text: &str, replacement: &str, trim: bool) -> String { 
+ let mut output = String::new(); + let mut in_whitespace = false; + + for ch in text.chars() { + if ch.is_whitespace() { + if !in_whitespace { + output.push_str(replacement); + in_whitespace = true; + } + continue; + } + + output.push(ch); + in_whitespace = false; + } + + if trim { + return output.trim().to_owned(); + } + output +} + +fn strip_id_separators(text: &str) -> String { + text + .chars() + .filter(|ch| !ch.is_whitespace() && !ID_SEPARATORS.contains(ch)) + .collect() +} + +fn is_identifier_label(upper: &str) -> bool { + matches!( + upper, + "IBAN" + | "BANK_ACCOUNT_NUMBER" + | "TAX_IDENTIFICATION_NUMBER" + | "REGISTRATION_NUMBER" + | "NATIONAL_IDENTIFICATION_NUMBER" + | "SOCIAL_SECURITY_NUMBER" + | "BIRTH_NUMBER" + | "IDENTITY_CARD_NUMBER" + | "CREDIT_CARD_NUMBER" + ) +} + +fn is_collapsible_text_label(upper: &str) -> bool { + matches!( + upper, + "PERSON" | "ORGANIZATION" | "ADDRESS" | "LAND_PARCEL" | "MISC" + ) +} + +fn contains_nhs_cue(text: &str) -> bool { + let lower = text.to_lowercase(); + contains_word(&lower, "nhs") + || collapse_whitespace(&lower, " ", true) + .contains("national health service") +} + +fn normalize_crypto_text(text: &str) -> String { + let trimmed = text.trim(); + + if let Some(address) = find_ethereum_address(trimmed) { + return address.to_lowercase(); + } + if let Some(address) = find_bech32_address(trimmed) { + return address.to_lowercase(); + } + if let Some(address) = find_base58_address(trimmed) { + return address.to_owned(); + } + + trimmed.to_owned() +} + +fn find_ethereum_address(text: &str) -> Option<&str> { + for (start, _) in text.match_indices("0x") { + let end = start.saturating_add(42); + let Some(candidate) = text.get(start..end) else { + continue; + }; + if candidate.chars().skip(2).all(|ch| ch.is_ascii_hexdigit()) { + return Some(candidate); + } + } + + None +} + +fn find_bech32_address(text: &str) -> Option<&str> { + find_ascii_token(text, |token| { + let lower = token.to_lowercase(); + lower.len() >= 14 + && 
lower.len() <= 74 + && lower.starts_with("bc1") + && lower + .chars() + .skip(3) + .all(|ch| matches!(ch, 'a'..='h' | 'j'..='n' | 'p'..='z' | '0'..='9')) + }) +} + +fn find_base58_address(text: &str) -> Option<&str> { + find_ascii_token(text, |token| { + let len = token.len(); + (26..=35).contains(&len) + && (token.starts_with('1') || token.starts_with('3')) + && token.chars().all(is_base58_char) + }) +} + +fn find_ascii_token( + text: &str, + predicate: impl Fn(&str) -> bool, +) -> Option<&str> { + let mut token_start = None; + + for (index, ch) in text.char_indices() { + if ch.is_ascii_alphanumeric() { + if token_start.is_none() { + token_start = Some(index); + } + continue; + } + + if let Some(start) = token_start { + let token = text.get(start..index)?; + if predicate(token) { + return Some(token); + } + token_start = None; + } + } + + let start = token_start?; + let token = text.get(start..)?; + predicate(token).then_some(token) +} + +const fn is_base58_char(ch: char) -> bool { + matches!( + ch, + 'a'..='k' + | 'm'..='z' + | 'A'..='H' + | 'J'..='N' + | 'P'..='Z' + | '1'..='9' + ) +} + +fn normalize_passport_text(text: &str) -> String { + let passport_identifier = + find_ascii_token(text, is_passport_identifier).unwrap_or(text); + strip_id_separators(passport_identifier).to_uppercase() +} + +fn is_passport_identifier(token: &str) -> bool { + let chars: Vec = token.chars().collect(); + matches_letters_digits(&chars, 1, 2, 6, 8) + || matches_digits_letters_digits(&chars, 2, 2, 5) + || (token.len() >= 7 + && token.len() <= 9 + && token.chars().all(|ch| ch.is_ascii_digit())) +} + +fn matches_letters_digits( + chars: &[char], + min_letters: usize, + max_letters: usize, + min_digits: usize, + max_digits: usize, +) -> bool { + for letter_count in min_letters..=max_letters { + let digit_count = chars.len().saturating_sub(letter_count); + if digit_count < min_digits || digit_count > max_digits { + continue; + } + let Some((letters, digits)) = 
chars.split_at_checked(letter_count) else { + continue; + }; + if letters.iter().all(char::is_ascii_alphabetic) + && digits.iter().all(char::is_ascii_digit) + { + return true; + } + } + + false +} + +fn matches_digits_letters_digits( + chars: &[char], + first_digits: usize, + letters_count: usize, + last_digits: usize, +) -> bool { + let expected_len = first_digits + .saturating_add(letters_count) + .saturating_add(last_digits); + if chars.len() != expected_len { + return false; + } + + let Some((first, tail)) = chars.split_at_checked(first_digits) else { + return false; + }; + let Some((letters, last)) = tail.split_at_checked(letters_count) else { + return false; + }; + + first.iter().all(char::is_ascii_digit) + && letters.iter().all(char::is_ascii_alphabetic) + && last.iter().all(char::is_ascii_digit) +} + +fn contains_word(text: &str, word: &str) -> bool { + let mut start = 0; + while let Some(relative) = text.get(start..).and_then(|tail| tail.find(word)) + { + let word_start = start.saturating_add(relative); + let word_end = word_start.saturating_add(word.len()); + let before_ok = text + .get(..word_start) + .and_then(|prefix| prefix.chars().next_back()) + .is_none_or(|ch| !ch.is_alphanumeric()); + let after_ok = text + .get(word_end..) 
+ .and_then(|suffix| suffix.chars().next()) + .is_none_or(|ch| !ch.is_alphanumeric()); + if before_ok && after_ok { + return true; + } + start = word_end; + } + + false +} diff --git a/crates/anonymize-core/src/placeholders.rs b/crates/anonymize-core/src/placeholders.rs new file mode 100644 index 00000000..ec839bd2 --- /dev/null +++ b/crates/anonymize-core/src/placeholders.rs @@ -0,0 +1,156 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::normalize::{label_key, normalize_entity_text}; +use crate::types::{Entity, EntityKind, PlaceholderMap}; + +#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +struct NormalizedKey { + label_key: String, + text: String, +} + +#[must_use] +pub fn build_placeholder_map( + entities: &[Entity], + reserved_text: &str, +) -> PlaceholderMap { + let mut counters = BTreeMap::::new(); + let mut normalized_to_placeholder = BTreeMap::::new(); + let reserved_placeholders = collect_reserved_placeholders(reserved_text); + let mut placeholder_map = PlaceholderMap::default(); + + let mut sorted = entities.to_vec(); + sorted.sort_by(|left, right| left.start.cmp(&right.start)); + + for entity in &sorted { + if placeholder_map.has(&entity.label, &entity.text) { + continue; + } + + let label_key = label_key(&entity.label); + let source_normalized_key = source_normalized_key(entity, &label_key); + + if let Some(source_key) = source_normalized_key.as_ref() + && let Some(existing) = normalized_to_placeholder.get(source_key) + { + placeholder_map.push(&entity.label, &entity.text, existing); + continue; + } + + let normalized = normalize_entity_text(&entity.label, &entity.text); + let normalized_key = NormalizedKey { + label_key: label_key.clone(), + text: normalized, + }; + + if let Some(existing) = normalized_to_placeholder.get(&normalized_key) { + placeholder_map.push(&entity.label, &entity.text, existing); + if let Some(source_key) = source_normalized_key { + normalized_to_placeholder.insert(source_key, existing.clone()); + } + 
continue; + } + + let placeholder = + next_placeholder(&label_key, &mut counters, &reserved_placeholders); + placeholder_map.push(&entity.label, &entity.text, &placeholder); + normalized_to_placeholder.insert(normalized_key, placeholder.clone()); + if let Some(source_key) = source_normalized_key { + normalized_to_placeholder.insert(source_key, placeholder); + } + } + + placeholder_map +} + +fn source_normalized_key( + entity: &Entity, + label_key: &str, +) -> Option { + let EntityKind::Coreference { source_text } = &entity.kind else { + return None; + }; + + Some(NormalizedKey { + label_key: label_key.to_owned(), + text: normalize_entity_text(&entity.label, source_text), + }) +} + +fn next_placeholder( + label_key: &str, + counters: &mut BTreeMap, + reserved_placeholders: &BTreeSet, +) -> String { + let mut count = counters.get(label_key).copied().unwrap_or(0); + + loop { + count = count.saturating_add(1); + let placeholder = format!("[{label_key}_{count}]"); + if reserved_placeholders.contains(&placeholder) { + continue; + } + + counters.insert(label_key.to_owned(), count); + return placeholder; + } +} + +fn collect_reserved_placeholders(text: &str) -> BTreeSet { + let mut placeholders = BTreeSet::new(); + let mut remaining = text; + + while let Some(start) = remaining.find('[') { + let candidate_start = start.saturating_add('['.len_utf8()); + let Some(after_open) = remaining.get(candidate_start..) 
else { + break; + }; + let Some(end) = after_open.find(']') else { + break; + }; + let Some(inner) = after_open.get(..end) else { + break; + }; + let valid = is_placeholder_inner(inner); + if valid { + placeholders.insert(format!("[{inner}]")); + } + + let next_start = if valid { + candidate_start + .saturating_add(end) + .saturating_add(']'.len_utf8()) + } else { + candidate_start + }; + remaining = remaining.get(next_start..).unwrap_or_default(); + } + + placeholders +} + +fn is_placeholder_inner(inner: &str) -> bool { + if inner.is_empty() + || inner + .chars() + .any(|ch| ch.is_whitespace() || ch == '[' || ch == ']') + { + return false; + } + + let Some(separator) = inner.rfind('_') else { + return false; + }; + if separator == 0 { + return false; + } + + let Some(number) = inner.get(separator.saturating_add(1)..) else { + return false; + }; + let mut chars = number.chars(); + let Some(first) = chars.next() else { + return false; + }; + first.is_ascii_digit() && first != '0' && chars.all(|ch| ch.is_ascii_digit()) +} diff --git a/crates/anonymize-core/src/redact.rs b/crates/anonymize-core/src/redact.rs new file mode 100644 index 00000000..a28b02d1 --- /dev/null +++ b/crates/anonymize-core/src/redact.rs @@ -0,0 +1,158 @@ +use crate::normalize::placeholder_fallback; +use crate::placeholders::build_placeholder_map; +use crate::types::{ + Entity, EntityKind, OperatorConfig, OperatorEntry, OperatorType, + RedactionEntry, RedactionResult, Result, +}; +use crate::utf16::Utf16Offsets; + +pub fn redact_text( + full_text: &str, + entities: &[Entity], + config: &OperatorConfig, +) -> Result { + if entities.is_empty() { + return Ok(RedactionResult { + redacted_text: full_text.to_owned(), + redaction_map: Vec::new(), + operator_map: Vec::new(), + entity_count: 0, + }); + } + + let offsets = Utf16Offsets::new(full_text); + validate_spans(entities, &offsets)?; + + let placeholder_map = build_placeholder_map(entities, full_text); + let mut sorted = entities.to_vec(); + 
sorted.sort_by(|left, right| left.start.cmp(&right.start)); + + let mut non_overlapping = Vec::::new(); + let mut last_end = 0; + for entity in sorted { + if entity.start >= last_end { + last_end = entity.end; + non_overlapping.push(entity); + } + } + + let mut parts = Vec::::new(); + let mut redaction_map = Vec::::new(); + let mut operator_map = Vec::::new(); + let mut cursor = 0; + + for entity in &non_overlapping { + if entity.start > cursor { + parts.push(offsets.slice(full_text, cursor, entity.start)?); + } + + let placeholder = placeholder_map + .get(&entity.label, &entity.text) + .map_or_else(|| placeholder_fallback(&entity.label), ToOwned::to_owned); + let operator = operator_for(config, &entity.label); + let replacement = match operator { + OperatorType::Replace => placeholder.clone(), + OperatorType::Redact => config.redact_string.clone(), + }; + + parts.push(replacement); + set_operator_entry(&mut operator_map, &placeholder, operator); + + if operator == OperatorType::Replace + && redaction_value(&redaction_map, &placeholder).is_none() + { + redaction_map.push(RedactionEntry { + placeholder: placeholder.clone(), + original: entity_original_text(entity), + }); + } + + cursor = entity.end; + } + + let full_text_len = offsets.len()?; + if cursor < full_text_len { + parts.push(offsets.slice(full_text, cursor, full_text_len)?); + } + + Ok(RedactionResult { + redacted_text: parts.concat(), + redaction_map, + operator_map, + entity_count: non_overlapping.len(), + }) +} + +#[must_use] +pub fn deanonymise( + redacted_text: &str, + redaction_map: &[RedactionEntry], +) -> String { + let mut result = redacted_text.to_owned(); + + for entry in redaction_map { + result = result.replace(&entry.placeholder, &entry.original); + } + + result +} + +fn validate_spans(entities: &[Entity], offsets: &Utf16Offsets) -> Result<()> { + for entity in entities { + if entity.start > entity.end { + return Err(crate::types::Error::InvalidSpan { + start: entity.start, + end: entity.end, 
+ }); + } + + offsets.validate_offset(entity.start)?; + offsets.validate_offset(entity.end)?; + } + + Ok(()) +} + +fn operator_for(config: &OperatorConfig, label: &str) -> OperatorType { + config + .operators + .get(label) + .copied() + .unwrap_or(OperatorType::Replace) +} + +fn set_operator_entry( + operator_map: &mut Vec, + placeholder: &str, + operator: OperatorType, +) { + if let Some(entry) = operator_map + .iter_mut() + .find(|entry| entry.placeholder == placeholder) + { + entry.operator = operator; + return; + } + + operator_map.push(OperatorEntry { + placeholder: placeholder.to_owned(), + operator, + }); +} + +fn redaction_value<'a>( + redaction_map: &'a [RedactionEntry], + placeholder: &str, +) -> Option<&'a str> { + redaction_map + .iter() + .find(|entry| entry.placeholder == placeholder) + .map(|entry| entry.original.as_str()) +} + +fn entity_original_text(entity: &Entity) -> String { + match &entity.kind { + EntityKind::Detected => entity.text.clone(), + EntityKind::Coreference { source_text } => source_text.clone(), + } +} diff --git a/crates/anonymize-core/src/types.rs b/crates/anonymize-core/src/types.rs new file mode 100644 index 00000000..344b87b2 --- /dev/null +++ b/crates/anonymize-core/src/types.rs @@ -0,0 +1,166 @@ +use std::collections::BTreeMap; +use std::{error, fmt}; + +pub type Result = std::result::Result; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Error { + InvalidSpan { start: u32, end: u32 }, + Utf16OffsetOutOfBounds { offset: u32 }, + Utf16OffsetInsideSurrogate { offset: u32 }, +} + +impl fmt::Display for Error { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::InvalidSpan { start, end } => { + write!(formatter, "Invalid entity span: {start}..{end}") + } + Self::Utf16OffsetOutOfBounds { offset } => { + write!(formatter, "UTF-16 offset is out of bounds: {offset}") + } + Self::Utf16OffsetInsideSurrogate { offset } => { + write!( + formatter, + "UTF-16 offset is not a scalar boundary: 
{offset}" + ) + } + } + } +} + +impl error::Error for Error {} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum EntityKind { + Detected, + Coreference { source_text: String }, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Entity { + pub start: u32, + pub end: u32, + pub label: String, + pub text: String, + pub kind: EntityKind, +} + +impl Entity { + #[must_use] + pub fn detected( + start: u32, + end: u32, + label: impl Into, + text: impl Into, + ) -> Self { + Self { + start, + end, + label: label.into(), + text: text.into(), + kind: EntityKind::Detected, + } + } + + #[must_use] + pub fn coreference( + start: u32, + end: u32, + label: impl Into, + text: impl Into, + source_text: impl Into, + ) -> Self { + Self { + start, + end, + label: label.into(), + text: text.into(), + kind: EntityKind::Coreference { + source_text: source_text.into(), + }, + } + } +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub enum OperatorType { + #[default] + Replace, + Redact, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct OperatorConfig { + pub operators: BTreeMap, + pub redact_string: String, +} + +impl Default for OperatorConfig { + fn default() -> Self { + Self { + operators: BTreeMap::new(), + redact_string: String::from("[REDACTED]"), + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct PlaceholderEntry { + pub label: String, + pub text: String, + pub placeholder: String, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct PlaceholderMap { + entries: Vec, +} + +impl PlaceholderMap { + #[must_use] + pub fn entries(&self) -> &[PlaceholderEntry] { + &self.entries + } + + #[must_use] + pub fn get(&self, label: &str, text: &str) -> Option<&str> { + self + .entries + .iter() + .find(|entry| entry.label == label && entry.text == text) + .map(|entry| entry.placeholder.as_str()) + } + + pub(super) fn has(&self, label: &str, text: &str) -> bool { + self.get(label, text).is_some() + } + + pub(super) fn push(&mut self, label: &str, 
text: &str, placeholder: &str) { + self.entries.push(PlaceholderEntry { + label: label.to_owned(), + text: text.to_owned(), + placeholder: placeholder.to_owned(), + }); + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct RedactionEntry { + pub placeholder: String, + pub original: String, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct OperatorEntry { + pub placeholder: String, + pub operator: OperatorType, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct RedactionResult { + pub redacted_text: String, + pub redaction_map: Vec, + pub operator_map: Vec, + pub entity_count: usize, +} diff --git a/crates/anonymize-core/src/utf16.rs b/crates/anonymize-core/src/utf16.rs new file mode 100644 index 00000000..6afb583b --- /dev/null +++ b/crates/anonymize-core/src/utf16.rs @@ -0,0 +1,66 @@ +use crate::types::{Error, Result}; + +pub(crate) struct Utf16Offsets { + offsets: Vec>, +} + +impl Utf16Offsets { + pub(crate) fn new(text: &str) -> Self { + let capacity = text.encode_utf16().count().saturating_add(1); + let mut offsets = Vec::with_capacity(capacity); + offsets.push(Some(0)); + + let mut byte_cursor: usize = 0; + for ch in text.chars() { + byte_cursor = byte_cursor.saturating_add(ch.len_utf8()); + if ch.len_utf16() == 2 { + offsets.push(None); + } + offsets.push(Some(byte_cursor)); + } + + Self { offsets } + } + + pub(crate) fn len(&self) -> Result { + let len = self + .offsets + .len() + .checked_sub(1) + .ok_or(Error::Utf16OffsetOutOfBounds { offset: 0 })?; + u32::try_from(len) + .map_err(|_| Error::Utf16OffsetOutOfBounds { offset: u32::MAX }) + } + + pub(crate) fn validate_offset(&self, offset: u32) -> Result { + let index = usize::try_from(offset) + .map_err(|_| Error::Utf16OffsetOutOfBounds { offset })?; + self + .offsets + .get(index) + .copied() + .ok_or(Error::Utf16OffsetOutOfBounds { offset })? 
+ .ok_or(Error::Utf16OffsetInsideSurrogate { offset }) + } + + pub(crate) fn slice( + &self, + full_text: &str, + start: u32, + end: u32, + ) -> Result { + if start > end { + return Err(Error::InvalidSpan { start, end }); + } + + let start_byte = self.validate_offset(start)?; + let end_byte = self.validate_offset(end)?; + + Ok( + full_text + .get(start_byte..end_byte) + .ok_or(Error::InvalidSpan { start, end })? + .to_owned(), + ) + } +} diff --git a/crates/anonymize-core/tests/redaction.rs b/crates/anonymize-core/tests/redaction.rs new file mode 100644 index 00000000..57562f96 --- /dev/null +++ b/crates/anonymize-core/tests/redaction.rs @@ -0,0 +1,252 @@ +#![allow( + clippy::expect_used, + clippy::indexing_slicing, + clippy::panic, + clippy::unwrap_used +)] + +use stella_anonymize_core::{ + Entity, Error, OperatorConfig, OperatorType, deanonymise, redact_text, +}; + +fn entity(text: &str, label: &str, value: &str) -> Entity { + let start = text + .find(value) + .unwrap_or_else(|| panic!("missing fixture value: {value}")); + Entity::detected( + u32::try_from(start).unwrap_or(u32::MAX), + u32::try_from(start.saturating_add(value.len())).unwrap_or(u32::MAX), + label, + value, + ) +} + +#[test] +fn repeated_values_share_first_non_colliding_placeholder() { + let value = "Alice Smith"; + let text = format!("Existing [PERSON_1]. {value} called. {value} signed."); + let first = text.find(value).unwrap_or(0); + let second = text + .get(first.saturating_add(1)..) 
+ .and_then(|tail| tail.find(value)) + .map_or(first, |relative| { + first.saturating_add(1).saturating_add(relative) + }); + let entities = vec![ + Entity::detected( + u32::try_from(first).unwrap_or(u32::MAX), + u32::try_from(first.saturating_add(value.len())).unwrap_or(u32::MAX), + "person", + value, + ), + Entity::detected( + u32::try_from(second).unwrap_or(u32::MAX), + u32::try_from(second.saturating_add(value.len())).unwrap_or(u32::MAX), + "person", + value, + ), + ]; + + let result = + redact_text(&text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!( + result.redacted_text, + "Existing [PERSON_1]. [PERSON_2] called. [PERSON_2] signed." + ); + assert_eq!(result.redaction_map[0].placeholder, "[PERSON_2]"); + assert_eq!( + deanonymise(&result.redacted_text, &result.redaction_map), + text + ); +} + +#[test] +fn literal_placeholders_inside_extra_brackets_are_reserved() { + let text = "Keep [[PERSON_1]]; Alice Smith signs."; + let entities = vec![entity(text, "person", "Alice Smith")]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redacted_text, "Keep [[PERSON_1]]; [PERSON_2] signs."); + assert_eq!(result.redaction_map[0].placeholder, "[PERSON_2]"); +} + +#[test] +fn normalized_identifier_values_share_placeholder() { + let text = "Mail Alice@Example.com and alice@example.com."; + let entities = vec![ + entity(text, "email address", "Alice@Example.com"), + entity(text, "email address", "alice@example.com"), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 1); + assert_eq!(result.redaction_map[0].placeholder, "[EMAIL_ADDRESS_1]"); +} + +#[test] +fn coreference_alias_uses_source_placeholder_and_value() { + let text = "Acme signed. 
Acme Corporation countersigned."; + let alias_start = text.find("Acme").unwrap_or(0); + let source_start = text.find("Acme Corporation").unwrap_or(0); + let entities = vec![ + Entity::coreference( + u32::try_from(alias_start).unwrap_or(u32::MAX), + u32::try_from(alias_start.saturating_add("Acme".len())) + .unwrap_or(u32::MAX), + "organization", + "Acme", + "Acme Corporation", + ), + Entity::detected( + u32::try_from(source_start).unwrap_or(u32::MAX), + u32::try_from(source_start.saturating_add("Acme Corporation".len())) + .unwrap_or(u32::MAX), + "organization", + "Acme Corporation", + ), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!( + result.redacted_text, + "[ORGANIZATION_1] signed. [ORGANIZATION_1] countersigned." + ); + assert_eq!(result.redaction_map[0].original, "Acme Corporation"); +} + +#[test] +fn redact_operator_is_not_reversible() { + let text = "Contact Alice Smith at alice@example.com."; + let mut config = OperatorConfig::default(); + config + .operators + .insert(String::from("person"), OperatorType::Redact); + config.redact_string = String::from("[GONE]"); + let entities = vec![ + entity(text, "person", "Alice Smith"), + entity(text, "email address", "alice@example.com"), + ]; + + let result = redact_text(text, &entities, &config).unwrap(); + + assert!(result.redacted_text.contains("[GONE]")); + assert!( + result + .redaction_map + .iter() + .all(|entry| entry.placeholder != "[PERSON_1]") + ); + assert!( + result + .redaction_map + .iter() + .any(|entry| entry.placeholder == "[EMAIL_ADDRESS_1]") + ); +} + +#[test] +fn utf16_offsets_apply_non_ascii_spans() { + let text = "A 🦀 Bob"; + let start = 5; + let end = 8; + let entities = vec![Entity::detected(start, end, "person", "Bob")]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redacted_text, "A 🦀 [PERSON_1]"); +} + +#[test] +fn invalid_utf16_boundary_is_rejected() { + let text = "A 
🦀 Bob"; + let entities = vec![Entity::detected(3, 5, "person", " Bob")]; + + let error = redact_text(text, &entities, &OperatorConfig::default()) + .expect_err("offset inside a surrogate pair must fail"); + + assert_eq!(error, Error::Utf16OffsetInsideSurrogate { offset: 3 }); +} + +#[test] +fn overlapping_spans_keep_first_entity() { + let text = "Alice Smith"; + let entities = vec![ + Entity::detected(0, 11, "person", "Alice Smith"), + Entity::detected(6, 11, "person", "Smith"), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redacted_text, "[PERSON_1]"); + assert_eq!(result.entity_count, 1); +} + +#[test] +fn equivalent_crypto_spellings_share_placeholders() { + let text = concat!( + "ETH wallet 0x742d35Cc6634C0532925a3b844Bc454e4438f44e.\n", + "ETH wallet 0x742d35cc6634c0532925a3b844bc454e4438f44e." + ); + let first = "0x742d35Cc6634C0532925a3b844Bc454e4438f44e"; + let second = "0x742d35cc6634c0532925a3b844bc454e4438f44e"; + let entities = vec![ + entity(text, "crypto", first), + entity(text, "crypto", second), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 1); + assert_eq!(result.redaction_map[0].placeholder, "[CRYPTO_1]"); +} + +#[test] +fn equivalent_nhs_cues_share_placeholders() { + let text = concat!( + "NHS number 401 023 2137 was present.\n", + "National Health Service No. 401 023 2137 was repeated." + ); + let first = "NHS number 401 023 2137"; + let second = "National Health Service No. 
401 023 2137"; + let entities = vec![ + entity(text, "national identification number", first), + entity(text, "national identification number", second), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 1); + assert_eq!( + result.redaction_map[0].placeholder, + "[NATIONAL_IDENTIFICATION_NUMBER_1]" + ); +} + +#[test] +fn equivalent_passport_cues_share_placeholders() { + let text = concat!( + "US passport number X12345678 was inspected.\n", + "Passport No. X12345678 was listed." + ); + let entities = vec![ + entity(text, "passport number", "US passport number X12345678"), + entity(text, "passport number", "Passport No. X12345678"), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 1); + assert_eq!(result.redaction_map[0].placeholder, "[PASSPORT_NUMBER_1]"); +} diff --git a/package.json b/package.json index 12a69357..11a11068 100644 --- a/package.json +++ b/package.json @@ -17,6 +17,10 @@ "format": "turbo run format && oxfmt . \"!packages/**\" \"!.ai/**\" \"!.agents/**\" \"!.claude/**\" \"!AGENTS.md\" \"!CLAUDE.md\" \"!GEMINI.md\"", "format:check": "turbo run format -- --check && oxfmt --check . 
\"!packages/**\" \"!.ai/**\" \"!.agents/**\" \"!.claude/**\" \"!AGENTS.md\" \"!CLAUDE.md\" \"!GEMINI.md\"", "test": "turbo run test", + "rust:fmt": "cargo ci-fmt", + "rust:lint": "cargo ci-clippy", + "rust:test": "cargo ci-test", + "rust:check": "bun run rust:fmt && bun run rust:lint && bun run rust:test", "sync:version": "node .github/tools/sync-runtime-version.mjs", "check:version": "node .github/tools/sync-runtime-version.mjs --check", "check:bun": "node .github/tools/check-bun-workflows.mjs", diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 00000000..b1b458b0 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,4 @@ +edition = "2024" +max_width = 80 +reorder_imports = true +tab_spaces = 2 From 5c744fde6fbf72643cb49e1c037fa28841d71ac3 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 08:34:10 +0200 Subject: [PATCH 002/130] fix: satisfy stable clippy --- crates/anonymize-core/src/placeholders.rs | 2 +- crates/anonymize-core/src/redact.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/anonymize-core/src/placeholders.rs b/crates/anonymize-core/src/placeholders.rs index ec839bd2..bebeb20d 100644 --- a/crates/anonymize-core/src/placeholders.rs +++ b/crates/anonymize-core/src/placeholders.rs @@ -20,7 +20,7 @@ pub fn build_placeholder_map( let mut placeholder_map = PlaceholderMap::default(); let mut sorted = entities.to_vec(); - sorted.sort_by(|left, right| left.start.cmp(&right.start)); + sorted.sort_by_key(|entity| entity.start); for entity in &sorted { if placeholder_map.has(&entity.label, &entity.text) { diff --git a/crates/anonymize-core/src/redact.rs b/crates/anonymize-core/src/redact.rs index a28b02d1..b08c61e1 100644 --- a/crates/anonymize-core/src/redact.rs +++ b/crates/anonymize-core/src/redact.rs @@ -25,7 +25,7 @@ pub fn redact_text( let placeholder_map = build_placeholder_map(entities, full_text); let mut sorted = entities.to_vec(); - sorted.sort_by(|left, right| left.start.cmp(&right.start)); + 
sorted.sort_by_key(|entity| entity.start); let mut non_overlapping = Vec::::new(); let mut last_end = 0; From c8d7e470709e30e72ca583b933e17491c6d2c875 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 09:21:05 +0200 Subject: [PATCH 003/130] feat: add core search layer --- Cargo.lock | 142 ++++++++++ crates/anonymize-core/Cargo.toml | 3 + crates/anonymize-core/src/lib.rs | 6 + crates/anonymize-core/src/search.rs | 364 ++++++++++++++++++++++++++ crates/anonymize-core/src/types.rs | 111 +++++++- crates/anonymize-core/tests/search.rs | 107 ++++++++ 6 files changed, 730 insertions(+), 3 deletions(-) create mode 100644 crates/anonymize-core/src/search.rs create mode 100644 crates/anonymize-core/tests/search.rs diff --git a/Cargo.lock b/Cargo.lock index 2552aebd..4f08eb01 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,148 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "fancy-regex" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1e1dacd0d2082dfcf1351c4bdd566bbe89a2b263235a2b50058f1e130a47277" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "memchr" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + +[[package]] +name = "stella-aho-corasick-core" +version = "1.0.4" +source = "git+https://github.com/stella/aho-corasick?rev=ad5dfa06c1be8bffda75e050030fa4e70b93c75f#ad5dfa06c1be8bffda75e050030fa4e70b93c75f" +dependencies = [ + "aho-corasick", + "unicode-case-mapping", +] + [[package]] name = "stella-anonymize-core" version = "1.5.0" +dependencies = [ + "stella-aho-corasick-core", + "stella-fuzzy-search-core", + "stella-regex-set-core", +] + +[[package]] +name = "stella-fuzzy-search-core" +version = "1.1.2" +source = "git+https://github.com/stella/fuzzy-search?rev=4ccb8ced60d8f2ff7f5d1870d2931556e8247632#4ccb8ced60d8f2ff7f5d1870d2931556e8247632" +dependencies = [ + "unicode-case-mapping", + "unicode-normalization", + "unicode-segmentation", +] + +[[package]] +name = "stella-regex-set-core" +version = "1.0.5" +source = "git+https://github.com/stella/regex-set?rev=a50fdc018b40d23ecf732be85b00d495bd6d95cf#a50fdc018b40d23ecf732be85b00d495bd6d95cf" +dependencies = [ + "fancy-regex", + "regex", + "regex-automata", + "regex-syntax", + "unicode-segmentation", +] + +[[package]] +name = "tinyvec" +version = "1.11.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "unicode-case-mapping" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e9026503b74f3207a4c04e6bf4ea735daa8edf6c0bbfa044cae597bb947a9db" + +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index 4d424ef2..a115528b 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -10,6 +10,9 @@ keywords = ["anonymization", "pii", "redaction", "text"] categories = ["text-processing"] [dependencies] +stella-aho-corasick-core = { version = "1.0.4", git = "https://github.com/stella/aho-corasick", rev = "ad5dfa06c1be8bffda75e050030fa4e70b93c75f" } +stella-fuzzy-search-core = { version = "1.1.2", git = "https://github.com/stella/fuzzy-search", rev = "4ccb8ced60d8f2ff7f5d1870d2931556e8247632" } +stella-regex-set-core = { version = "1.0.5", git = "https://github.com/stella/regex-set", rev = "a50fdc018b40d23ecf732be85b00d495bd6d95cf" } [lints] workspace = true diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index 09eb4dcc..121a3e14 100644 --- a/crates/anonymize-core/src/lib.rs +++ 
b/crates/anonymize-core/src/lib.rs @@ -3,12 +3,18 @@ pub(crate) mod normalize; mod placeholders; mod redact; +mod search; mod types; pub(crate) mod utf16; pub use placeholders::build_placeholder_map; pub use redact::{deanonymise, redact_text}; +pub use search::{ + FuzzySearchOptions, LiteralSearchOptions, RegexSearchOptions, SearchIndex, + SearchOptions, SearchPattern, +}; pub use types::{ Entity, EntityKind, Error, OperatorConfig, OperatorEntry, OperatorType, PlaceholderEntry, PlaceholderMap, RedactionEntry, RedactionResult, Result, + SearchEngine, SearchMatch, }; diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs new file mode 100644 index 00000000..b10361f7 --- /dev/null +++ b/crates/anonymize-core/src/search.rs @@ -0,0 +1,364 @@ +use stella_aho_corasick_core as literal_core; +use stella_fuzzy_search_core as fuzzy_core; +use stella_regex_set_core as regex_core; + +use crate::types::{Error, Result, SearchEngine, SearchMatch}; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum SearchPattern { + Literal(String), + Regex(String), + Fuzzy { + pattern: String, + distance: Option, + }, +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct SearchOptions { + pub literal: LiteralSearchOptions, + pub regex: RegexSearchOptions, + pub fuzzy: FuzzySearchOptions, +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct LiteralSearchOptions { + pub case_insensitive: bool, + pub whole_words: bool, +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct RegexSearchOptions { + pub whole_words: bool, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct FuzzySearchOptions { + pub case_insensitive: bool, + pub whole_words: bool, + pub normalize_diacritics: bool, +} + +impl Default for FuzzySearchOptions { + fn default() -> Self { + Self { + case_insensitive: false, + whole_words: true, + normalize_diacritics: false, + } + } +} + +pub struct SearchIndex { + literal: Option, + 
literal_pattern_indexes: Vec, + regex: Option, + regex_pattern_indexes: Vec, + fuzzy: Option, + fuzzy_pattern_indexes: Vec, +} + +impl SearchIndex { + pub fn new( + patterns: Vec, + options: SearchOptions, + ) -> Result { + let mut literal_patterns = Vec::::new(); + let mut literal_pattern_indexes = Vec::::new(); + let mut regex_patterns = Vec::::new(); + let mut regex_pattern_indexes = Vec::::new(); + let mut fuzzy_patterns = Vec::::new(); + let mut fuzzy_pattern_indexes = Vec::::new(); + + for (index, entry) in patterns.into_iter().enumerate() { + let pattern_index = pattern_index(index)?; + match entry { + SearchPattern::Literal(value) => { + literal_patterns.push(value); + literal_pattern_indexes.push(pattern_index); + } + SearchPattern::Regex(value) => { + regex_patterns.push(value); + regex_pattern_indexes.push(pattern_index); + } + SearchPattern::Fuzzy { + pattern: fuzzy_pattern, + distance, + } => { + fuzzy_patterns.push(fuzzy_core::PatternEntry { + pattern: fuzzy_pattern, + distance, + }); + fuzzy_pattern_indexes.push(pattern_index); + } + } + } + + let literal = build_literal(literal_patterns, options)?; + let regex = build_regex(regex_patterns, options)?; + let fuzzy = build_fuzzy(fuzzy_patterns, options)?; + + Ok(Self { + literal, + literal_pattern_indexes, + regex, + regex_pattern_indexes, + fuzzy, + fuzzy_pattern_indexes, + }) + } + + pub fn find_iter(&self, haystack: &str) -> Result> { + let mut matches = Vec::new(); + + if let Some(literal) = &self.literal { + extend_triple_matches( + &mut matches, + SearchEngine::Literal, + &self.literal_pattern_indexes, + &literal + .find_iter_packed(haystack) + .map_err(|err| Error::Search { + engine: SearchEngine::Literal, + reason: err.to_string(), + })?, + |pattern, start, end| SearchMatch::Literal { + pattern, + start, + end, + }, + )?; + } + + if let Some(regex) = &self.regex { + extend_triple_matches( + &mut matches, + SearchEngine::Regex, + &self.regex_pattern_indexes, + ®ex + .find_iter_packed(haystack) + 
.map_err(|err| Error::Search { + engine: SearchEngine::Regex, + reason: err.to_string(), + })?, + |pattern, start, end| SearchMatch::Regex { + pattern, + start, + end, + }, + )?; + } + + if let Some(fuzzy) = &self.fuzzy { + extend_fuzzy_matches( + &mut matches, + &self.fuzzy_pattern_indexes, + &fuzzy + .find_iter_packed(haystack) + .map_err(|err| Error::Search { + engine: SearchEngine::Fuzzy, + reason: err.to_string(), + })?, + )?; + } + + matches.sort_by(|left, right| { + left + .start() + .cmp(&right.start()) + .then_with(|| left.end().cmp(&right.end())) + .then_with(|| left.pattern().cmp(&right.pattern())) + }); + Ok(matches) + } + + pub fn is_match(&self, haystack: &str) -> Result { + if let Some(literal) = &self.literal + && literal.is_match(haystack).map_err(|err| Error::Search { + engine: SearchEngine::Literal, + reason: err.to_string(), + })? + { + return Ok(true); + } + + if let Some(regex) = &self.regex + && regex.is_match(haystack) + { + return Ok(true); + } + + if let Some(fuzzy) = &self.fuzzy + && fuzzy.is_match(haystack).map_err(|err| Error::Search { + engine: SearchEngine::Fuzzy, + reason: err.to_string(), + })? 
+ { + return Ok(true); + } + + Ok(false) + } +} + +fn build_literal( + patterns: Vec, + options: SearchOptions, +) -> Result> { + if patterns.is_empty() { + return Ok(None); + } + + literal_core::AhoCorasick::new( + patterns, + literal_core::Options { + match_kind: literal_core::MatchKind::LeftmostFirst, + case_insensitive: options.literal.case_insensitive, + dfa: false, + whole_words: options.literal.whole_words, + }, + ) + .map(Some) + .map_err(|err| Error::Search { + engine: SearchEngine::Literal, + reason: err.to_string(), + }) +} + +fn build_regex( + patterns: Vec, + options: SearchOptions, +) -> Result> { + if patterns.is_empty() { + return Ok(None); + } + + regex_core::RegexSet::new( + patterns, + regex_core::Options { + whole_words: options.regex.whole_words, + unicode_boundaries: true, + }, + ) + .map(Some) + .map_err(|err| Error::Search { + engine: SearchEngine::Regex, + reason: err.to_string(), + }) +} + +fn build_fuzzy( + patterns: Vec, + options: SearchOptions, +) -> Result> { + if patterns.is_empty() { + return Ok(None); + } + + fuzzy_core::FuzzySearch::new( + patterns, + fuzzy_core::Options { + metric: fuzzy_core::Metric::Levenshtein, + normalize_diacritics: options.fuzzy.normalize_diacritics, + unicode_boundaries: true, + whole_words: options.fuzzy.whole_words, + case_insensitive: options.fuzzy.case_insensitive, + }, + ) + .map(Some) + .map_err(|err| Error::Search { + engine: SearchEngine::Fuzzy, + reason: err.to_string(), + }) +} + +fn extend_triple_matches( + matches: &mut Vec, + engine: SearchEngine, + pattern_indexes: &[u32], + packed: &[u32], + make_match: impl Fn(u32, u32, u32) -> SearchMatch, +) -> Result<()> { + let chunks = packed.chunks_exact(3); + if !chunks.remainder().is_empty() { + return Err(invalid_packed_search_result(engine, packed.len())); + } + + for chunk in chunks { + let [local_pattern, start, end] = chunk else { + return Err(invalid_packed_search_result(engine, packed.len())); + }; + let pattern = pattern_index_from_packed( + 
engine, + pattern_indexes, + *local_pattern, + packed.len(), + )?; + + matches.push(make_match(pattern, *start, *end)); + } + + Ok(()) +} + +fn extend_fuzzy_matches( + matches: &mut Vec, + pattern_indexes: &[u32], + packed: &[u32], +) -> Result<()> { + let chunks = packed.chunks_exact(4); + if !chunks.remainder().is_empty() { + return Err(invalid_packed_search_result( + SearchEngine::Fuzzy, + packed.len(), + )); + } + + for chunk in chunks { + let [local_pattern, start, end, distance] = chunk else { + return Err(invalid_packed_search_result( + SearchEngine::Fuzzy, + packed.len(), + )); + }; + let pattern = pattern_index_from_packed( + SearchEngine::Fuzzy, + pattern_indexes, + *local_pattern, + packed.len(), + )?; + + matches.push(SearchMatch::Fuzzy { + pattern, + start: *start, + end: *end, + distance: *distance, + }); + } + + Ok(()) +} + +fn pattern_index_from_packed( + engine: SearchEngine, + pattern_indexes: &[u32], + local_pattern: u32, + len: usize, +) -> Result { + usize::try_from(local_pattern) + .ok() + .and_then(|index| pattern_indexes.get(index)) + .copied() + .ok_or_else(|| invalid_packed_search_result(engine, len)) +} + +const fn invalid_packed_search_result( + engine: SearchEngine, + len: usize, +) -> Error { + Error::InvalidPackedSearchResult { engine, len } +} + +fn pattern_index(index: usize) -> Result { + u32::try_from(index).map_err(|_| Error::PatternIndexOutOfRange { index }) +} diff --git a/crates/anonymize-core/src/types.rs b/crates/anonymize-core/src/types.rs index 344b87b2..0ba02852 100644 --- a/crates/anonymize-core/src/types.rs +++ b/crates/anonymize-core/src/types.rs @@ -5,9 +5,27 @@ pub type Result = std::result::Result; #[derive(Clone, Debug, Eq, PartialEq)] pub enum Error { - InvalidSpan { start: u32, end: u32 }, - Utf16OffsetOutOfBounds { offset: u32 }, - Utf16OffsetInsideSurrogate { offset: u32 }, + InvalidSpan { + start: u32, + end: u32, + }, + Utf16OffsetOutOfBounds { + offset: u32, + }, + Utf16OffsetInsideSurrogate { + offset: u32, 
+ }, + Search { + engine: SearchEngine, + reason: String, + }, + InvalidPackedSearchResult { + engine: SearchEngine, + len: usize, + }, + PatternIndexOutOfRange { + index: usize, + }, } impl fmt::Display for Error { @@ -25,6 +43,18 @@ impl fmt::Display for Error { "UTF-16 offset is not a scalar boundary: {offset}" ) } + Self::Search { engine, reason } => { + write!(formatter, "{engine} search failed: {reason}") + } + Self::InvalidPackedSearchResult { engine, len } => { + write!( + formatter, + "{engine} search returned malformed packed matches of length {len}" + ) + } + Self::PatternIndexOutOfRange { index } => { + write!(formatter, "Search pattern index exceeds u32 range: {index}") + } } } } @@ -164,3 +194,78 @@ pub struct RedactionResult { pub operator_map: Vec, pub entity_count: usize, } + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum SearchEngine { + Literal, + Regex, + Fuzzy, +} + +impl fmt::Display for SearchEngine { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Literal => formatter.write_str("literal"), + Self::Regex => formatter.write_str("regex"), + Self::Fuzzy => formatter.write_str("fuzzy"), + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum SearchMatch { + Literal { + pattern: u32, + start: u32, + end: u32, + }, + Regex { + pattern: u32, + start: u32, + end: u32, + }, + Fuzzy { + pattern: u32, + start: u32, + end: u32, + distance: u32, + }, +} + +impl SearchMatch { + #[must_use] + pub const fn engine(&self) -> SearchEngine { + match self { + Self::Literal { .. } => SearchEngine::Literal, + Self::Regex { .. } => SearchEngine::Regex, + Self::Fuzzy { .. } => SearchEngine::Fuzzy, + } + } + + #[must_use] + pub const fn pattern(&self) -> u32 { + match self { + Self::Literal { pattern, .. } + | Self::Regex { pattern, .. } + | Self::Fuzzy { pattern, .. } => *pattern, + } + } + + #[must_use] + pub const fn start(&self) -> u32 { + match self { + Self::Literal { start, .. 
} + | Self::Regex { start, .. } + | Self::Fuzzy { start, .. } => *start, + } + } + + #[must_use] + pub const fn end(&self) -> u32 { + match self { + Self::Literal { end, .. } + | Self::Regex { end, .. } + | Self::Fuzzy { end, .. } => *end, + } + } +} diff --git a/crates/anonymize-core/tests/search.rs b/crates/anonymize-core/tests/search.rs new file mode 100644 index 00000000..507d7821 --- /dev/null +++ b/crates/anonymize-core/tests/search.rs @@ -0,0 +1,107 @@ +#![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] + +use stella_anonymize_core::{ + FuzzySearchOptions, LiteralSearchOptions, RegexSearchOptions, SearchIndex, + SearchMatch, SearchOptions, SearchPattern, +}; + +#[test] +fn search_index_routes_literal_regex_and_fuzzy_patterns() { + let index = SearchIndex::new( + vec![ + SearchPattern::Literal(String::from("Alice")), + SearchPattern::Regex(String::from(r"\b[A-Z]{2}\d{4}\b")), + SearchPattern::Fuzzy { + pattern: String::from("Muller"), + distance: Some(1), + }, + ], + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: true, + }, + regex: RegexSearchOptions { whole_words: false }, + fuzzy: FuzzySearchOptions { + case_insensitive: true, + whole_words: true, + normalize_diacritics: false, + }, + }, + ) + .unwrap(); + + let matches = index + .find_iter("Alice signed AB1234. 
Later, Muler countersigned.") + .unwrap(); + + assert_eq!( + matches, + vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 5, + }, + SearchMatch::Regex { + pattern: 1, + start: 13, + end: 19, + }, + SearchMatch::Fuzzy { + pattern: 2, + start: 28, + end: 33, + distance: 1, + }, + ] + ); +} + +#[test] +fn search_index_preserves_utf16_offsets_from_primitive_engines() { + const SUPPLEMENTARY_SCALAR: &str = "\u{1F9EA}"; + + let index = SearchIndex::new( + vec![ + SearchPattern::Literal(String::from("Bob")), + SearchPattern::Regex(String::from(SUPPLEMENTARY_SCALAR)), + ], + SearchOptions::default(), + ) + .unwrap(); + + let haystack = format!("A {SUPPLEMENTARY_SCALAR} Bob"); + let matches = index.find_iter(&haystack).unwrap(); + + assert_eq!( + matches, + vec![ + SearchMatch::Regex { + pattern: 1, + start: 2, + end: 4, + }, + SearchMatch::Literal { + pattern: 0, + start: 5, + end: 8, + }, + ] + ); +} + +#[test] +fn search_index_reports_match_presence_across_engines() { + let index = SearchIndex::new( + vec![ + SearchPattern::Literal(String::from("Alice")), + SearchPattern::Regex(String::from(r"\d{4}")), + ], + SearchOptions::default(), + ) + .unwrap(); + + assert!(index.is_match("Case 2026").unwrap()); + assert!(!index.is_match("No hit").unwrap()); +} From 235f8ab28dafac6b48c79b0a0b75254596a5a7e1 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 09:29:18 +0200 Subject: [PATCH 004/130] chore: allow internal core crate license checks --- .github/workflows/dependency-review.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 60cacc8d..2dc53ebb 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -41,4 +41,8 @@ jobs: Python-2.0, Zlib, Unicode-3.0 + allow-dependencies-licenses: >- + pkg:cargo/stella-aho-corasick-core, + pkg:cargo/stella-fuzzy-search-core, + pkg:cargo/stella-regex-set-core 
comment-summary-in-pr: always From 1d202ab2063b7e44c2c13f30b9ab29292ee29abf Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 09:46:29 +0200 Subject: [PATCH 005/130] fix: harden core redaction contracts --- crates/anonymize-core/src/normalize.rs | 82 +++++++++++++++- crates/anonymize-core/src/placeholders.rs | 47 +++------ crates/anonymize-core/src/redact.rs | 26 ++++- crates/anonymize-core/src/search.rs | 2 +- crates/anonymize-core/src/types.rs | 49 ++++++++-- crates/anonymize-core/tests/redaction.rs | 110 ++++++++++++++++++++++ crates/anonymize-core/tests/search.rs | 36 +++++++ 7 files changed, 304 insertions(+), 48 deletions(-) diff --git a/crates/anonymize-core/src/normalize.rs b/crates/anonymize-core/src/normalize.rs index fec3d403..261ac312 100644 --- a/crates/anonymize-core/src/normalize.rs +++ b/crates/anonymize-core/src/normalize.rs @@ -29,7 +29,7 @@ pub(crate) fn normalize_entity_text(label: &str, text: &str) -> String { return text.chars().filter(char::is_ascii_digit).collect(); } if is_identifier_label(&upper) { - return strip_id_separators(text).to_uppercase(); + return normalize_identifier_text(text); } if upper == "PASSPORT_NUMBER" { return normalize_passport_text(text); @@ -82,6 +82,11 @@ fn strip_id_separators(text: &str) -> String { .collect() } +fn normalize_identifier_text(text: &str) -> String { + find_compact_ascii_identifier(text, true, is_generic_identifier) + .unwrap_or_else(|| strip_id_separators(text).to_uppercase()) +} + fn is_identifier_label(upper: &str) -> bool { matches!( upper, @@ -191,6 +196,76 @@ fn find_ascii_token( predicate(token).then_some(token) } +fn find_compact_ascii_identifier( + text: &str, + allow_whitespace: bool, + predicate: impl Fn(&str) -> bool, +) -> Option { + for (start, ch) in text.char_indices() { + if !is_identifier_start(text, start, ch) { + continue; + } + let Some(candidate) = + compact_ascii_identifier_from(text, start, allow_whitespace, &predicate) + else { + continue; + }; + return 
Some(candidate); + } + + None +} + +fn compact_ascii_identifier_from( + text: &str, + start: usize, + allow_whitespace: bool, + predicate: &impl Fn(&str) -> bool, +) -> Option { + let mut compact = String::new(); + let mut last_valid = None; + let tail = text.get(start..)?; + + for ch in tail.chars() { + if ch.is_ascii_alphanumeric() { + compact.push(ch.to_ascii_uppercase()); + continue; + } + + if is_identifier_separator(ch, allow_whitespace) { + if predicate(&compact) { + last_valid = Some(compact.clone()); + } + continue; + } + + break; + } + + if predicate(&compact) { + return Some(compact); + } + last_valid +} + +fn is_identifier_start(text: &str, index: usize, ch: char) -> bool { + ch.is_ascii_alphanumeric() + && text + .get(..index) + .and_then(|prefix| prefix.chars().next_back()) + .is_none_or(|previous| !previous.is_ascii_alphanumeric()) +} + +fn is_identifier_separator(ch: char, allow_whitespace: bool) -> bool { + ID_SEPARATORS.contains(&ch) || (allow_whitespace && ch.is_whitespace()) +} + +fn is_generic_identifier(candidate: &str) -> bool { + (5..=64).contains(&candidate.len()) + && candidate.chars().any(|ch| ch.is_ascii_digit()) + && candidate.chars().all(|ch| ch.is_ascii_alphanumeric()) +} + const fn is_base58_char(ch: char) -> bool { matches!( ch, @@ -204,9 +279,8 @@ const fn is_base58_char(ch: char) -> bool { } fn normalize_passport_text(text: &str) -> String { - let passport_identifier = - find_ascii_token(text, is_passport_identifier).unwrap_or(text); - strip_id_separators(passport_identifier).to_uppercase() + find_compact_ascii_identifier(text, true, is_passport_identifier) + .unwrap_or_else(|| strip_id_separators(text).to_uppercase()) } fn is_passport_identifier(token: &str) -> bool { diff --git a/crates/anonymize-core/src/placeholders.rs b/crates/anonymize-core/src/placeholders.rs index bebeb20d..8ac103dc 100644 --- a/crates/anonymize-core/src/placeholders.rs +++ b/crates/anonymize-core/src/placeholders.rs @@ -23,58 +23,39 @@ pub fn 
build_placeholder_map( sorted.sort_by_key(|entity| entity.start); for entity in &sorted { - if placeholder_map.has(&entity.label, &entity.text) { + if placeholder_map.has_entity(entity) { continue; } let label_key = label_key(&entity.label); - let source_normalized_key = source_normalized_key(entity, &label_key); - - if let Some(source_key) = source_normalized_key.as_ref() - && let Some(existing) = normalized_to_placeholder.get(source_key) - { - placeholder_map.push(&entity.label, &entity.text, existing); - continue; - } - - let normalized = normalize_entity_text(&entity.label, &entity.text); - let normalized_key = NormalizedKey { - label_key: label_key.clone(), - text: normalized, - }; + let normalized_key = normalized_key(entity, &label_key); if let Some(existing) = normalized_to_placeholder.get(&normalized_key) { - placeholder_map.push(&entity.label, &entity.text, existing); - if let Some(source_key) = source_normalized_key { - normalized_to_placeholder.insert(source_key, existing.clone()); - } + placeholder_map.push_entity(entity, existing); continue; } let placeholder = next_placeholder(&label_key, &mut counters, &reserved_placeholders); - placeholder_map.push(&entity.label, &entity.text, &placeholder); - normalized_to_placeholder.insert(normalized_key, placeholder.clone()); - if let Some(source_key) = source_normalized_key { - normalized_to_placeholder.insert(source_key, placeholder); - } + placeholder_map.push_entity(entity, &placeholder); + normalized_to_placeholder.insert(normalized_key, placeholder); } placeholder_map } -fn source_normalized_key( - entity: &Entity, - label_key: &str, -) -> Option { - let EntityKind::Coreference { source_text } = &entity.kind else { - return None; +fn normalized_key(entity: &Entity, label_key: &str) -> NormalizedKey { + let text = match &entity.kind { + EntityKind::Detected => normalize_entity_text(&entity.label, &entity.text), + EntityKind::Coreference { source_text } => { + normalize_entity_text(&entity.label, 
source_text) + } }; - Some(NormalizedKey { + NormalizedKey { label_key: label_key.to_owned(), - text: normalize_entity_text(&entity.label, source_text), - }) + text, + } } fn next_placeholder( diff --git a/crates/anonymize-core/src/redact.rs b/crates/anonymize-core/src/redact.rs index b08c61e1..0acd8e1d 100644 --- a/crates/anonymize-core/src/redact.rs +++ b/crates/anonymize-core/src/redact.rs @@ -22,9 +22,10 @@ pub fn redact_text( let offsets = Utf16Offsets::new(full_text); validate_spans(entities, &offsets)?; + let entities = entities_with_source_text(full_text, entities, &offsets)?; - let placeholder_map = build_placeholder_map(entities, full_text); - let mut sorted = entities.to_vec(); + let placeholder_map = build_placeholder_map(&entities, full_text); + let mut sorted = entities; sorted.sort_by_key(|entity| entity.start); let mut non_overlapping = Vec::::new(); @@ -47,7 +48,7 @@ pub fn redact_text( } let placeholder = placeholder_map - .get(&entity.label, &entity.text) + .get_entity(entity) .map_or_else(|| placeholder_fallback(&entity.label), ToOwned::to_owned); let operator = operator_for(config, &entity.label); let replacement = match operator { @@ -99,7 +100,7 @@ pub fn deanonymise( fn validate_spans(entities: &[Entity], offsets: &Utf16Offsets) -> Result<()> { for entity in entities { - if entity.start > entity.end { + if entity.start >= entity.end { return Err(crate::types::Error::InvalidSpan { start: entity.start, end: entity.end, @@ -113,6 +114,23 @@ fn validate_spans(entities: &[Entity], offsets: &Utf16Offsets) -> Result<()> { Ok(()) } +fn entities_with_source_text( + full_text: &str, + entities: &[Entity], + offsets: &Utf16Offsets, +) -> Result> { + let mut resolved = Vec::with_capacity(entities.len()); + + for entity in entities { + let mut resolved_entity = entity.clone(); + resolved_entity.text = + offsets.slice(full_text, entity.start, entity.end)?; + resolved.push(resolved_entity); + } + + Ok(resolved) +} + fn operator_for(config: &OperatorConfig, 
label: &str) -> OperatorType { config .operators diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs index b10361f7..7e02d39c 100644 --- a/crates/anonymize-core/src/search.rs +++ b/crates/anonymize-core/src/search.rs @@ -117,7 +117,7 @@ impl SearchIndex { SearchEngine::Literal, &self.literal_pattern_indexes, &literal - .find_iter_packed(haystack) + .find_overlapping_iter_packed(haystack) .map_err(|err| Error::Search { engine: SearchEngine::Literal, reason: err.to_string(), diff --git a/crates/anonymize-core/src/types.rs b/crates/anonymize-core/src/types.rs index 0ba02852..57f76a7b 100644 --- a/crates/anonymize-core/src/types.rs +++ b/crates/anonymize-core/src/types.rs @@ -139,6 +139,7 @@ impl Default for OperatorConfig { pub struct PlaceholderEntry { pub label: String, pub text: String, + pub source_text: Option, pub placeholder: String, } @@ -155,26 +156,62 @@ impl PlaceholderMap { #[must_use] pub fn get(&self, label: &str, text: &str) -> Option<&str> { + self.get_with_source(label, text, None).or_else(|| { + self + .entries + .iter() + .find(|entry| entry.label == label && entry.text == text) + .map(|entry| entry.placeholder.as_str()) + }) + } + + #[must_use] + pub(crate) fn get_entity(&self, entity: &Entity) -> Option<&str> { + self.get_with_source( + &entity.label, + &entity.text, + coreference_source_text(entity), + ) + } + + fn get_with_source( + &self, + label: &str, + text: &str, + source_text: Option<&str>, + ) -> Option<&str> { self .entries .iter() - .find(|entry| entry.label == label && entry.text == text) + .find(|entry| { + entry.label == label + && entry.text == text + && entry.source_text.as_deref() == source_text + }) .map(|entry| entry.placeholder.as_str()) } - pub(super) fn has(&self, label: &str, text: &str) -> bool { - self.get(label, text).is_some() + pub(super) fn has_entity(&self, entity: &Entity) -> bool { + self.get_entity(entity).is_some() } - pub(super) fn push(&mut self, label: &str, text: &str, 
placeholder: &str) { + pub(super) fn push_entity(&mut self, entity: &Entity, placeholder: &str) { self.entries.push(PlaceholderEntry { - label: label.to_owned(), - text: text.to_owned(), + label: entity.label.clone(), + text: entity.text.clone(), + source_text: coreference_source_text(entity).map(ToOwned::to_owned), placeholder: placeholder.to_owned(), }); } } +fn coreference_source_text(entity: &Entity) -> Option<&str> { + let EntityKind::Coreference { source_text } = &entity.kind else { + return None; + }; + Some(source_text) +} + #[derive(Clone, Debug, Eq, PartialEq)] pub struct RedactionEntry { pub placeholder: String, diff --git a/crates/anonymize-core/tests/redaction.rs b/crates/anonymize-core/tests/redaction.rs index 57562f96..212c3613 100644 --- a/crates/anonymize-core/tests/redaction.rs +++ b/crates/anonymize-core/tests/redaction.rs @@ -88,6 +88,43 @@ fn normalized_identifier_values_share_placeholder() { assert_eq!(result.redaction_map[0].placeholder, "[EMAIL_ADDRESS_1]"); } +#[test] +fn contextual_identifier_cues_share_identifier_placeholder() { + let text = "CNI: 12AB34567 was present. CNI nº 12AB34567 was repeated."; + let entities = vec![ + entity(text, "national identification number", "CNI: 12AB34567"), + entity(text, "national identification number", "CNI nº 12AB34567"), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 1); + assert_eq!( + result.redaction_map[0].placeholder, + "[NATIONAL_IDENTIFICATION_NUMBER_1]" + ); +} + +#[test] +fn spaced_identifier_values_still_share_placeholder() { + let text = + "Card 4242 4242 4242 4242 was present. 
Card 4242424242424242 repeated."; + let entities = vec![ + entity(text, "credit card number", "4242 4242 4242 4242"), + entity(text, "credit card number", "4242424242424242"), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 1); + assert_eq!( + result.redaction_map[0].placeholder, + "[CREDIT_CARD_NUMBER_1]" + ); +} + #[test] fn coreference_alias_uses_source_placeholder_and_value() { let text = "Acme signed. Acme Corporation countersigned."; @@ -121,6 +158,36 @@ fn coreference_alias_uses_source_placeholder_and_value() { assert_eq!(result.redaction_map[0].original, "Acme Corporation"); } +#[test] +fn same_alias_text_can_point_to_different_source_placeholders() { + let text = "Smith met Smith."; + let first = text.find("Smith").unwrap_or(0); + let second = text.rfind("Smith").unwrap_or(first); + let entities = vec![ + Entity::coreference( + u32::try_from(first).unwrap_or(u32::MAX), + u32::try_from(first.saturating_add("Smith".len())).unwrap_or(u32::MAX), + "person", + "Smith", + "Alice Smith", + ), + Entity::coreference( + u32::try_from(second).unwrap_or(u32::MAX), + u32::try_from(second.saturating_add("Smith".len())).unwrap_or(u32::MAX), + "person", + "Smith", + "Bob Smith", + ), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redacted_text, "[PERSON_1] met [PERSON_2]."); + assert_eq!(result.redaction_map[0].original, "Alice Smith"); + assert_eq!(result.redaction_map[1].original, "Bob Smith"); +} + #[test] fn redact_operator_is_not_reversible() { let text = "Contact Alice Smith at alice@example.com."; @@ -164,6 +231,21 @@ fn utf16_offsets_apply_non_ascii_spans() { assert_eq!(result.redacted_text, "A 🦀 [PERSON_1]"); } +#[test] +fn detected_original_uses_redacted_source_span() { + let text = "Alice signed."; + let entities = vec![Entity::detected(0, 5, "person", "Bob")]; + + let result = + redact_text(text, &entities, 
&OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map[0].original, "Alice"); + assert_eq!( + deanonymise(&result.redacted_text, &result.redaction_map), + text + ); +} + #[test] fn invalid_utf16_boundary_is_rejected() { let text = "A 🦀 Bob"; @@ -175,6 +257,17 @@ fn invalid_utf16_boundary_is_rejected() { assert_eq!(error, Error::Utf16OffsetInsideSurrogate { offset: 3 }); } +#[test] +fn empty_spans_are_rejected() { + let text = "Alice"; + let entities = vec![Entity::detected(0, 0, "person", "")]; + + let error = redact_text(text, &entities, &OperatorConfig::default()) + .expect_err("empty entity spans must fail"); + + assert_eq!(error, Error::InvalidSpan { start: 0, end: 0 }); +} + #[test] fn overlapping_spans_keep_first_entity() { let text = "Alice Smith"; @@ -250,3 +343,20 @@ fn equivalent_passport_cues_share_placeholders() { assert_eq!(result.redaction_map.len(), 1); assert_eq!(result.redaction_map[0].placeholder, "[PASSPORT_NUMBER_1]"); } + +#[test] +fn passport_prefixes_split_by_separators_stay_distinct() { + let text = + "Passport X-12345678 was inspected. 
Passport Y 12345678 was listed."; + let entities = vec![ + entity(text, "passport number", "X-12345678"), + entity(text, "passport number", "Y 12345678"), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 2); + assert_eq!(result.redaction_map[0].placeholder, "[PASSPORT_NUMBER_1]"); + assert_eq!(result.redaction_map[1].placeholder, "[PASSPORT_NUMBER_2]"); +} diff --git a/crates/anonymize-core/tests/search.rs b/crates/anonymize-core/tests/search.rs index 507d7821..52eb8e50 100644 --- a/crates/anonymize-core/tests/search.rs +++ b/crates/anonymize-core/tests/search.rs @@ -91,6 +91,42 @@ fn search_index_preserves_utf16_offsets_from_primitive_engines() { ); } +#[test] +fn search_index_returns_overlapping_literal_matches() { + let index = SearchIndex::new( + vec![ + SearchPattern::Literal(String::from("Alice")), + SearchPattern::Literal(String::from("Alice Smith")), + ], + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: true, + }, + ..SearchOptions::default() + }, + ) + .unwrap(); + + let matches = index.find_iter("Alice Smith signed.").unwrap(); + + assert_eq!( + matches, + vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 5, + }, + SearchMatch::Literal { + pattern: 1, + start: 0, + end: 11, + }, + ] + ); +} + #[test] fn search_index_reports_match_presence_across_engines() { let index = SearchIndex::new( From 7801b4f41278bfe79e78fcd98e752edb658ea279 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 10:08:12 +0200 Subject: [PATCH 006/130] docs: clarify core contracts --- crates/anonymize-core/src/lib.rs | 2 ++ crates/anonymize-core/src/normalize.rs | 2 ++ crates/anonymize-core/src/placeholders.rs | 2 ++ crates/anonymize-core/src/redact.rs | 4 ++++ crates/anonymize-core/src/search.rs | 2 ++ crates/anonymize-core/src/types.rs | 3 +++ 6 files changed, 15 insertions(+) diff --git a/crates/anonymize-core/src/lib.rs 
b/crates/anonymize-core/src/lib.rs index 121a3e14..ddc5ee15 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -1,5 +1,7 @@ #![allow(clippy::redundant_pub_crate)] +//! Core anonymization contracts shared by host-language bindings. + pub(crate) mod normalize; mod placeholders; mod redact; diff --git a/crates/anonymize-core/src/normalize.rs b/crates/anonymize-core/src/normalize.rs index 261ac312..8968ca77 100644 --- a/crates/anonymize-core/src/normalize.rs +++ b/crates/anonymize-core/src/normalize.rs @@ -1,6 +1,7 @@ const PHONE_NOISE: [char; 3] = ['(', ')', '-']; const ID_SEPARATORS: [char; 3] = ['-', '/', '.']; +// Normalization decides placeholder identity. pub(crate) fn label_key(label: &str) -> String { let uppercase = uppercase(label); collapse_whitespace(&uppercase, "_", false) @@ -83,6 +84,7 @@ fn strip_id_separators(text: &str) -> String { } fn normalize_identifier_text(text: &str) -> String { + // Strip contextual cues before comparing identifiers. find_compact_ascii_identifier(text, true, is_generic_identifier) .unwrap_or_else(|| strip_id_separators(text).to_uppercase()) } diff --git a/crates/anonymize-core/src/placeholders.rs b/crates/anonymize-core/src/placeholders.rs index 8ac103dc..9216e35e 100644 --- a/crates/anonymize-core/src/placeholders.rs +++ b/crates/anonymize-core/src/placeholders.rs @@ -3,6 +3,7 @@ use std::collections::{BTreeMap, BTreeSet}; use crate::normalize::{label_key, normalize_entity_text}; use crate::types::{Entity, EntityKind, PlaceholderMap}; +// Document-local placeholder key. #[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] struct NormalizedKey { label_key: String, @@ -45,6 +46,7 @@ pub fn build_placeholder_map( } fn normalized_key(entity: &Entity, label_key: &str) -> NormalizedKey { + // Coreference aliases key by source identity, not alias text. 
let text = match &entity.kind { EntityKind::Detected => normalize_entity_text(&entity.label, &entity.text), EntityKind::Coreference { source_text } => { diff --git a/crates/anonymize-core/src/redact.rs b/crates/anonymize-core/src/redact.rs index 0acd8e1d..675693ef 100644 --- a/crates/anonymize-core/src/redact.rs +++ b/crates/anonymize-core/src/redact.rs @@ -22,12 +22,15 @@ pub fn redact_text( let offsets = Utf16Offsets::new(full_text); validate_spans(entities, &offsets)?; + + // Reversible originals come from the source span, not caller display text. let entities = entities_with_source_text(full_text, entities, &offsets)?; let placeholder_map = build_placeholder_map(&entities, full_text); let mut sorted = entities; sorted.sort_by_key(|entity| entity.start); + // Existing contract: first accepted span wins overlaps. let mut non_overlapping = Vec::::new(); let mut last_end = 0; for entity in sorted { @@ -100,6 +103,7 @@ pub fn deanonymise( fn validate_spans(entities: &[Entity], offsets: &Utf16Offsets) -> Result<()> { for entity in entities { + // Empty spans would insert without redacting. if entity.start >= entity.end { return Err(crate::types::Error::InvalidSpan { start: entity.start, diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs index 7e02d39c..8b2f2f11 100644 --- a/crates/anonymize-core/src/search.rs +++ b/crates/anonymize-core/src/search.rs @@ -4,6 +4,7 @@ use stella_regex_set_core as regex_core; use crate::types::{Error, Result, SearchEngine, SearchMatch}; +// Preserves caller pattern indexes across primitive engines. #[derive(Clone, Debug, Eq, PartialEq)] pub enum SearchPattern { Literal(String), @@ -112,6 +113,7 @@ impl SearchIndex { let mut matches = Vec::new(); if let Some(literal) = &self.literal { + // Downstream merge priority chooses among overlaps. 
extend_triple_matches( &mut matches, SearchEngine::Literal, diff --git a/crates/anonymize-core/src/types.rs b/crates/anonymize-core/src/types.rs index 57f76a7b..cd194cd2 100644 --- a/crates/anonymize-core/src/types.rs +++ b/crates/anonymize-core/src/types.rs @@ -67,6 +67,7 @@ pub enum EntityKind { Coreference { source_text: String }, } +/// Source span with UTF-16 offsets. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Entity { pub start: u32, @@ -143,6 +144,7 @@ pub struct PlaceholderEntry { pub placeholder: String, } +/// Deterministic placeholder lookup for one document. #[derive(Clone, Debug, Default, Eq, PartialEq)] pub struct PlaceholderMap { entries: Vec, @@ -249,6 +251,7 @@ impl fmt::Display for SearchEngine { } } +/// Search match with the caller's pattern index. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum SearchMatch { Literal { From c2ff1a380de515d4520a40baca36c5176010597e Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 10:13:38 +0200 Subject: [PATCH 007/130] fix: normalize identifier cues --- .../anonymize-core/data/identifier-cues.txt | 22 ++++++++++ crates/anonymize-core/src/normalize.rs | 41 +++++++++++++++++++ crates/anonymize-core/tests/redaction.rs | 25 +++++++---- 3 files changed, 80 insertions(+), 8 deletions(-) create mode 100644 crates/anonymize-core/data/identifier-cues.txt diff --git a/crates/anonymize-core/data/identifier-cues.txt b/crates/anonymize-core/data/identifier-cues.txt new file mode 100644 index 00000000..00b7ad4a --- /dev/null +++ b/crates/anonymize-core/data/identifier-cues.txt @@ -0,0 +1,22 @@ +CIF +CNI +CNPJ +CPF +DIČ +DIC +DNI +IČO +ICO +ID +KRS +NIE +NIF +NIP +OAB +PESEL +RCS +REGON +RG +SIREN +SIRET +TVA diff --git a/crates/anonymize-core/src/normalize.rs b/crates/anonymize-core/src/normalize.rs index 8968ca77..55ea6f6b 100644 --- a/crates/anonymize-core/src/normalize.rs +++ b/crates/anonymize-core/src/normalize.rs @@ -1,5 +1,6 @@ const PHONE_NOISE: [char; 3] = ['(', ')', '-']; const 
ID_SEPARATORS: [char; 3] = ['-', '/', '.']; +const IDENTIFIER_CUES: &str = include_str!("../data/identifier-cues.txt"); // Normalization decides placeholder identity. pub(crate) fn label_key(label: &str) -> String { @@ -85,10 +86,50 @@ fn strip_id_separators(text: &str) -> String { fn normalize_identifier_text(text: &str) -> String { // Strip contextual cues before comparing identifiers. + if let Some(after_cue) = strip_leading_identifier_cue(text) + && let Some(identifier) = + find_compact_ascii_identifier(after_cue, true, is_generic_identifier) + { + return identifier; + } + find_compact_ascii_identifier(text, true, is_generic_identifier) .unwrap_or_else(|| strip_id_separators(text).to_uppercase()) } +fn strip_leading_identifier_cue(text: &str) -> Option<&str> { + let trimmed = text.trim_start(); + let mut cue_end = 0; + + for (index, ch) in trimmed.char_indices() { + if !ch.is_alphabetic() { + break; + } + cue_end = index.saturating_add(ch.len_utf8()); + } + + if cue_end == 0 { + return None; + } + + let cue = trimmed.get(..cue_end)?; + if !is_identifier_cue(cue) { + return None; + } + + let after_cue = trimmed.get(cue_end..)?; + after_cue + .chars() + .next() + .is_some_and(char::is_whitespace) + .then(|| after_cue.trim_start()) +} + +fn is_identifier_cue(cue: &str) -> bool { + let upper = uppercase(cue); + IDENTIFIER_CUES.lines().any(|line| line == upper) +} + fn is_identifier_label(upper: &str) -> bool { matches!( upper, diff --git a/crates/anonymize-core/tests/redaction.rs b/crates/anonymize-core/tests/redaction.rs index 212c3613..ffe8f113 100644 --- a/crates/anonymize-core/tests/redaction.rs +++ b/crates/anonymize-core/tests/redaction.rs @@ -10,15 +10,19 @@ use stella_anonymize_core::{ }; fn entity(text: &str, label: &str, value: &str) -> Entity { - let start = text + let byte_start = text .find(value) .unwrap_or_else(|| panic!("missing fixture value: {value}")); - Entity::detected( - u32::try_from(start).unwrap_or(u32::MAX), - 
u32::try_from(start.saturating_add(value.len())).unwrap_or(u32::MAX), - label, - value, - ) + let prefix = text + .get(..byte_start) + .unwrap_or_else(|| panic!("invalid fixture boundary: {byte_start}")); + let start = utf16_len(prefix); + let end = start.saturating_add(utf16_len(value)); + Entity::detected(start, end, label, value) +} + +fn utf16_len(text: &str) -> u32 { + u32::try_from(text.encode_utf16().count()).unwrap_or(u32::MAX) } #[test] @@ -90,10 +94,15 @@ fn normalized_identifier_values_share_placeholder() { #[test] fn contextual_identifier_cues_share_identifier_placeholder() { - let text = "CNI: 12AB34567 was present. CNI nº 12AB34567 was repeated."; + let text = concat!( + "CNI: 12AB34567 was present. ", + "CNI nº 12AB34567 was repeated. ", + "CNI 12AB34567 was listed." + ); let entities = vec![ entity(text, "national identification number", "CNI: 12AB34567"), entity(text, "national identification number", "CNI nº 12AB34567"), + entity(text, "national identification number", "CNI 12AB34567"), ]; let result = From 0abb52faff02e8db72debe6c967f1d897be787a7 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 10:23:47 +0200 Subject: [PATCH 008/130] feat: add core entity resolution --- .../data/address-final-abbrevs.txt | 5 + .../data/legal-period-suffixes.txt | 7 + crates/anonymize-core/src/lib.rs | 5 + crates/anonymize-core/src/resolution.rs | 690 ++++++++++++++++++ crates/anonymize-core/tests/resolution.rs | 216 ++++++ 5 files changed, 923 insertions(+) create mode 100644 crates/anonymize-core/data/address-final-abbrevs.txt create mode 100644 crates/anonymize-core/data/legal-period-suffixes.txt create mode 100644 crates/anonymize-core/src/resolution.rs create mode 100644 crates/anonymize-core/tests/resolution.rs diff --git a/crates/anonymize-core/data/address-final-abbrevs.txt b/crates/anonymize-core/data/address-final-abbrevs.txt new file mode 100644 index 00000000..83b517a7 --- /dev/null +++ 
b/crates/anonymize-core/data/address-final-abbrevs.txt @@ -0,0 +1,5 @@ +St. +Ave. +Rd. +Blvd. +Sq. diff --git a/crates/anonymize-core/data/legal-period-suffixes.txt b/crates/anonymize-core/data/legal-period-suffixes.txt new file mode 100644 index 00000000..a4055448 --- /dev/null +++ b/crates/anonymize-core/data/legal-period-suffixes.txt @@ -0,0 +1,7 @@ +Inc. +Ltd. +Corp. +N.A. +Kft. +S.A. +a. s. diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index ddc5ee15..bb58e5c7 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -5,12 +5,17 @@ pub(crate) mod normalize; mod placeholders; mod redact; +mod resolution; mod search; mod types; pub(crate) mod utf16; pub use placeholders::build_placeholder_map; pub use redact::{deanonymise, redact_text}; +pub use resolution::{ + DetectionSource, PipelineEntity, SourceDetail, merge_and_dedup, + sanitize_entities, +}; pub use search::{ FuzzySearchOptions, LiteralSearchOptions, RegexSearchOptions, SearchIndex, SearchOptions, SearchPattern, diff --git a/crates/anonymize-core/src/resolution.rs b/crates/anonymize-core/src/resolution.rs new file mode 100644 index 00000000..2806a8f3 --- /dev/null +++ b/crates/anonymize-core/src/resolution.rs @@ -0,0 +1,690 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::types::EntityKind; + +const LEGAL_PERIOD_SUFFIXES: &str = + include_str!("../data/legal-period-suffixes.txt"); +const ADDRESS_FINAL_ABBREVS: &str = + include_str!("../data/address-final-abbrevs.txt"); + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum DetectionSource { + Trigger, + Regex, + DenyList, + LegalForm, + Gazetteer, + Country, + Ner, + Coreference, +} + +impl DetectionSource { + const fn priority(self) -> u8 { + match self { + Self::Gazetteer => 5, + Self::Trigger => 4, + Self::LegalForm | Self::Regex | Self::Country => 3, + Self::DenyList | Self::Coreference => 2, + Self::Ner => 1, + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] 
+pub enum SourceDetail { + CustomDenyList, + CustomRegex, + GazetteerExtension, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct PipelineEntity { + pub start: u32, + pub end: u32, + pub label: String, + pub text: String, + pub score: f64, + pub source: DetectionSource, + pub source_detail: Option, + pub kind: EntityKind, +} + +impl PipelineEntity { + #[must_use] + pub fn detected( + start: u32, + end: u32, + label: impl Into, + text: impl Into, + score: f64, + source: DetectionSource, + ) -> Self { + Self { + start, + end, + label: label.into(), + text: text.into(), + score, + source, + source_detail: None, + kind: EntityKind::Detected, + } + } + + #[must_use] + pub fn coreference( + start: u32, + end: u32, + label: impl Into, + text: impl Into, + score: f64, + source_text: impl Into, + ) -> Self { + Self { + start, + end, + label: label.into(), + text: text.into(), + score, + source: DetectionSource::Coreference, + source_detail: None, + kind: EntityKind::Coreference { + source_text: source_text.into(), + }, + } + } +} + +#[must_use] +pub fn merge_and_dedup(entities: &[PipelineEntity]) -> Vec { + if entities.is_empty() { + return Vec::new(); + } + + let mut sorted = entities.to_vec(); + sorted.sort_by_key(|entity| entity.start); + + let Some(first) = sorted.first() else { + return Vec::new(); + }; + let mut merged = vec![first.clone()]; + + for entity in sorted.into_iter().skip(1) { + let overlaps = overlapping_indexes(&merged, &entity); + if overlaps.is_empty() { + merged.push(entity); + continue; + } + + let has_partial_overlap = overlaps.iter().any(|index| { + merged.get(*index).is_some_and(|existing| { + existing.start != entity.start || existing.end != entity.end + }) + }); + + if !has_partial_overlap { + let same_label_index = overlaps.iter().find_map(|index| { + merged + .get(*index) + .is_some_and(|existing| existing.label == entity.label) + .then_some(*index) + }); + + let Some(index) = same_label_index else { + merged.push(entity); + 
merged.sort_by_key(|entry| entry.start); + continue; + }; + + if let Some(existing) = merged.get(index) + && should_replace(&entity, existing) + { + replace_at(&mut merged, index, entity); + } + continue; + } + + let replaces_all = overlaps.iter().all(|index| { + merged + .get(*index) + .is_some_and(|existing| should_replace(&entity, existing)) + }); + if !replaces_all { + continue; + } + + let Some(insert_at) = overlaps.first().copied() else { + continue; + }; + for index in overlaps.iter().rev() { + remove_at(&mut merged, *index); + } + insert_at_or_push(&mut merged, insert_at, entity); + } + + resolve_same_span_label_conflicts(&sanitize_entities(&merged)) +} + +#[must_use] +pub fn sanitize_entities(entities: &[PipelineEntity]) -> Vec { + let mut sanitized = Vec::new(); + + for entity in entities { + if is_caller_owned(entity) || has_curated_literal_boundary(entity) { + sanitized.push(entity.clone()); + continue; + } + + let Some(cleaned) = clean_entity_text(entity) else { + continue; + }; + sanitized.push(cleaned); + } + + sanitized +} + +fn overlapping_indexes( + entities: &[PipelineEntity], + entity: &PipelineEntity, +) -> Vec { + entities + .iter() + .enumerate() + .filter_map(|(index, existing)| { + (existing.end > entity.start && existing.start < entity.end) + .then_some(index) + }) + .collect() +} + +fn should_replace( + candidate: &PipelineEntity, + existing: &PipelineEntity, +) -> bool { + let candidate_len = entity_len(candidate); + let existing_len = entity_len(existing); + let candidate_caller_owned = is_caller_owned(candidate); + let existing_caller_owned = is_caller_owned(existing); + if candidate_caller_owned != existing_caller_owned { + return candidate_caller_owned; + } + + if literal_contains(candidate, existing) && candidate_len > existing_len { + return true; + } + if literal_contains(existing, candidate) && existing_len > candidate_len { + return false; + } + + if address_contains_bare_postal(candidate, existing) + && candidate_len > 
existing_len + { + return true; + } + if address_contains_bare_postal(existing, candidate) + && existing_len > candidate_len + { + return false; + } + + if legal_form_contains(candidate, existing) && candidate_len > existing_len { + return true; + } + if legal_form_contains(existing, candidate) && existing_len > candidate_len { + return false; + } + + if same_start_longest_wins(candidate, existing) + && candidate_len != existing_len + { + return candidate_len > existing_len; + } + + if country_inside_person_or_org(candidate, existing) + && existing_len > candidate_len + { + return false; + } + if country_inside_person_or_org(existing, candidate) + && candidate_len > existing_len + { + return true; + } + + let candidate_priority = candidate.source.priority(); + let existing_priority = existing.source.priority(); + if candidate_priority != existing_priority { + return candidate_priority > existing_priority; + } + + match candidate.score.total_cmp(&existing.score) { + std::cmp::Ordering::Greater => true, + std::cmp::Ordering::Less => false, + std::cmp::Ordering::Equal => candidate_len > existing_len, + } +} + +fn resolve_same_span_label_conflicts( + entities: &[PipelineEntity], +) -> Vec { + if entities.len() < 2 { + return entities.to_vec(); + } + + let mut by_offsets = BTreeMap::<(u32, u32), Vec>::new(); + for (index, entity) in entities.iter().enumerate() { + by_offsets + .entry((entity.start, entity.end)) + .or_default() + .push(index); + } + + let mut dropped = BTreeSet::::new(); + for group in by_offsets.values() { + if group.len() < 2 { + continue; + } + + let labels = group + .iter() + .filter_map(|index| entities.get(*index)) + .map(|entity| entity.label.as_str()) + .collect::>(); + if labels.len() < 2 { + continue; + } + + let has_person = labels.contains("person"); + let has_precise_non_address = labels + .iter() + .any(|label| *label != "address" && precise_over_address(label)); + let mut yielding_to_person = BTreeSet::::new(); + + if has_person { + for 
index in group { + let Some(entity) = entities.get(*index) else { + continue; + }; + if !is_caller_owned(entity) && person_preferred_over(&entity.label) { + yielding_to_person.insert(*index); + } + } + } + + let mut max_priority = None::; + for index in group { + let Some(entity) = entities.get(*index) else { + continue; + }; + if is_caller_owned(entity) || yielding_to_person.contains(index) { + continue; + } + max_priority = Some(max_priority.map_or_else( + || entity.source.priority(), + |priority| priority.max(entity.source.priority()), + )); + } + + for index in group { + let Some(entity) = entities.get(*index) else { + continue; + }; + if is_caller_owned(entity) { + continue; + } + if yielding_to_person.contains(index) { + dropped.insert(*index); + continue; + } + if max_priority + .is_some_and(|priority| entity.source.priority() < priority) + { + dropped.insert(*index); + continue; + } + if has_precise_non_address && entity.label == "address" { + dropped.insert(*index); + } + } + } + + entities + .iter() + .enumerate() + .filter(|(index, _)| !dropped.contains(index)) + .map(|(_, entity)| entity.clone()) + .collect() +} + +fn clean_entity_text(entity: &PipelineEntity) -> Option { + let mut start_byte = 0; + let mut end_byte = entity.text.len(); + + while let Some((ch, len)) = first_char(entity.text.get(start_byte..end_byte)?) + { + if ch.is_whitespace() || is_leading_trim(ch, &entity.label) { + start_byte = start_byte.saturating_add(len); + continue; + } + break; + } + + while let Some((ch, len)) = last_char(entity.text.get(start_byte..end_byte)?) + { + if ch.is_whitespace() || is_trailing_trim(ch, &entity.label) { + end_byte = end_byte.saturating_sub(len); + continue; + } + break; + } + + if should_strip_period(entity, start_byte, end_byte) { + end_byte = end_byte.saturating_sub('.'.len_utf8()); + } + + while let Some((ch, len)) = last_char(entity.text.get(start_byte..end_byte)?) 
+ { + if ch.is_whitespace() || is_trailing_trim(ch, &entity.label) { + end_byte = end_byte.saturating_sub(len); + continue; + } + break; + } + + if start_byte >= end_byte { + return None; + } + + let cleaned_raw = entity.text.get(start_byte..end_byte)?; + if !cleaned_raw.chars().any(char::is_alphanumeric) { + return None; + } + + let display_text = collapse_display_whitespace(cleaned_raw); + let start = entity.start.saturating_add(utf16_len( + entity.text.get(..start_byte).unwrap_or_default(), + )); + let end = start.saturating_add(utf16_len(cleaned_raw)); + + let mut cleaned = entity.clone(); + cleaned.start = start; + cleaned.end = end; + cleaned.text = display_text; + Some(cleaned) +} + +fn replace_at( + entities: &mut [PipelineEntity], + index: usize, + entity: PipelineEntity, +) { + if let Some(slot) = entities.get_mut(index) { + *slot = entity; + } +} + +fn remove_at(entities: &mut Vec, index: usize) { + if index < entities.len() { + entities.remove(index); + } +} + +fn insert_at_or_push( + entities: &mut Vec, + index: usize, + entity: PipelineEntity, +) { + if index <= entities.len() { + entities.insert(index, entity); + return; + } + entities.push(entity); +} + +const fn entity_len(entity: &PipelineEntity) -> u32 { + entity.end.saturating_sub(entity.start) +} + +const fn is_caller_owned(entity: &PipelineEntity) -> bool { + matches!( + entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) +} + +fn literal_contains(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { + outer.label == inner.label + && matches!( + outer.source, + DetectionSource::DenyList | DetectionSource::Gazetteer + ) + && outer.start <= inner.start + && outer.end >= inner.end +} + +fn address_contains_bare_postal( + outer: &PipelineEntity, + inner: &PipelineEntity, +) -> bool { + outer.label == "address" + && inner.label == "address" + && outer.start <= inner.start + && outer.end >= inner.end + && is_bare_postal_code(&inner.text) +} + +fn 
legal_form_contains(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { + outer.label == inner.label + && outer.source == DetectionSource::LegalForm + && outer.start <= inner.start + && outer.end >= inner.end +} + +fn same_start_longest_wins( + candidate: &PipelineEntity, + existing: &PipelineEntity, +) -> bool { + candidate.label == existing.label + && candidate.start == existing.start + && longest_wins_label(&candidate.label) +} + +fn country_inside_person_or_org( + country: &PipelineEntity, + container: &PipelineEntity, +) -> bool { + country.label == "country" + && matches!(container.label.as_str(), "person" | "organization") + && container.start <= country.start + && container.end >= country.end +} + +fn has_curated_literal_boundary(entity: &PipelineEntity) -> bool { + matches!( + entity.source, + DetectionSource::DenyList | DetectionSource::Gazetteer + ) && entity.label != "person" + && entity.source_detail != Some(SourceDetail::GazetteerExtension) + && entity + .text + .chars() + .next() + .into_iter() + .chain(entity.text.chars().next_back()) + .any(is_literal_boundary_punct) +} + +fn is_leading_trim(ch: char, label: &str) -> bool { + if label_allows_colon(label) { + matches!( + ch, + ',' | ';' | '"' | '\'' | '“' | '”' | '‘' | '’' | '«' | '¿' | '¡' + ) + } else { + matches!( + ch, + ',' | ';' | ':' | '"' | '\'' | '“' | '”' | '‘' | '’' | '«' | '¿' | '¡' + ) + } +} + +fn is_trailing_trim(ch: char, label: &str) -> bool { + if label_allows_colon(label) { + matches!( + ch, + ',' | ';' | '"' | '\'' | '“' | '”' | '‘' | '’' | '»' | '!' | '?' + ) + } else { + matches!( + ch, + ',' | ';' | ':' | '"' | '\'' | '“' | '”' | '‘' | '’' | '»' | '!' | '?' + ) + } +} + +const fn is_literal_boundary_punct(ch: char) -> bool { + matches!( + ch, + '"' + | '\'' + | '“' + | '”' + | '„' + | '‟' + | '‘' + | '’' + | '‛' + | '«' + | '»' + | '!' + | '.' 
+ ) +} + +fn should_strip_period( + entity: &PipelineEntity, + start_byte: usize, + end_byte: usize, +) -> bool { + if !matches!( + entity.label.as_str(), + "organization" | "location" | "address" + ) { + return false; + } + let Some(text) = entity.text.get(start_byte..end_byte) else { + return false; + }; + if !text.ends_with('.') || known_period_suffix(text) { + return false; + } + if entity.label == "address" && known_address_final_abbrev(text) { + return false; + } + !(entity.label == "location" && known_location_final_abbrev(text)) +} + +fn known_period_suffix(text: &str) -> bool { + LEGAL_PERIOD_SUFFIXES + .lines() + .any(|suffix| text.ends_with(suffix)) +} + +fn known_address_final_abbrev(text: &str) -> bool { + ADDRESS_FINAL_ABBREVS.lines().any(|suffix| { + text + .strip_suffix(suffix) + .is_some_and(|prefix| prefix.ends_with(char::is_whitespace)) + }) +} + +fn known_location_final_abbrev(text: &str) -> bool { + text.ends_with("D.C.") + || text + .split_whitespace() + .next_back() + .is_some_and(|token| token.chars().filter(|ch| *ch == '.').count() >= 2) +} + +fn label_allows_colon(label: &str) -> bool { + matches!(label, "ip address" | "mac address") +} + +fn longest_wins_label(label: &str) -> bool { + matches!( + label, + "date" + | "date of birth" + | "monetary amount" + | "phone number" + | "email address" + | "url" + ) +} + +fn precise_over_address(label: &str) -> bool { + matches!( + label, + "person" + | "date" + | "date of birth" + | "phone number" + | "email address" + | "monetary amount" + | "iban" + | "bank account number" + | "tax identification number" + | "registration number" + | "identity card number" + | "national identification number" + | "passport number" + | "credit card number" + ) +} + +fn person_preferred_over(label: &str) -> bool { + matches!(label, "address" | "country" | "land parcel") +} + +fn is_bare_postal_code(text: &str) -> bool { + let compact = text + .chars() + .filter(|ch| !ch.is_whitespace() && *ch != '-' && *ch != '–') 
+ .collect::(); + let len = compact.len(); + matches!(len, 5 | 8 | 9) && compact.chars().all(|ch| ch.is_ascii_digit()) +} + +fn collapse_display_whitespace(text: &str) -> String { + let mut output = String::new(); + let mut in_whitespace = false; + + for ch in text.chars() { + if ch.is_whitespace() { + if !in_whitespace { + output.push(' '); + in_whitespace = true; + } + continue; + } + + output.push(ch); + in_whitespace = false; + } + + output +} + +fn first_char(text: &str) -> Option<(char, usize)> { + text.chars().next().map(|ch| (ch, ch.len_utf8())) +} + +fn last_char(text: &str) -> Option<(char, usize)> { + text.chars().next_back().map(|ch| (ch, ch.len_utf8())) +} + +fn utf16_len(text: &str) -> u32 { + u32::try_from(text.encode_utf16().count()).unwrap_or(u32::MAX) +} diff --git a/crates/anonymize-core/tests/resolution.rs b/crates/anonymize-core/tests/resolution.rs new file mode 100644 index 00000000..6d2a66e6 --- /dev/null +++ b/crates/anonymize-core/tests/resolution.rs @@ -0,0 +1,216 @@ +#![allow(clippy::expect_used, clippy::float_cmp, clippy::unwrap_used)] + +use stella_anonymize_core::{ + DetectionSource, PipelineEntity, SourceDetail, merge_and_dedup, + sanitize_entities, +}; + +fn entity( + source: DetectionSource, + score: f64, + start: u32, + end: u32, + label: &str, +) -> PipelineEntity { + PipelineEntity::detected( + start, + end, + label, + "x".repeat(usize::try_from(end.saturating_sub(start)).unwrap_or(0)), + score, + source, + ) +} + +fn text_entity( + text: &str, + label: &str, + source: DetectionSource, +) -> PipelineEntity { + PipelineEntity::detected(0, utf16_len(text), label, text, 0.9, source) +} + +fn utf16_len(text: &str) -> u32 { + u32::try_from(text.encode_utf16().count()).unwrap_or(u32::MAX) +} + +#[test] +fn non_overlapping_entities_pass_through_sorted() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Regex, 0.9, 20, 25, "person"), + entity(DetectionSource::Regex, 0.7, 0, 5, "person"), + entity(DetectionSource::Regex, 0.8, 
10, 15, "person"), + ]); + + assert_eq!(result.len(), 3); + assert_eq!( + result.iter().map(|entry| entry.start).collect::>(), + vec![0, 10, 20] + ); +} + +#[test] +fn source_priority_beats_score_for_same_span() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Ner, 0.99, 0, 10, "person"), + entity(DetectionSource::Trigger, 0.7, 0, 10, "person"), + ]); + + assert_eq!(result.len(), 1); + assert_eq!( + result.first().expect("result").source, + DetectionSource::Trigger + ); +} + +#[test] +fn gazetteer_has_highest_source_priority() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Ner, 0.99, 5, 15, "person"), + entity(DetectionSource::Trigger, 0.99, 5, 15, "person"), + entity(DetectionSource::Gazetteer, 0.8, 5, 15, "person"), + ]); + + assert_eq!(result.len(), 1); + assert_eq!( + result.first().expect("result").source, + DetectionSource::Gazetteer + ); +} + +#[test] +fn same_priority_uses_score_then_length() { + let higher_score = merge_and_dedup(&[ + entity(DetectionSource::Regex, 0.85, 0, 8, "person"), + entity(DetectionSource::Regex, 0.92, 0, 8, "person"), + ]); + assert_eq!(higher_score.len(), 1); + assert_eq!(higher_score.first().expect("result").score, 0.92); + + let longer = merge_and_dedup(&[ + entity(DetectionSource::Ner, 0.9, 0, 5, "person"), + entity(DetectionSource::Ner, 0.9, 0, 10, "person"), + ]); + assert_eq!(longer.len(), 1); + assert_eq!(longer.first().expect("result").end, 10); +} + +#[test] +fn identical_spans_with_different_labels_are_kept() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Regex, 0.9, 0, 5, "person"), + entity(DetectionSource::Regex, 0.9, 0, 5, "project"), + ]); + + assert_eq!(result.len(), 2); + assert_eq!( + result + .iter() + .map(|entry| entry.label.as_str()) + .collect::>(), + vec!["person", "project"] + ); +} + +#[test] +fn literal_container_beats_shorter_same_label_match() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Regex, 1.0, 0, 6, "postal code"), + 
entity(DetectionSource::DenyList, 1.0, 0, 11, "postal code"), + ]); + + assert_eq!(result.len(), 1); + let kept = result.first().expect("result"); + assert_eq!(kept.source, DetectionSource::DenyList); + assert_eq!(kept.end, 11); +} + +#[test] +fn caller_owned_boundaries_win_overlap_resolution() { + let mut custom = entity(DetectionSource::Regex, 0.5, 0, 8, "person"); + custom.source_detail = Some(SourceDetail::CustomRegex); + let result = merge_and_dedup(&[ + entity(DetectionSource::Trigger, 0.99, 0, 10, "person"), + custom, + ]); + + assert_eq!(result.len(), 1); + assert_eq!( + result.first().expect("result").source_detail, + Some(SourceDetail::CustomRegex) + ); +} + +#[test] +fn same_span_country_loses_to_person() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Country, 0.95, 0, 5, "country"), + entity(DetectionSource::DenyList, 0.9, 0, 5, "person"), + ]); + + assert_eq!(result.len(), 1); + assert_eq!(result.first().expect("result").label, "person"); +} + +#[test] +fn sanitize_trims_punctuation_and_updates_utf16_offsets() { + let mut input = + text_entity("\"Tesla Shares\"", "organization", DetectionSource::Ner); + input.start = 10; + input.end = 10_u32.saturating_add(utf16_len(&input.text)); + + let result = sanitize_entities(&[input]); + assert_eq!(result.len(), 1); + let entity = result.first().expect("result"); + assert_eq!(entity.text, "Tesla Shares"); + assert_eq!(entity.start, 11); + assert_eq!(entity.end, 23); +} + +#[test] +fn sanitize_preserves_literal_dictionary_punctuation() { + let result = sanitize_entities(&[ + text_entity("Hello bank!", "organization", DetectionSource::DenyList), + text_entity( + "\"Juez y parte\"", + "organization", + DetectionSource::DenyList, + ), + ]); + + assert_eq!( + result + .iter() + .map(|entry| entry.text.as_str()) + .collect::>(), + vec!["Hello bank!", "\"Juez y parte\""] + ); +} + +#[test] +fn sanitize_keeps_known_period_suffixes_from_data() { + let result = sanitize_entities(&[ + text_entity("Acme 
Inc.", "organization", DetectionSource::Ner), + text_entity("123 Main St.", "address", DetectionSource::Ner), + text_entity("Washington, D.C.", "location", DetectionSource::Ner), + ]); + + assert_eq!( + result + .iter() + .map(|entry| entry.text.as_str()) + .collect::>(), + vec!["Acme Inc.", "123 Main St.", "Washington, D.C."] + ); +} + +#[test] +fn sanitize_drops_empty_entities() { + let result = sanitize_entities(&[text_entity( + "\";!", + "organization", + DetectionSource::Ner, + )]); + + assert!(result.is_empty()); +} From 1508c8430940a845ab49375e6a11878d48e99878 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 10:30:02 +0200 Subject: [PATCH 009/130] feat: add core boundary resolution --- crates/anonymize-core/src/lib.rs | 4 +- crates/anonymize-core/src/resolution.rs | 405 +++++++++++++++++++++- crates/anonymize-core/tests/resolution.rs | 159 ++++++++- 3 files changed, 563 insertions(+), 5 deletions(-) diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index bb58e5c7..8d3989c3 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -13,8 +13,8 @@ pub(crate) mod utf16; pub use placeholders::build_placeholder_map; pub use redact::{deanonymise, redact_text}; pub use resolution::{ - DetectionSource, PipelineEntity, SourceDetail, merge_and_dedup, - sanitize_entities, + DetectionSource, PipelineEntity, SourceDetail, enforce_boundary_consistency, + merge_and_dedup, sanitize_entities, }; pub use search::{ FuzzySearchOptions, LiteralSearchOptions, RegexSearchOptions, SearchIndex, diff --git a/crates/anonymize-core/src/resolution.rs b/crates/anonymize-core/src/resolution.rs index 2806a8f3..f92d7b1c 100644 --- a/crates/anonymize-core/src/resolution.rs +++ b/crates/anonymize-core/src/resolution.rs @@ -1,6 +1,7 @@ use std::collections::{BTreeMap, BTreeSet}; -use crate::types::EntityKind; +use crate::types::{EntityKind, Result}; +use crate::utf16::Utf16Offsets; const LEGAL_PERIOD_SUFFIXES: &str = 
include_str!("../data/legal-period-suffixes.txt"); @@ -185,6 +186,274 @@ pub fn sanitize_entities(entities: &[PipelineEntity]) -> Vec { sanitized } +pub fn enforce_boundary_consistency( + entities: &[PipelineEntity], + full_text: &str, +) -> Result> { + let offsets = Utf16Offsets::new(full_text); + let spans = char_spans(full_text); + let boundaries = word_boundaries(&spans); + let fixed = + fix_partial_words(entities, full_text, &offsets, &spans, &boundaries)?; + let resolved = resolve_cross_label_overlaps(&fixed, full_text, &offsets)?; + let deduped = deduplicate_spans(&resolved); + let merged = merge_adjacent(&deduped, full_text, &offsets)?; + Ok(remove_nested_same_label(&merged)) +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CharSpan { + start: u32, + end: u32, + ch: char, +} + +fn fix_partial_words( + entities: &[PipelineEntity], + full_text: &str, + offsets: &Utf16Offsets, + spans: &[CharSpan], + boundaries: &BTreeSet, +) -> Result> { + let mut sorted = entities.to_vec(); + sorted.sort_by_key(|entity| entity.start); + let mut fixed = Vec::with_capacity(sorted.len()); + + for (index, entity) in sorted.iter().enumerate() { + if has_locked_boundary(entity) || has_detector_locked_boundary(entity) { + fixed.push(entity.clone()); + continue; + } + + if entity.text != offsets.slice(full_text, entity.start, entity.end)? 
{ + fixed.push(entity.clone()); + continue; + } + + let mut new_start = word_start_at(entity.start, boundaries, spans); + let mut new_end = word_end_at(entity.end, boundaries, spans); + + for (other_index, other) in sorted.iter().enumerate() { + if other_index == index || other.label == entity.label { + continue; + } + if other.end > new_start && other.end <= entity.start { + new_start = new_start.max(other.end); + } + if other.start >= entity.end && other.start < new_end { + new_end = new_end.min(other.start); + } + } + + if new_start == entity.start && new_end == entity.end { + fixed.push(entity.clone()); + continue; + } + + let mut adjusted = entity.clone(); + adjusted.start = new_start; + adjusted.end = new_end; + adjusted.text = offsets.slice(full_text, new_start, new_end)?; + fixed.push(adjusted); + } + + Ok(fixed) +} + +fn resolve_cross_label_overlaps( + entities: &[PipelineEntity], + full_text: &str, + offsets: &Utf16Offsets, +) -> Result> { + let mut sorted = entities.to_vec(); + sorted.sort_by_key(|entity| entity.start); + + let mut left_index = 0; + while left_index < sorted.len() { + let mut right_index = left_index.saturating_add(1); + while right_index < sorted.len() { + let Some(left) = sorted.get(left_index) else { + break; + }; + let Some(right) = sorted.get(right_index) else { + break; + }; + if right.start >= left.end { + break; + } + if left.label == right.label + || contains_span(left, right) + || contains_span(right, left) + { + right_index = right_index.saturating_add(1); + continue; + } + + let left_len = entity_len(left); + let right_len = entity_len(right); + let left_locked = has_locked_boundary(left); + let right_locked = has_locked_boundary(right); + let left_wins = if left_locked == right_locked { + match left.score.total_cmp(&right.score) { + std::cmp::Ordering::Greater => true, + std::cmp::Ordering::Less => false, + std::cmp::Ordering::Equal => left_len >= right_len, + } + } else { + left_locked + }; + + if left_wins { + let 
new_start = left.end; + if let Some(right_mut) = sorted.get_mut(right_index) { + right_mut.start = new_start; + right_mut.text = + offsets.slice(full_text, new_start, right_mut.end)?; + } + right_index = right_index.saturating_add(1); + continue; + } + + let new_end = right.start; + if let Some(left_mut) = sorted.get_mut(left_index) { + left_mut.end = new_end; + left_mut.text = offsets.slice(full_text, left_mut.start, new_end)?; + } + break; + } + + left_index = left_index.saturating_add(1); + } + + Ok( + sorted + .into_iter() + .filter(|entity| entity.start < entity.end) + .collect(), + ) +} + +fn deduplicate_spans(entities: &[PipelineEntity]) -> Vec { + let mut seen = BTreeMap::<(u32, u32, String), PipelineEntity>::new(); + + for entity in entities { + let key = (entity.start, entity.end, entity.label.clone()); + let replace = seen + .get(&key) + .is_none_or(|existing| entity.score.total_cmp(&existing.score).is_gt()); + if replace { + seen.insert(key, entity.clone()); + } + } + + seen.into_values().collect() +} + +fn merge_adjacent( + entities: &[PipelineEntity], + full_text: &str, + offsets: &Utf16Offsets, +) -> Result> { + let mut sorted = entities.to_vec(); + sorted.sort_by_key(|entity| entity.start); + let mut result = Vec::::new(); + let mut last_by_label = BTreeMap::::new(); + + for entity in &sorted { + if has_locked_boundary(entity) { + result.push(entity.clone()); + continue; + } + + let Some(previous_index) = last_by_label.get(&entity.label).copied() else { + let index = result.len(); + result.push(entity.clone()); + last_by_label.insert(entity.label.clone(), index); + continue; + }; + + let Some(previous) = result.get(previous_index) else { + let index = result.len(); + result.push(entity.clone()); + last_by_label.insert(entity.label.clone(), index); + continue; + }; + + if !has_locked_boundary(previous) && entity.start < previous.end { + merge_into_previous( + &mut result, + previous_index, + entity, + full_text, + offsets, + )?; + continue; + } + + 
let gap = offsets.slice(full_text, previous.end, entity.start)?; + let gap_start = previous.end; + let gap_end = entity.start; + let gap_occupied = sorted.iter().any(|other| { + other.label != entity.label + && other.start < gap_end + && other.end > gap_start + }); + let legal_form_comma = (is_legal_form_organization(previous) + || is_legal_form_organization(entity)) + && gap.contains(','); + + if !has_locked_boundary(previous) + && !legal_form_comma + && entity.label != "country" + && !gap_occupied + && is_mergeable_gap(&gap) + { + merge_into_previous( + &mut result, + previous_index, + entity, + full_text, + offsets, + )?; + continue; + } + + let index = result.len(); + result.push(entity.clone()); + last_by_label.insert(entity.label.clone(), index); + } + + Ok(result) +} + +fn remove_nested_same_label( + entities: &[PipelineEntity], +) -> Vec { + let mut sorted = entities.to_vec(); + sorted.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| entity_len(right).cmp(&entity_len(left))) + }); + + let mut result = Vec::new(); + let mut max_end_by_label = BTreeMap::::new(); + + for entity in sorted { + if max_end_by_label + .get(&entity.label) + .is_some_and(|max_end| entity.end <= *max_end) + { + continue; + } + max_end_by_label.insert(entity.label.clone(), entity.end); + result.push(entity); + } + + result +} + fn overlapping_indexes( entities: &[PipelineEntity], entity: &PipelineEntity, @@ -443,6 +712,140 @@ fn insert_at_or_push( entities.push(entity); } +fn char_spans(text: &str) -> Vec { + let mut spans = Vec::new(); + let mut offset = 0_u32; + + for ch in text.chars() { + let width = u32::try_from(ch.len_utf16()).unwrap_or(u32::MAX); + let end = offset.saturating_add(width); + spans.push(CharSpan { + start: offset, + end, + ch, + }); + offset = end; + } + + spans +} + +fn word_boundaries(spans: &[CharSpan]) -> BTreeSet { + let mut boundaries = BTreeSet::new(); + let mut run_start = None::; + let mut run_end = None::; + + for span in 
spans { + if span.ch.is_alphanumeric() { + if run_start.is_none() { + run_start = Some(span.start); + } + run_end = Some(span.end); + continue; + } + + if let (Some(start), Some(end)) = (run_start.take(), run_end.take()) { + boundaries.insert(start); + boundaries.insert(end); + } + } + + if let (Some(start), Some(end)) = (run_start, run_end) { + boundaries.insert(start); + boundaries.insert(end); + } + + boundaries +} + +fn word_start_at( + position: u32, + boundaries: &BTreeSet, + spans: &[CharSpan], +) -> u32 { + let mut cursor = position; + while cursor > 0 && !boundaries.contains(&cursor) { + let Some(previous) = spans.iter().rev().find(|span| span.end <= cursor) + else { + return cursor; + }; + if is_word_start_stop(previous.ch) { + return cursor; + } + cursor = previous.start; + } + cursor +} + +fn word_end_at( + position: u32, + boundaries: &BTreeSet, + spans: &[CharSpan], +) -> u32 { + let mut cursor = position; + let text_end = spans.last().map_or(0, |span| span.end); + while cursor < text_end && !boundaries.contains(&cursor) { + let Some(next) = spans.iter().find(|span| span.start >= cursor) else { + return cursor; + }; + if is_word_end_stop(next.ch) { + return cursor; + } + cursor = next.end; + } + cursor +} + +fn merge_into_previous( + entities: &mut [PipelineEntity], + previous_index: usize, + entity: &PipelineEntity, + full_text: &str, + offsets: &Utf16Offsets, +) -> Result<()> { + if let Some(previous) = entities.get_mut(previous_index) { + previous.end = previous.end.max(entity.end); + previous.text = offsets.slice(full_text, previous.start, previous.end)?; + if entity.score.total_cmp(&previous.score).is_gt() { + previous.score = entity.score; + } + } + Ok(()) +} + +const fn contains_span(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { + outer.start <= inner.start && outer.end >= inner.end +} + +const fn has_locked_boundary(entity: &PipelineEntity) -> bool { + is_caller_owned(entity) +} + +fn has_detector_locked_boundary(entity: 
&PipelineEntity) -> bool { + entity.label == "phone number" && entity.source == DetectionSource::Trigger +} + +fn is_legal_form_organization(entity: &PipelineEntity) -> bool { + entity.label == "organization" && entity.source == DetectionSource::LegalForm +} + +fn is_mergeable_gap(gap: &str) -> bool { + gap.is_empty() + || (utf16_len(gap) <= 3 + && gap.chars().all(|ch| matches!(ch, ' ' | '\t' | ',' | '-'))) +} + +const fn is_word_start_stop(ch: char) -> bool { + matches!(ch, '\n' | '\r' | ',' | ';' | '(' | ')' | '[' | ']' | '&') +} + +const fn is_word_end_stop(ch: char) -> bool { + matches!( + ch, + '\n' | '\r' | ',' | ';' | '.' | '(' | ')' | '[' | ']' | '&' + ) +} + const fn entity_len(entity: &PipelineEntity) -> u32 { entity.end.saturating_sub(entity.start) } diff --git a/crates/anonymize-core/tests/resolution.rs b/crates/anonymize-core/tests/resolution.rs index 6d2a66e6..90681939 100644 --- a/crates/anonymize-core/tests/resolution.rs +++ b/crates/anonymize-core/tests/resolution.rs @@ -1,8 +1,8 @@ #![allow(clippy::expect_used, clippy::float_cmp, clippy::unwrap_used)] use stella_anonymize_core::{ - DetectionSource, PipelineEntity, SourceDetail, merge_and_dedup, - sanitize_entities, + DetectionSource, PipelineEntity, SourceDetail, enforce_boundary_consistency, + merge_and_dedup, sanitize_entities, }; fn entity( @@ -214,3 +214,158 @@ fn sanitize_drops_empty_entities() { assert!(result.is_empty()); } + +#[test] +fn boundary_merges_adjacent_same_label_entities() { + let full_text = "Kontaktujte Jan Novák prosím."; + let result = enforce_boundary_consistency( + &[ + entity(DetectionSource::Ner, 0.8, 12, 15, "person"), + entity(DetectionSource::Ner, 0.95, 16, 21, "person"), + ], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 1); + let person = result.first().expect("person"); + assert_eq!(person.text, "Jan Novák"); + assert_eq!(person.start, 12); + assert_eq!(person.end, 21); + assert_eq!(person.score, 0.95); +} + +#[test] +fn 
boundary_expands_partial_words() { + let full_text = "Kontaktujte Novák prosím."; + let result = enforce_boundary_consistency( + &[PipelineEntity::detected( + 12, + 16, + "person", + "Nová", + 0.9, + DetectionSource::Ner, + )], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 1); + let person = result.first().expect("person"); + assert_eq!(person.text, "Novák"); + assert_eq!(person.end, 17); +} + +#[test] +fn boundary_clamps_expansion_at_cross_label_neighbors() { + let full_text = "JanPraha"; + let result = enforce_boundary_consistency( + &[ + entity(DetectionSource::Ner, 0.9, 0, 3, "person"), + entity(DetectionSource::Ner, 0.8, 3, 8, "address"), + ], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 2); + let person = result + .iter() + .find(|entry| entry.label == "person") + .expect("person"); + let address = result + .iter() + .find(|entry| entry.label == "address") + .expect("address"); + assert!(person.end <= address.start); +} + +#[test] +fn boundary_resolves_cross_label_partial_overlaps() { + let full_text = "JanXPraha"; + let result = enforce_boundary_consistency( + &[ + entity(DetectionSource::Ner, 0.9, 0, 3, "person"), + entity(DetectionSource::Ner, 0.8, 4, 9, "address"), + ], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 2); + let person = result + .iter() + .find(|entry| entry.label == "person") + .expect("person"); + let address = result + .iter() + .find(|entry| entry.label == "address") + .expect("address"); + assert!(person.end <= address.start); +} + +#[test] +fn boundary_removes_nested_same_label_entities() { + let full_text = "Ing. Pavel Novák"; + let result = enforce_boundary_consistency( + &[ + PipelineEntity::detected( + 0, + 16, + "person", + "Ing. 
Pavel Novák", + 0.9, + DetectionSource::Ner, + ), + PipelineEntity::detected( + 5, + 10, + "person", + "Pavel", + 0.8, + DetectionSource::Ner, + ), + ], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 1); + assert_eq!(result.first().expect("person").text, "Ing. Pavel Novák"); +} + +#[test] +fn boundary_does_not_merge_legal_form_orgs_across_comma() { + let full_text = "Twitter, Inc., X Corp."; + let result = enforce_boundary_consistency( + &[ + PipelineEntity::detected( + 0, + 13, + "organization", + "Twitter, Inc.", + 0.9, + DetectionSource::LegalForm, + ), + PipelineEntity::detected( + 15, + 22, + "organization", + "X Corp.", + 0.8, + DetectionSource::LegalForm, + ), + ], + full_text, + ) + .unwrap(); + + assert_eq!( + result + .iter() + .map(|entry| entry.text.as_str()) + .collect::>(), + vec!["Twitter, Inc.", "X Corp."] + ); +} From dcca9be1ae3c20e6c00fb009ae6a8c5cc97cc30e Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 10:40:47 +0200 Subject: [PATCH 010/130] chore: split core resolution modules --- crates/anonymize-core/src/resolution.rs | 1093 ----------------- .../anonymize-core/src/resolution/boundary.rs | 405 ++++++ .../anonymize-core/src/resolution/common.rs | 23 + crates/anonymize-core/src/resolution/merge.rs | 368 ++++++ crates/anonymize-core/src/resolution/mod.rs | 10 + .../anonymize-core/src/resolution/sanitize.rs | 222 ++++ crates/anonymize-core/src/resolution/types.rs | 90 ++ 7 files changed, 1118 insertions(+), 1093 deletions(-) delete mode 100644 crates/anonymize-core/src/resolution.rs create mode 100644 crates/anonymize-core/src/resolution/boundary.rs create mode 100644 crates/anonymize-core/src/resolution/common.rs create mode 100644 crates/anonymize-core/src/resolution/merge.rs create mode 100644 crates/anonymize-core/src/resolution/mod.rs create mode 100644 crates/anonymize-core/src/resolution/sanitize.rs create mode 100644 crates/anonymize-core/src/resolution/types.rs diff --git 
a/crates/anonymize-core/src/resolution.rs b/crates/anonymize-core/src/resolution.rs deleted file mode 100644 index f92d7b1c..00000000 --- a/crates/anonymize-core/src/resolution.rs +++ /dev/null @@ -1,1093 +0,0 @@ -use std::collections::{BTreeMap, BTreeSet}; - -use crate::types::{EntityKind, Result}; -use crate::utf16::Utf16Offsets; - -const LEGAL_PERIOD_SUFFIXES: &str = - include_str!("../data/legal-period-suffixes.txt"); -const ADDRESS_FINAL_ABBREVS: &str = - include_str!("../data/address-final-abbrevs.txt"); - -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum DetectionSource { - Trigger, - Regex, - DenyList, - LegalForm, - Gazetteer, - Country, - Ner, - Coreference, -} - -impl DetectionSource { - const fn priority(self) -> u8 { - match self { - Self::Gazetteer => 5, - Self::Trigger => 4, - Self::LegalForm | Self::Regex | Self::Country => 3, - Self::DenyList | Self::Coreference => 2, - Self::Ner => 1, - } - } -} - -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum SourceDetail { - CustomDenyList, - CustomRegex, - GazetteerExtension, -} - -#[derive(Clone, Debug, PartialEq)] -pub struct PipelineEntity { - pub start: u32, - pub end: u32, - pub label: String, - pub text: String, - pub score: f64, - pub source: DetectionSource, - pub source_detail: Option, - pub kind: EntityKind, -} - -impl PipelineEntity { - #[must_use] - pub fn detected( - start: u32, - end: u32, - label: impl Into, - text: impl Into, - score: f64, - source: DetectionSource, - ) -> Self { - Self { - start, - end, - label: label.into(), - text: text.into(), - score, - source, - source_detail: None, - kind: EntityKind::Detected, - } - } - - #[must_use] - pub fn coreference( - start: u32, - end: u32, - label: impl Into, - text: impl Into, - score: f64, - source_text: impl Into, - ) -> Self { - Self { - start, - end, - label: label.into(), - text: text.into(), - score, - source: DetectionSource::Coreference, - source_detail: None, - kind: EntityKind::Coreference { - source_text: 
source_text.into(), - }, - } - } -} - -#[must_use] -pub fn merge_and_dedup(entities: &[PipelineEntity]) -> Vec { - if entities.is_empty() { - return Vec::new(); - } - - let mut sorted = entities.to_vec(); - sorted.sort_by_key(|entity| entity.start); - - let Some(first) = sorted.first() else { - return Vec::new(); - }; - let mut merged = vec![first.clone()]; - - for entity in sorted.into_iter().skip(1) { - let overlaps = overlapping_indexes(&merged, &entity); - if overlaps.is_empty() { - merged.push(entity); - continue; - } - - let has_partial_overlap = overlaps.iter().any(|index| { - merged.get(*index).is_some_and(|existing| { - existing.start != entity.start || existing.end != entity.end - }) - }); - - if !has_partial_overlap { - let same_label_index = overlaps.iter().find_map(|index| { - merged - .get(*index) - .is_some_and(|existing| existing.label == entity.label) - .then_some(*index) - }); - - let Some(index) = same_label_index else { - merged.push(entity); - merged.sort_by_key(|entry| entry.start); - continue; - }; - - if let Some(existing) = merged.get(index) - && should_replace(&entity, existing) - { - replace_at(&mut merged, index, entity); - } - continue; - } - - let replaces_all = overlaps.iter().all(|index| { - merged - .get(*index) - .is_some_and(|existing| should_replace(&entity, existing)) - }); - if !replaces_all { - continue; - } - - let Some(insert_at) = overlaps.first().copied() else { - continue; - }; - for index in overlaps.iter().rev() { - remove_at(&mut merged, *index); - } - insert_at_or_push(&mut merged, insert_at, entity); - } - - resolve_same_span_label_conflicts(&sanitize_entities(&merged)) -} - -#[must_use] -pub fn sanitize_entities(entities: &[PipelineEntity]) -> Vec { - let mut sanitized = Vec::new(); - - for entity in entities { - if is_caller_owned(entity) || has_curated_literal_boundary(entity) { - sanitized.push(entity.clone()); - continue; - } - - let Some(cleaned) = clean_entity_text(entity) else { - continue; - }; - 
sanitized.push(cleaned); - } - - sanitized -} - -pub fn enforce_boundary_consistency( - entities: &[PipelineEntity], - full_text: &str, -) -> Result> { - let offsets = Utf16Offsets::new(full_text); - let spans = char_spans(full_text); - let boundaries = word_boundaries(&spans); - let fixed = - fix_partial_words(entities, full_text, &offsets, &spans, &boundaries)?; - let resolved = resolve_cross_label_overlaps(&fixed, full_text, &offsets)?; - let deduped = deduplicate_spans(&resolved); - let merged = merge_adjacent(&deduped, full_text, &offsets)?; - Ok(remove_nested_same_label(&merged)) -} - -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -struct CharSpan { - start: u32, - end: u32, - ch: char, -} - -fn fix_partial_words( - entities: &[PipelineEntity], - full_text: &str, - offsets: &Utf16Offsets, - spans: &[CharSpan], - boundaries: &BTreeSet, -) -> Result> { - let mut sorted = entities.to_vec(); - sorted.sort_by_key(|entity| entity.start); - let mut fixed = Vec::with_capacity(sorted.len()); - - for (index, entity) in sorted.iter().enumerate() { - if has_locked_boundary(entity) || has_detector_locked_boundary(entity) { - fixed.push(entity.clone()); - continue; - } - - if entity.text != offsets.slice(full_text, entity.start, entity.end)? 
{ - fixed.push(entity.clone()); - continue; - } - - let mut new_start = word_start_at(entity.start, boundaries, spans); - let mut new_end = word_end_at(entity.end, boundaries, spans); - - for (other_index, other) in sorted.iter().enumerate() { - if other_index == index || other.label == entity.label { - continue; - } - if other.end > new_start && other.end <= entity.start { - new_start = new_start.max(other.end); - } - if other.start >= entity.end && other.start < new_end { - new_end = new_end.min(other.start); - } - } - - if new_start == entity.start && new_end == entity.end { - fixed.push(entity.clone()); - continue; - } - - let mut adjusted = entity.clone(); - adjusted.start = new_start; - adjusted.end = new_end; - adjusted.text = offsets.slice(full_text, new_start, new_end)?; - fixed.push(adjusted); - } - - Ok(fixed) -} - -fn resolve_cross_label_overlaps( - entities: &[PipelineEntity], - full_text: &str, - offsets: &Utf16Offsets, -) -> Result> { - let mut sorted = entities.to_vec(); - sorted.sort_by_key(|entity| entity.start); - - let mut left_index = 0; - while left_index < sorted.len() { - let mut right_index = left_index.saturating_add(1); - while right_index < sorted.len() { - let Some(left) = sorted.get(left_index) else { - break; - }; - let Some(right) = sorted.get(right_index) else { - break; - }; - if right.start >= left.end { - break; - } - if left.label == right.label - || contains_span(left, right) - || contains_span(right, left) - { - right_index = right_index.saturating_add(1); - continue; - } - - let left_len = entity_len(left); - let right_len = entity_len(right); - let left_locked = has_locked_boundary(left); - let right_locked = has_locked_boundary(right); - let left_wins = if left_locked == right_locked { - match left.score.total_cmp(&right.score) { - std::cmp::Ordering::Greater => true, - std::cmp::Ordering::Less => false, - std::cmp::Ordering::Equal => left_len >= right_len, - } - } else { - left_locked - }; - - if left_wins { - let 
new_start = left.end; - if let Some(right_mut) = sorted.get_mut(right_index) { - right_mut.start = new_start; - right_mut.text = - offsets.slice(full_text, new_start, right_mut.end)?; - } - right_index = right_index.saturating_add(1); - continue; - } - - let new_end = right.start; - if let Some(left_mut) = sorted.get_mut(left_index) { - left_mut.end = new_end; - left_mut.text = offsets.slice(full_text, left_mut.start, new_end)?; - } - break; - } - - left_index = left_index.saturating_add(1); - } - - Ok( - sorted - .into_iter() - .filter(|entity| entity.start < entity.end) - .collect(), - ) -} - -fn deduplicate_spans(entities: &[PipelineEntity]) -> Vec { - let mut seen = BTreeMap::<(u32, u32, String), PipelineEntity>::new(); - - for entity in entities { - let key = (entity.start, entity.end, entity.label.clone()); - let replace = seen - .get(&key) - .is_none_or(|existing| entity.score.total_cmp(&existing.score).is_gt()); - if replace { - seen.insert(key, entity.clone()); - } - } - - seen.into_values().collect() -} - -fn merge_adjacent( - entities: &[PipelineEntity], - full_text: &str, - offsets: &Utf16Offsets, -) -> Result> { - let mut sorted = entities.to_vec(); - sorted.sort_by_key(|entity| entity.start); - let mut result = Vec::::new(); - let mut last_by_label = BTreeMap::::new(); - - for entity in &sorted { - if has_locked_boundary(entity) { - result.push(entity.clone()); - continue; - } - - let Some(previous_index) = last_by_label.get(&entity.label).copied() else { - let index = result.len(); - result.push(entity.clone()); - last_by_label.insert(entity.label.clone(), index); - continue; - }; - - let Some(previous) = result.get(previous_index) else { - let index = result.len(); - result.push(entity.clone()); - last_by_label.insert(entity.label.clone(), index); - continue; - }; - - if !has_locked_boundary(previous) && entity.start < previous.end { - merge_into_previous( - &mut result, - previous_index, - entity, - full_text, - offsets, - )?; - continue; - } - - 
let gap = offsets.slice(full_text, previous.end, entity.start)?; - let gap_start = previous.end; - let gap_end = entity.start; - let gap_occupied = sorted.iter().any(|other| { - other.label != entity.label - && other.start < gap_end - && other.end > gap_start - }); - let legal_form_comma = (is_legal_form_organization(previous) - || is_legal_form_organization(entity)) - && gap.contains(','); - - if !has_locked_boundary(previous) - && !legal_form_comma - && entity.label != "country" - && !gap_occupied - && is_mergeable_gap(&gap) - { - merge_into_previous( - &mut result, - previous_index, - entity, - full_text, - offsets, - )?; - continue; - } - - let index = result.len(); - result.push(entity.clone()); - last_by_label.insert(entity.label.clone(), index); - } - - Ok(result) -} - -fn remove_nested_same_label( - entities: &[PipelineEntity], -) -> Vec { - let mut sorted = entities.to_vec(); - sorted.sort_by(|left, right| { - left - .start - .cmp(&right.start) - .then_with(|| entity_len(right).cmp(&entity_len(left))) - }); - - let mut result = Vec::new(); - let mut max_end_by_label = BTreeMap::::new(); - - for entity in sorted { - if max_end_by_label - .get(&entity.label) - .is_some_and(|max_end| entity.end <= *max_end) - { - continue; - } - max_end_by_label.insert(entity.label.clone(), entity.end); - result.push(entity); - } - - result -} - -fn overlapping_indexes( - entities: &[PipelineEntity], - entity: &PipelineEntity, -) -> Vec { - entities - .iter() - .enumerate() - .filter_map(|(index, existing)| { - (existing.end > entity.start && existing.start < entity.end) - .then_some(index) - }) - .collect() -} - -fn should_replace( - candidate: &PipelineEntity, - existing: &PipelineEntity, -) -> bool { - let candidate_len = entity_len(candidate); - let existing_len = entity_len(existing); - let candidate_caller_owned = is_caller_owned(candidate); - let existing_caller_owned = is_caller_owned(existing); - if candidate_caller_owned != existing_caller_owned { - return 
candidate_caller_owned; - } - - if literal_contains(candidate, existing) && candidate_len > existing_len { - return true; - } - if literal_contains(existing, candidate) && existing_len > candidate_len { - return false; - } - - if address_contains_bare_postal(candidate, existing) - && candidate_len > existing_len - { - return true; - } - if address_contains_bare_postal(existing, candidate) - && existing_len > candidate_len - { - return false; - } - - if legal_form_contains(candidate, existing) && candidate_len > existing_len { - return true; - } - if legal_form_contains(existing, candidate) && existing_len > candidate_len { - return false; - } - - if same_start_longest_wins(candidate, existing) - && candidate_len != existing_len - { - return candidate_len > existing_len; - } - - if country_inside_person_or_org(candidate, existing) - && existing_len > candidate_len - { - return false; - } - if country_inside_person_or_org(existing, candidate) - && candidate_len > existing_len - { - return true; - } - - let candidate_priority = candidate.source.priority(); - let existing_priority = existing.source.priority(); - if candidate_priority != existing_priority { - return candidate_priority > existing_priority; - } - - match candidate.score.total_cmp(&existing.score) { - std::cmp::Ordering::Greater => true, - std::cmp::Ordering::Less => false, - std::cmp::Ordering::Equal => candidate_len > existing_len, - } -} - -fn resolve_same_span_label_conflicts( - entities: &[PipelineEntity], -) -> Vec { - if entities.len() < 2 { - return entities.to_vec(); - } - - let mut by_offsets = BTreeMap::<(u32, u32), Vec>::new(); - for (index, entity) in entities.iter().enumerate() { - by_offsets - .entry((entity.start, entity.end)) - .or_default() - .push(index); - } - - let mut dropped = BTreeSet::::new(); - for group in by_offsets.values() { - if group.len() < 2 { - continue; - } - - let labels = group - .iter() - .filter_map(|index| entities.get(*index)) - .map(|entity| entity.label.as_str()) 
- .collect::>(); - if labels.len() < 2 { - continue; - } - - let has_person = labels.contains("person"); - let has_precise_non_address = labels - .iter() - .any(|label| *label != "address" && precise_over_address(label)); - let mut yielding_to_person = BTreeSet::::new(); - - if has_person { - for index in group { - let Some(entity) = entities.get(*index) else { - continue; - }; - if !is_caller_owned(entity) && person_preferred_over(&entity.label) { - yielding_to_person.insert(*index); - } - } - } - - let mut max_priority = None::; - for index in group { - let Some(entity) = entities.get(*index) else { - continue; - }; - if is_caller_owned(entity) || yielding_to_person.contains(index) { - continue; - } - max_priority = Some(max_priority.map_or_else( - || entity.source.priority(), - |priority| priority.max(entity.source.priority()), - )); - } - - for index in group { - let Some(entity) = entities.get(*index) else { - continue; - }; - if is_caller_owned(entity) { - continue; - } - if yielding_to_person.contains(index) { - dropped.insert(*index); - continue; - } - if max_priority - .is_some_and(|priority| entity.source.priority() < priority) - { - dropped.insert(*index); - continue; - } - if has_precise_non_address && entity.label == "address" { - dropped.insert(*index); - } - } - } - - entities - .iter() - .enumerate() - .filter(|(index, _)| !dropped.contains(index)) - .map(|(_, entity)| entity.clone()) - .collect() -} - -fn clean_entity_text(entity: &PipelineEntity) -> Option { - let mut start_byte = 0; - let mut end_byte = entity.text.len(); - - while let Some((ch, len)) = first_char(entity.text.get(start_byte..end_byte)?) - { - if ch.is_whitespace() || is_leading_trim(ch, &entity.label) { - start_byte = start_byte.saturating_add(len); - continue; - } - break; - } - - while let Some((ch, len)) = last_char(entity.text.get(start_byte..end_byte)?) 
- { - if ch.is_whitespace() || is_trailing_trim(ch, &entity.label) { - end_byte = end_byte.saturating_sub(len); - continue; - } - break; - } - - if should_strip_period(entity, start_byte, end_byte) { - end_byte = end_byte.saturating_sub('.'.len_utf8()); - } - - while let Some((ch, len)) = last_char(entity.text.get(start_byte..end_byte)?) - { - if ch.is_whitespace() || is_trailing_trim(ch, &entity.label) { - end_byte = end_byte.saturating_sub(len); - continue; - } - break; - } - - if start_byte >= end_byte { - return None; - } - - let cleaned_raw = entity.text.get(start_byte..end_byte)?; - if !cleaned_raw.chars().any(char::is_alphanumeric) { - return None; - } - - let display_text = collapse_display_whitespace(cleaned_raw); - let start = entity.start.saturating_add(utf16_len( - entity.text.get(..start_byte).unwrap_or_default(), - )); - let end = start.saturating_add(utf16_len(cleaned_raw)); - - let mut cleaned = entity.clone(); - cleaned.start = start; - cleaned.end = end; - cleaned.text = display_text; - Some(cleaned) -} - -fn replace_at( - entities: &mut [PipelineEntity], - index: usize, - entity: PipelineEntity, -) { - if let Some(slot) = entities.get_mut(index) { - *slot = entity; - } -} - -fn remove_at(entities: &mut Vec, index: usize) { - if index < entities.len() { - entities.remove(index); - } -} - -fn insert_at_or_push( - entities: &mut Vec, - index: usize, - entity: PipelineEntity, -) { - if index <= entities.len() { - entities.insert(index, entity); - return; - } - entities.push(entity); -} - -fn char_spans(text: &str) -> Vec { - let mut spans = Vec::new(); - let mut offset = 0_u32; - - for ch in text.chars() { - let width = u32::try_from(ch.len_utf16()).unwrap_or(u32::MAX); - let end = offset.saturating_add(width); - spans.push(CharSpan { - start: offset, - end, - ch, - }); - offset = end; - } - - spans -} - -fn word_boundaries(spans: &[CharSpan]) -> BTreeSet { - let mut boundaries = BTreeSet::new(); - let mut run_start = None::; - let mut run_end = 
None::; - - for span in spans { - if span.ch.is_alphanumeric() { - if run_start.is_none() { - run_start = Some(span.start); - } - run_end = Some(span.end); - continue; - } - - if let (Some(start), Some(end)) = (run_start.take(), run_end.take()) { - boundaries.insert(start); - boundaries.insert(end); - } - } - - if let (Some(start), Some(end)) = (run_start, run_end) { - boundaries.insert(start); - boundaries.insert(end); - } - - boundaries -} - -fn word_start_at( - position: u32, - boundaries: &BTreeSet, - spans: &[CharSpan], -) -> u32 { - let mut cursor = position; - while cursor > 0 && !boundaries.contains(&cursor) { - let Some(previous) = spans.iter().rev().find(|span| span.end <= cursor) - else { - return cursor; - }; - if is_word_start_stop(previous.ch) { - return cursor; - } - cursor = previous.start; - } - cursor -} - -fn word_end_at( - position: u32, - boundaries: &BTreeSet, - spans: &[CharSpan], -) -> u32 { - let mut cursor = position; - let text_end = spans.last().map_or(0, |span| span.end); - while cursor < text_end && !boundaries.contains(&cursor) { - let Some(next) = spans.iter().find(|span| span.start >= cursor) else { - return cursor; - }; - if is_word_end_stop(next.ch) { - return cursor; - } - cursor = next.end; - } - cursor -} - -fn merge_into_previous( - entities: &mut [PipelineEntity], - previous_index: usize, - entity: &PipelineEntity, - full_text: &str, - offsets: &Utf16Offsets, -) -> Result<()> { - if let Some(previous) = entities.get_mut(previous_index) { - previous.end = previous.end.max(entity.end); - previous.text = offsets.slice(full_text, previous.start, previous.end)?; - if entity.score.total_cmp(&previous.score).is_gt() { - previous.score = entity.score; - } - } - Ok(()) -} - -const fn contains_span(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { - outer.start <= inner.start && outer.end >= inner.end -} - -const fn has_locked_boundary(entity: &PipelineEntity) -> bool { - is_caller_owned(entity) -} - -fn 
has_detector_locked_boundary(entity: &PipelineEntity) -> bool { - entity.label == "phone number" && entity.source == DetectionSource::Trigger -} - -fn is_legal_form_organization(entity: &PipelineEntity) -> bool { - entity.label == "organization" && entity.source == DetectionSource::LegalForm -} - -fn is_mergeable_gap(gap: &str) -> bool { - gap.is_empty() - || (utf16_len(gap) <= 3 - && gap.chars().all(|ch| matches!(ch, ' ' | '\t' | ',' | '-'))) -} - -const fn is_word_start_stop(ch: char) -> bool { - matches!(ch, '\n' | '\r' | ',' | ';' | '(' | ')' | '[' | ']' | '&') -} - -const fn is_word_end_stop(ch: char) -> bool { - matches!( - ch, - '\n' | '\r' | ',' | ';' | '.' | '(' | ')' | '[' | ']' | '&' - ) -} - -const fn entity_len(entity: &PipelineEntity) -> u32 { - entity.end.saturating_sub(entity.start) -} - -const fn is_caller_owned(entity: &PipelineEntity) -> bool { - matches!( - entity.source_detail, - Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) - ) -} - -fn literal_contains(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { - outer.label == inner.label - && matches!( - outer.source, - DetectionSource::DenyList | DetectionSource::Gazetteer - ) - && outer.start <= inner.start - && outer.end >= inner.end -} - -fn address_contains_bare_postal( - outer: &PipelineEntity, - inner: &PipelineEntity, -) -> bool { - outer.label == "address" - && inner.label == "address" - && outer.start <= inner.start - && outer.end >= inner.end - && is_bare_postal_code(&inner.text) -} - -fn legal_form_contains(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { - outer.label == inner.label - && outer.source == DetectionSource::LegalForm - && outer.start <= inner.start - && outer.end >= inner.end -} - -fn same_start_longest_wins( - candidate: &PipelineEntity, - existing: &PipelineEntity, -) -> bool { - candidate.label == existing.label - && candidate.start == existing.start - && longest_wins_label(&candidate.label) -} - -fn country_inside_person_or_org( - 
country: &PipelineEntity, - container: &PipelineEntity, -) -> bool { - country.label == "country" - && matches!(container.label.as_str(), "person" | "organization") - && container.start <= country.start - && container.end >= country.end -} - -fn has_curated_literal_boundary(entity: &PipelineEntity) -> bool { - matches!( - entity.source, - DetectionSource::DenyList | DetectionSource::Gazetteer - ) && entity.label != "person" - && entity.source_detail != Some(SourceDetail::GazetteerExtension) - && entity - .text - .chars() - .next() - .into_iter() - .chain(entity.text.chars().next_back()) - .any(is_literal_boundary_punct) -} - -fn is_leading_trim(ch: char, label: &str) -> bool { - if label_allows_colon(label) { - matches!( - ch, - ',' | ';' | '"' | '\'' | '“' | '”' | '‘' | '’' | '«' | '¿' | '¡' - ) - } else { - matches!( - ch, - ',' | ';' | ':' | '"' | '\'' | '“' | '”' | '‘' | '’' | '«' | '¿' | '¡' - ) - } -} - -fn is_trailing_trim(ch: char, label: &str) -> bool { - if label_allows_colon(label) { - matches!( - ch, - ',' | ';' | '"' | '\'' | '“' | '”' | '‘' | '’' | '»' | '!' | '?' - ) - } else { - matches!( - ch, - ',' | ';' | ':' | '"' | '\'' | '“' | '”' | '‘' | '’' | '»' | '!' | '?' - ) - } -} - -const fn is_literal_boundary_punct(ch: char) -> bool { - matches!( - ch, - '"' - | '\'' - | '“' - | '”' - | '„' - | '‟' - | '‘' - | '’' - | '‛' - | '«' - | '»' - | '!' - | '.' 
- ) -} - -fn should_strip_period( - entity: &PipelineEntity, - start_byte: usize, - end_byte: usize, -) -> bool { - if !matches!( - entity.label.as_str(), - "organization" | "location" | "address" - ) { - return false; - } - let Some(text) = entity.text.get(start_byte..end_byte) else { - return false; - }; - if !text.ends_with('.') || known_period_suffix(text) { - return false; - } - if entity.label == "address" && known_address_final_abbrev(text) { - return false; - } - !(entity.label == "location" && known_location_final_abbrev(text)) -} - -fn known_period_suffix(text: &str) -> bool { - LEGAL_PERIOD_SUFFIXES - .lines() - .any(|suffix| text.ends_with(suffix)) -} - -fn known_address_final_abbrev(text: &str) -> bool { - ADDRESS_FINAL_ABBREVS.lines().any(|suffix| { - text - .strip_suffix(suffix) - .is_some_and(|prefix| prefix.ends_with(char::is_whitespace)) - }) -} - -fn known_location_final_abbrev(text: &str) -> bool { - text.ends_with("D.C.") - || text - .split_whitespace() - .next_back() - .is_some_and(|token| token.chars().filter(|ch| *ch == '.').count() >= 2) -} - -fn label_allows_colon(label: &str) -> bool { - matches!(label, "ip address" | "mac address") -} - -fn longest_wins_label(label: &str) -> bool { - matches!( - label, - "date" - | "date of birth" - | "monetary amount" - | "phone number" - | "email address" - | "url" - ) -} - -fn precise_over_address(label: &str) -> bool { - matches!( - label, - "person" - | "date" - | "date of birth" - | "phone number" - | "email address" - | "monetary amount" - | "iban" - | "bank account number" - | "tax identification number" - | "registration number" - | "identity card number" - | "national identification number" - | "passport number" - | "credit card number" - ) -} - -fn person_preferred_over(label: &str) -> bool { - matches!(label, "address" | "country" | "land parcel") -} - -fn is_bare_postal_code(text: &str) -> bool { - let compact = text - .chars() - .filter(|ch| !ch.is_whitespace() && *ch != '-' && *ch != '–') 
- .collect::(); - let len = compact.len(); - matches!(len, 5 | 8 | 9) && compact.chars().all(|ch| ch.is_ascii_digit()) -} - -fn collapse_display_whitespace(text: &str) -> String { - let mut output = String::new(); - let mut in_whitespace = false; - - for ch in text.chars() { - if ch.is_whitespace() { - if !in_whitespace { - output.push(' '); - in_whitespace = true; - } - continue; - } - - output.push(ch); - in_whitespace = false; - } - - output -} - -fn first_char(text: &str) -> Option<(char, usize)> { - text.chars().next().map(|ch| (ch, ch.len_utf8())) -} - -fn last_char(text: &str) -> Option<(char, usize)> { - text.chars().next_back().map(|ch| (ch, ch.len_utf8())) -} - -fn utf16_len(text: &str) -> u32 { - u32::try_from(text.encode_utf16().count()).unwrap_or(u32::MAX) -} diff --git a/crates/anonymize-core/src/resolution/boundary.rs b/crates/anonymize-core/src/resolution/boundary.rs new file mode 100644 index 00000000..4aab511c --- /dev/null +++ b/crates/anonymize-core/src/resolution/boundary.rs @@ -0,0 +1,405 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::types::Result; +use crate::utf16::Utf16Offsets; + +use super::common::{contains_span, entity_len, is_caller_owned, utf16_len}; +use super::{DetectionSource, PipelineEntity}; + +pub fn enforce_boundary_consistency( + entities: &[PipelineEntity], + full_text: &str, +) -> Result> { + let offsets = Utf16Offsets::new(full_text); + let spans = char_spans(full_text); + let boundaries = word_boundaries(&spans); + let fixed = + fix_partial_words(entities, full_text, &offsets, &spans, &boundaries)?; + let resolved = resolve_cross_label_overlaps(&fixed, full_text, &offsets)?; + let deduped = deduplicate_spans(&resolved); + let merged = merge_adjacent(&deduped, full_text, &offsets)?; + Ok(remove_nested_same_label(&merged)) +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CharSpan { + start: u32, + end: u32, + ch: char, +} + +fn fix_partial_words( + entities: &[PipelineEntity], + full_text: &str, + 
offsets: &Utf16Offsets, + spans: &[CharSpan], + boundaries: &BTreeSet, +) -> Result> { + let mut sorted = entities.to_vec(); + sorted.sort_by_key(|entity| entity.start); + let mut fixed = Vec::with_capacity(sorted.len()); + + for (index, entity) in sorted.iter().enumerate() { + if has_locked_boundary(entity) || has_detector_locked_boundary(entity) { + fixed.push(entity.clone()); + continue; + } + + if entity.text != offsets.slice(full_text, entity.start, entity.end)? { + fixed.push(entity.clone()); + continue; + } + + let mut new_start = word_start_at(entity.start, boundaries, spans); + let mut new_end = word_end_at(entity.end, boundaries, spans); + + for (other_index, other) in sorted.iter().enumerate() { + if other_index == index || other.label == entity.label { + continue; + } + if other.end > new_start && other.end <= entity.start { + new_start = new_start.max(other.end); + } + if other.start >= entity.end && other.start < new_end { + new_end = new_end.min(other.start); + } + } + + if new_start == entity.start && new_end == entity.end { + fixed.push(entity.clone()); + continue; + } + + let mut adjusted = entity.clone(); + adjusted.start = new_start; + adjusted.end = new_end; + adjusted.text = offsets.slice(full_text, new_start, new_end)?; + fixed.push(adjusted); + } + + Ok(fixed) +} + +fn resolve_cross_label_overlaps( + entities: &[PipelineEntity], + full_text: &str, + offsets: &Utf16Offsets, +) -> Result> { + let mut sorted = entities.to_vec(); + sorted.sort_by_key(|entity| entity.start); + + let mut left_index = 0; + while left_index < sorted.len() { + let mut right_index = left_index.saturating_add(1); + while right_index < sorted.len() { + let Some(left) = sorted.get(left_index) else { + break; + }; + let Some(right) = sorted.get(right_index) else { + break; + }; + if right.start >= left.end { + break; + } + if left.label == right.label + || contains_span(left, right) + || contains_span(right, left) + { + right_index = right_index.saturating_add(1); + 
continue; + } + + let left_len = entity_len(left); + let right_len = entity_len(right); + let left_locked = has_locked_boundary(left); + let right_locked = has_locked_boundary(right); + let left_wins = if left_locked == right_locked { + match left.score.total_cmp(&right.score) { + std::cmp::Ordering::Greater => true, + std::cmp::Ordering::Less => false, + std::cmp::Ordering::Equal => left_len >= right_len, + } + } else { + left_locked + }; + + if left_wins { + let new_start = left.end; + if let Some(right_mut) = sorted.get_mut(right_index) { + right_mut.start = new_start; + right_mut.text = + offsets.slice(full_text, new_start, right_mut.end)?; + } + right_index = right_index.saturating_add(1); + continue; + } + + let new_end = right.start; + if let Some(left_mut) = sorted.get_mut(left_index) { + left_mut.end = new_end; + left_mut.text = offsets.slice(full_text, left_mut.start, new_end)?; + } + break; + } + + left_index = left_index.saturating_add(1); + } + + Ok( + sorted + .into_iter() + .filter(|entity| entity.start < entity.end) + .collect(), + ) +} + +fn deduplicate_spans(entities: &[PipelineEntity]) -> Vec { + let mut seen = BTreeMap::<(u32, u32, String), PipelineEntity>::new(); + + for entity in entities { + let key = (entity.start, entity.end, entity.label.clone()); + let replace = seen + .get(&key) + .is_none_or(|existing| entity.score.total_cmp(&existing.score).is_gt()); + if replace { + seen.insert(key, entity.clone()); + } + } + + seen.into_values().collect() +} + +fn merge_adjacent( + entities: &[PipelineEntity], + full_text: &str, + offsets: &Utf16Offsets, +) -> Result> { + let mut sorted = entities.to_vec(); + sorted.sort_by_key(|entity| entity.start); + let mut result = Vec::::new(); + let mut last_by_label = BTreeMap::::new(); + + for entity in &sorted { + if has_locked_boundary(entity) { + result.push(entity.clone()); + continue; + } + + let Some(previous_index) = last_by_label.get(&entity.label).copied() else { + let index = result.len(); + 
result.push(entity.clone()); + last_by_label.insert(entity.label.clone(), index); + continue; + }; + + let Some(previous) = result.get(previous_index) else { + let index = result.len(); + result.push(entity.clone()); + last_by_label.insert(entity.label.clone(), index); + continue; + }; + + if !has_locked_boundary(previous) && entity.start < previous.end { + merge_into_previous( + &mut result, + previous_index, + entity, + full_text, + offsets, + )?; + continue; + } + + let gap = offsets.slice(full_text, previous.end, entity.start)?; + let gap_start = previous.end; + let gap_end = entity.start; + let gap_occupied = sorted.iter().any(|other| { + other.label != entity.label + && other.start < gap_end + && other.end > gap_start + }); + let legal_form_comma = (is_legal_form_organization(previous) + || is_legal_form_organization(entity)) + && gap.contains(','); + + if !has_locked_boundary(previous) + && !legal_form_comma + && entity.label != "country" + && !gap_occupied + && is_mergeable_gap(&gap) + { + merge_into_previous( + &mut result, + previous_index, + entity, + full_text, + offsets, + )?; + continue; + } + + let index = result.len(); + result.push(entity.clone()); + last_by_label.insert(entity.label.clone(), index); + } + + Ok(result) +} + +fn remove_nested_same_label( + entities: &[PipelineEntity], +) -> Vec { + let mut sorted = entities.to_vec(); + sorted.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| entity_len(right).cmp(&entity_len(left))) + }); + + let mut result = Vec::new(); + let mut max_end_by_label = BTreeMap::::new(); + + for entity in sorted { + if max_end_by_label + .get(&entity.label) + .is_some_and(|max_end| entity.end <= *max_end) + { + continue; + } + max_end_by_label.insert(entity.label.clone(), entity.end); + result.push(entity); + } + + result +} + +fn char_spans(text: &str) -> Vec { + let mut spans = Vec::new(); + let mut offset = 0_u32; + + for ch in text.chars() { + let width = 
u32::try_from(ch.len_utf16()).unwrap_or(u32::MAX); + let end = offset.saturating_add(width); + spans.push(CharSpan { + start: offset, + end, + ch, + }); + offset = end; + } + + spans +} + +fn word_boundaries(spans: &[CharSpan]) -> BTreeSet { + let mut boundaries = BTreeSet::new(); + let mut run_start = None::; + let mut run_end = None::; + + for span in spans { + if span.ch.is_alphanumeric() { + if run_start.is_none() { + run_start = Some(span.start); + } + run_end = Some(span.end); + continue; + } + + if let (Some(start), Some(end)) = (run_start.take(), run_end.take()) { + boundaries.insert(start); + boundaries.insert(end); + } + } + + if let (Some(start), Some(end)) = (run_start, run_end) { + boundaries.insert(start); + boundaries.insert(end); + } + + boundaries +} + +fn word_start_at( + position: u32, + boundaries: &BTreeSet, + spans: &[CharSpan], +) -> u32 { + let mut cursor = position; + while cursor > 0 && !boundaries.contains(&cursor) { + let Some(previous) = spans.iter().rev().find(|span| span.end <= cursor) + else { + return cursor; + }; + if is_word_start_stop(previous.ch) { + return cursor; + } + cursor = previous.start; + } + cursor +} + +fn word_end_at( + position: u32, + boundaries: &BTreeSet, + spans: &[CharSpan], +) -> u32 { + let mut cursor = position; + let text_end = spans.last().map_or(0, |span| span.end); + while cursor < text_end && !boundaries.contains(&cursor) { + let Some(next) = spans.iter().find(|span| span.start >= cursor) else { + return cursor; + }; + if is_word_end_stop(next.ch) { + return cursor; + } + cursor = next.end; + } + cursor +} + +fn merge_into_previous( + entities: &mut [PipelineEntity], + previous_index: usize, + entity: &PipelineEntity, + full_text: &str, + offsets: &Utf16Offsets, +) -> Result<()> { + if let Some(previous) = entities.get_mut(previous_index) { + previous.end = previous.end.max(entity.end); + previous.text = offsets.slice(full_text, previous.start, previous.end)?; + if 
entity.score.total_cmp(&previous.score).is_gt() { + previous.score = entity.score; + } + } + Ok(()) +} + +const fn has_locked_boundary(entity: &PipelineEntity) -> bool { + is_caller_owned(entity) +} + +fn has_detector_locked_boundary(entity: &PipelineEntity) -> bool { + entity.label == "phone number" && entity.source == DetectionSource::Trigger +} + +fn is_legal_form_organization(entity: &PipelineEntity) -> bool { + entity.label == "organization" && entity.source == DetectionSource::LegalForm +} + +fn is_mergeable_gap(gap: &str) -> bool { + gap.is_empty() + || (utf16_len(gap) <= 3 + && gap.chars().all(|ch| matches!(ch, ' ' | '\t' | ',' | '-'))) +} + +const fn is_word_start_stop(ch: char) -> bool { + matches!(ch, '\n' | '\r' | ',' | ';' | '(' | ')' | '[' | ']' | '&') +} + +const fn is_word_end_stop(ch: char) -> bool { + matches!( + ch, + '\n' | '\r' | ',' | ';' | '.' | '(' | ')' | '[' | ']' | '&' + ) +} diff --git a/crates/anonymize-core/src/resolution/common.rs b/crates/anonymize-core/src/resolution/common.rs new file mode 100644 index 00000000..0bc00b2a --- /dev/null +++ b/crates/anonymize-core/src/resolution/common.rs @@ -0,0 +1,23 @@ +use super::{PipelineEntity, SourceDetail}; + +pub(crate) const fn contains_span( + outer: &PipelineEntity, + inner: &PipelineEntity, +) -> bool { + outer.start <= inner.start && outer.end >= inner.end +} + +pub(crate) const fn entity_len(entity: &PipelineEntity) -> u32 { + entity.end.saturating_sub(entity.start) +} + +pub(crate) const fn is_caller_owned(entity: &PipelineEntity) -> bool { + matches!( + entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) +} + +pub(crate) fn utf16_len(text: &str) -> u32 { + u32::try_from(text.encode_utf16().count()).unwrap_or(u32::MAX) +} diff --git a/crates/anonymize-core/src/resolution/merge.rs b/crates/anonymize-core/src/resolution/merge.rs new file mode 100644 index 00000000..ed2de2a3 --- /dev/null +++ b/crates/anonymize-core/src/resolution/merge.rs @@ -0,0 
+1,368 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use super::common::{entity_len, is_caller_owned}; +use super::sanitize::sanitize_entities; +use super::{DetectionSource, PipelineEntity}; + +#[must_use] +pub fn merge_and_dedup(entities: &[PipelineEntity]) -> Vec { + if entities.is_empty() { + return Vec::new(); + } + + let mut sorted = entities.to_vec(); + sorted.sort_by_key(|entity| entity.start); + + let Some(first) = sorted.first() else { + return Vec::new(); + }; + let mut merged = vec![first.clone()]; + + for entity in sorted.into_iter().skip(1) { + let overlaps = overlapping_indexes(&merged, &entity); + if overlaps.is_empty() { + merged.push(entity); + continue; + } + + let has_partial_overlap = overlaps.iter().any(|index| { + merged.get(*index).is_some_and(|existing| { + existing.start != entity.start || existing.end != entity.end + }) + }); + + if !has_partial_overlap { + let same_label_index = overlaps.iter().find_map(|index| { + merged + .get(*index) + .is_some_and(|existing| existing.label == entity.label) + .then_some(*index) + }); + + let Some(index) = same_label_index else { + merged.push(entity); + merged.sort_by_key(|entry| entry.start); + continue; + }; + + if let Some(existing) = merged.get(index) + && should_replace(&entity, existing) + { + replace_at(&mut merged, index, entity); + } + continue; + } + + let replaces_all = overlaps.iter().all(|index| { + merged + .get(*index) + .is_some_and(|existing| should_replace(&entity, existing)) + }); + if !replaces_all { + continue; + } + + let Some(insert_at) = overlaps.first().copied() else { + continue; + }; + for index in overlaps.iter().rev() { + remove_at(&mut merged, *index); + } + insert_at_or_push(&mut merged, insert_at, entity); + } + + resolve_same_span_label_conflicts(&sanitize_entities(&merged)) +} + +fn overlapping_indexes( + entities: &[PipelineEntity], + entity: &PipelineEntity, +) -> Vec { + entities + .iter() + .enumerate() + .filter_map(|(index, existing)| { + (existing.end > 
entity.start && existing.start < entity.end) + .then_some(index) + }) + .collect() +} + +fn should_replace( + candidate: &PipelineEntity, + existing: &PipelineEntity, +) -> bool { + let candidate_len = entity_len(candidate); + let existing_len = entity_len(existing); + let candidate_caller_owned = is_caller_owned(candidate); + let existing_caller_owned = is_caller_owned(existing); + if candidate_caller_owned != existing_caller_owned { + return candidate_caller_owned; + } + + if literal_contains(candidate, existing) && candidate_len > existing_len { + return true; + } + if literal_contains(existing, candidate) && existing_len > candidate_len { + return false; + } + + if address_contains_bare_postal(candidate, existing) + && candidate_len > existing_len + { + return true; + } + if address_contains_bare_postal(existing, candidate) + && existing_len > candidate_len + { + return false; + } + + if legal_form_contains(candidate, existing) && candidate_len > existing_len { + return true; + } + if legal_form_contains(existing, candidate) && existing_len > candidate_len { + return false; + } + + if same_start_longest_wins(candidate, existing) + && candidate_len != existing_len + { + return candidate_len > existing_len; + } + + if country_inside_person_or_org(candidate, existing) + && existing_len > candidate_len + { + return false; + } + if country_inside_person_or_org(existing, candidate) + && candidate_len > existing_len + { + return true; + } + + let candidate_priority = candidate.source.priority(); + let existing_priority = existing.source.priority(); + if candidate_priority != existing_priority { + return candidate_priority > existing_priority; + } + + match candidate.score.total_cmp(&existing.score) { + std::cmp::Ordering::Greater => true, + std::cmp::Ordering::Less => false, + std::cmp::Ordering::Equal => candidate_len > existing_len, + } +} + +fn resolve_same_span_label_conflicts( + entities: &[PipelineEntity], +) -> Vec { + if entities.len() < 2 { + return 
entities.to_vec(); + } + + let mut by_offsets = BTreeMap::<(u32, u32), Vec>::new(); + for (index, entity) in entities.iter().enumerate() { + by_offsets + .entry((entity.start, entity.end)) + .or_default() + .push(index); + } + + let mut dropped = BTreeSet::::new(); + for group in by_offsets.values() { + if group.len() < 2 { + continue; + } + + let labels = group + .iter() + .filter_map(|index| entities.get(*index)) + .map(|entity| entity.label.as_str()) + .collect::>(); + if labels.len() < 2 { + continue; + } + + let has_person = labels.contains("person"); + let has_precise_non_address = labels + .iter() + .any(|label| *label != "address" && precise_over_address(label)); + let mut yielding_to_person = BTreeSet::::new(); + + if has_person { + for index in group { + let Some(entity) = entities.get(*index) else { + continue; + }; + if !is_caller_owned(entity) && person_preferred_over(&entity.label) { + yielding_to_person.insert(*index); + } + } + } + + let mut max_priority = None::; + for index in group { + let Some(entity) = entities.get(*index) else { + continue; + }; + if is_caller_owned(entity) || yielding_to_person.contains(index) { + continue; + } + max_priority = Some(max_priority.map_or_else( + || entity.source.priority(), + |priority| priority.max(entity.source.priority()), + )); + } + + for index in group { + let Some(entity) = entities.get(*index) else { + continue; + }; + if is_caller_owned(entity) { + continue; + } + if yielding_to_person.contains(index) { + dropped.insert(*index); + continue; + } + if max_priority + .is_some_and(|priority| entity.source.priority() < priority) + { + dropped.insert(*index); + continue; + } + if has_precise_non_address && entity.label == "address" { + dropped.insert(*index); + } + } + } + + entities + .iter() + .enumerate() + .filter(|(index, _)| !dropped.contains(index)) + .map(|(_, entity)| entity.clone()) + .collect() +} + +fn replace_at( + entities: &mut [PipelineEntity], + index: usize, + entity: PipelineEntity, +) { + 
if let Some(slot) = entities.get_mut(index) { + *slot = entity; + } +} + +fn remove_at(entities: &mut Vec, index: usize) { + if index < entities.len() { + entities.remove(index); + } +} + +fn insert_at_or_push( + entities: &mut Vec, + index: usize, + entity: PipelineEntity, +) { + if index <= entities.len() { + entities.insert(index, entity); + return; + } + entities.push(entity); +} + +fn literal_contains(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { + outer.label == inner.label + && matches!( + outer.source, + DetectionSource::DenyList | DetectionSource::Gazetteer + ) + && outer.start <= inner.start + && outer.end >= inner.end +} + +fn address_contains_bare_postal( + outer: &PipelineEntity, + inner: &PipelineEntity, +) -> bool { + outer.label == "address" + && inner.label == "address" + && outer.start <= inner.start + && outer.end >= inner.end + && is_bare_postal_code(&inner.text) +} + +fn legal_form_contains(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { + outer.label == inner.label + && outer.source == DetectionSource::LegalForm + && outer.start <= inner.start + && outer.end >= inner.end +} + +fn same_start_longest_wins( + candidate: &PipelineEntity, + existing: &PipelineEntity, +) -> bool { + candidate.label == existing.label + && candidate.start == existing.start + && longest_wins_label(&candidate.label) +} + +fn country_inside_person_or_org( + country: &PipelineEntity, + container: &PipelineEntity, +) -> bool { + country.label == "country" + && matches!(container.label.as_str(), "person" | "organization") + && container.start <= country.start + && container.end >= country.end +} + +fn longest_wins_label(label: &str) -> bool { + matches!( + label, + "date" + | "date of birth" + | "monetary amount" + | "phone number" + | "email address" + | "url" + ) +} + +fn precise_over_address(label: &str) -> bool { + matches!( + label, + "person" + | "date" + | "date of birth" + | "phone number" + | "email address" + | "monetary amount" + | "iban" + 
| "bank account number" + | "tax identification number" + | "registration number" + | "identity card number" + | "national identification number" + | "passport number" + | "credit card number" + ) +} + +fn person_preferred_over(label: &str) -> bool { + matches!(label, "address" | "country" | "land parcel") +} + +fn is_bare_postal_code(text: &str) -> bool { + let compact = text + .chars() + .filter(|ch| !ch.is_whitespace() && *ch != '-' && *ch != '–') + .collect::(); + let len = compact.len(); + matches!(len, 5 | 8 | 9) && compact.chars().all(|ch| ch.is_ascii_digit()) +} diff --git a/crates/anonymize-core/src/resolution/mod.rs b/crates/anonymize-core/src/resolution/mod.rs new file mode 100644 index 00000000..379bae77 --- /dev/null +++ b/crates/anonymize-core/src/resolution/mod.rs @@ -0,0 +1,10 @@ +mod boundary; +mod common; +mod merge; +mod sanitize; +mod types; + +pub use boundary::enforce_boundary_consistency; +pub use merge::merge_and_dedup; +pub use sanitize::sanitize_entities; +pub use types::{DetectionSource, PipelineEntity, SourceDetail}; diff --git a/crates/anonymize-core/src/resolution/sanitize.rs b/crates/anonymize-core/src/resolution/sanitize.rs new file mode 100644 index 00000000..0c06b39c --- /dev/null +++ b/crates/anonymize-core/src/resolution/sanitize.rs @@ -0,0 +1,222 @@ +use super::common::{is_caller_owned, utf16_len}; +use super::{DetectionSource, PipelineEntity, SourceDetail}; + +const LEGAL_PERIOD_SUFFIXES: &str = + include_str!("../../data/legal-period-suffixes.txt"); +const ADDRESS_FINAL_ABBREVS: &str = + include_str!("../../data/address-final-abbrevs.txt"); + +#[must_use] +pub fn sanitize_entities(entities: &[PipelineEntity]) -> Vec { + let mut sanitized = Vec::new(); + + for entity in entities { + if is_caller_owned(entity) || has_curated_literal_boundary(entity) { + sanitized.push(entity.clone()); + continue; + } + + let Some(cleaned) = clean_entity_text(entity) else { + continue; + }; + sanitized.push(cleaned); + } + + sanitized +} + +fn 
clean_entity_text(entity: &PipelineEntity) -> Option { + let mut start_byte = 0; + let mut end_byte = entity.text.len(); + + while let Some((ch, len)) = first_char(entity.text.get(start_byte..end_byte)?) + { + if ch.is_whitespace() || is_leading_trim(ch, &entity.label) { + start_byte = start_byte.saturating_add(len); + continue; + } + break; + } + + while let Some((ch, len)) = last_char(entity.text.get(start_byte..end_byte)?) + { + if ch.is_whitespace() || is_trailing_trim(ch, &entity.label) { + end_byte = end_byte.saturating_sub(len); + continue; + } + break; + } + + if should_strip_period(entity, start_byte, end_byte) { + end_byte = end_byte.saturating_sub('.'.len_utf8()); + } + + while let Some((ch, len)) = last_char(entity.text.get(start_byte..end_byte)?) + { + if ch.is_whitespace() || is_trailing_trim(ch, &entity.label) { + end_byte = end_byte.saturating_sub(len); + continue; + } + break; + } + + if start_byte >= end_byte { + return None; + } + + let cleaned_raw = entity.text.get(start_byte..end_byte)?; + if !cleaned_raw.chars().any(char::is_alphanumeric) { + return None; + } + + let display_text = collapse_display_whitespace(cleaned_raw); + let start = entity.start.saturating_add(utf16_len( + entity.text.get(..start_byte).unwrap_or_default(), + )); + let end = start.saturating_add(utf16_len(cleaned_raw)); + + let mut cleaned = entity.clone(); + cleaned.start = start; + cleaned.end = end; + cleaned.text = display_text; + Some(cleaned) +} + +fn has_curated_literal_boundary(entity: &PipelineEntity) -> bool { + matches!( + entity.source, + DetectionSource::DenyList | DetectionSource::Gazetteer + ) && entity.label != "person" + && entity.source_detail != Some(SourceDetail::GazetteerExtension) + && entity + .text + .chars() + .next() + .into_iter() + .chain(entity.text.chars().next_back()) + .any(is_literal_boundary_punct) +} + +fn is_leading_trim(ch: char, label: &str) -> bool { + if label_allows_colon(label) { + matches!( + ch, + ',' | ';' | '"' | '\'' | '“' | 
'”' | '‘' | '’' | '«' | '¿' | '¡' + ) + } else { + matches!( + ch, + ',' | ';' | ':' | '"' | '\'' | '“' | '”' | '‘' | '’' | '«' | '¿' | '¡' + ) + } +} + +fn is_trailing_trim(ch: char, label: &str) -> bool { + if label_allows_colon(label) { + matches!( + ch, + ',' | ';' | '"' | '\'' | '“' | '”' | '‘' | '’' | '»' | '!' | '?' + ) + } else { + matches!( + ch, + ',' | ';' | ':' | '"' | '\'' | '“' | '”' | '‘' | '’' | '»' | '!' | '?' + ) + } +} + +const fn is_literal_boundary_punct(ch: char) -> bool { + matches!( + ch, + '"' + | '\'' + | '“' + | '”' + | '„' + | '‟' + | '‘' + | '’' + | '‛' + | '«' + | '»' + | '!' + | '.' + ) +} + +fn should_strip_period( + entity: &PipelineEntity, + start_byte: usize, + end_byte: usize, +) -> bool { + if !matches!( + entity.label.as_str(), + "organization" | "location" | "address" + ) { + return false; + } + let Some(text) = entity.text.get(start_byte..end_byte) else { + return false; + }; + if !text.ends_with('.') || known_period_suffix(text) { + return false; + } + if entity.label == "address" && known_address_final_abbrev(text) { + return false; + } + !(entity.label == "location" && known_location_final_abbrev(text)) +} + +fn known_period_suffix(text: &str) -> bool { + LEGAL_PERIOD_SUFFIXES + .lines() + .any(|suffix| text.ends_with(suffix)) +} + +fn known_address_final_abbrev(text: &str) -> bool { + ADDRESS_FINAL_ABBREVS.lines().any(|suffix| { + text + .strip_suffix(suffix) + .is_some_and(|prefix| prefix.ends_with(char::is_whitespace)) + }) +} + +fn known_location_final_abbrev(text: &str) -> bool { + text.ends_with("D.C.") + || text + .split_whitespace() + .next_back() + .is_some_and(|token| token.chars().filter(|ch| *ch == '.').count() >= 2) +} + +fn label_allows_colon(label: &str) -> bool { + matches!(label, "ip address" | "mac address") +} + +fn collapse_display_whitespace(text: &str) -> String { + let mut output = String::new(); + let mut in_whitespace = false; + + for ch in text.chars() { + if ch.is_whitespace() { + if 
!in_whitespace { + output.push(' '); + in_whitespace = true; + } + continue; + } + + output.push(ch); + in_whitespace = false; + } + + output +} + +fn first_char(text: &str) -> Option<(char, usize)> { + text.chars().next().map(|ch| (ch, ch.len_utf8())) +} + +fn last_char(text: &str) -> Option<(char, usize)> { + text.chars().next_back().map(|ch| (ch, ch.len_utf8())) +} diff --git a/crates/anonymize-core/src/resolution/types.rs b/crates/anonymize-core/src/resolution/types.rs new file mode 100644 index 00000000..606b9683 --- /dev/null +++ b/crates/anonymize-core/src/resolution/types.rs @@ -0,0 +1,90 @@ +use crate::types::EntityKind; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum DetectionSource { + Trigger, + Regex, + DenyList, + LegalForm, + Gazetteer, + Country, + Ner, + Coreference, +} + +impl DetectionSource { + pub(crate) const fn priority(self) -> u8 { + match self { + Self::Gazetteer => 5, + Self::Trigger => 4, + Self::LegalForm | Self::Regex | Self::Country => 3, + Self::DenyList | Self::Coreference => 2, + Self::Ner => 1, + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum SourceDetail { + CustomDenyList, + CustomRegex, + GazetteerExtension, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct PipelineEntity { + pub start: u32, + pub end: u32, + pub label: String, + pub text: String, + pub score: f64, + pub source: DetectionSource, + pub source_detail: Option, + pub kind: EntityKind, +} + +impl PipelineEntity { + #[must_use] + pub fn detected( + start: u32, + end: u32, + label: impl Into, + text: impl Into, + score: f64, + source: DetectionSource, + ) -> Self { + Self { + start, + end, + label: label.into(), + text: text.into(), + score, + source, + source_detail: None, + kind: EntityKind::Detected, + } + } + + #[must_use] + pub fn coreference( + start: u32, + end: u32, + label: impl Into, + text: impl Into, + score: f64, + source_text: impl Into, + ) -> Self { + Self { + start, + end, + label: label.into(), + text: text.into(), + 
score, + source: DetectionSource::Coreference, + source_detail: None, + kind: EntityKind::Coreference { + source_text: source_text.into(), + }, + } + } +} From 8067ca60773ee7ca3a4d876d6a03c8dd9c0c7af5 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 10:40:51 +0200 Subject: [PATCH 011/130] feat: add core match processors --- crates/anonymize-core/src/lib.rs | 5 + crates/anonymize-core/src/processors.rs | 291 ++++++++++++++++++++++ crates/anonymize-core/src/types.rs | 9 + crates/anonymize-core/tests/processors.rs | 197 +++++++++++++++ 4 files changed, 502 insertions(+) create mode 100644 crates/anonymize-core/src/processors.rs create mode 100644 crates/anonymize-core/tests/processors.rs diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index 8d3989c3..d04be26c 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -4,6 +4,7 @@ pub(crate) mod normalize; mod placeholders; +mod processors; mod redact; mod resolution; mod search; @@ -11,6 +12,10 @@ mod types; pub(crate) mod utf16; pub use placeholders::build_placeholder_map; +pub use processors::{ + CountryMatchData, GazetteerMatchData, PatternSlice, RegexMatchMeta, + process_country_matches, process_gazetteer_matches, process_regex_matches, +}; pub use redact::{deanonymise, redact_text}; pub use resolution::{ DetectionSource, PipelineEntity, SourceDetail, enforce_boundary_consistency, diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs new file mode 100644 index 00000000..89782832 --- /dev/null +++ b/crates/anonymize-core/src/processors.rs @@ -0,0 +1,291 @@ +use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; +use crate::types::{Error, Result, SearchMatch}; +use crate::utf16::Utf16Offsets; + +const MIN_PHONE_LENGTH: usize = 7; +const GAZETTEER_EXACT_SCORE: f64 = 0.9; +const GAZETTEER_FUZZY_SCORE: f64 = 0.85; +const COUNTRY_SCORE: f64 = 0.95; +const MAX_GAZETTEER_PREFIX_OVERSHOOT: 
u32 = 7; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct PatternSlice { + pub start: u32, + pub end: u32, +} + +impl PatternSlice { + #[must_use] + pub const fn contains(self, pattern: u32) -> bool { + pattern >= self.start && pattern < self.end + } + + fn local_index(self, pattern: u32) -> Option { + if !self.contains(pattern) { + return None; + } + usize::try_from(pattern.saturating_sub(self.start)).ok() + } +} + +#[derive(Clone, Debug, PartialEq)] +pub struct RegexMatchMeta { + pub label: String, + pub score: f64, + pub source_detail: Option, + pub requires_validation: bool, +} + +impl RegexMatchMeta { + #[must_use] + pub fn new(label: impl Into, score: f64) -> Self { + Self { + label: label.into(), + score, + source_detail: None, + requires_validation: false, + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct GazetteerMatchData { + pub labels: Vec, + pub is_fuzzy: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct CountryMatchData { + pub labels: Vec, +} + +pub fn process_regex_matches( + matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + meta: &[RegexMatchMeta], +) -> Result> { + let offsets = Utf16Offsets::new(full_text); + let mut results = Vec::new(); + + for found in matches { + let pattern = found.pattern(); + let Some(local_index) = slice.local_index(pattern) else { + continue; + }; + let Some(entry) = meta.get(local_index) else { + continue; + }; + if entry.requires_validation { + return Err(Error::UnsupportedRegexValidation { pattern }); + } + + let text = offsets.slice(full_text, found.start(), found.end())?; + if entry.source_detail != Some(SourceDetail::CustomRegex) + && entry.label == "phone number" + && text.encode_utf16().count() < MIN_PHONE_LENGTH + { + continue; + } + + let mut entity = PipelineEntity::detected( + found.start(), + found.end(), + entry.label.clone(), + text, + entry.score, + DetectionSource::Regex, + ); + entity.source_detail = entry.source_detail; + results.push(entity); + 
} + + Ok(results) +} + +pub fn process_gazetteer_matches( + matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + data: &GazetteerMatchData, +) -> Result> { + let offsets = Utf16Offsets::new(full_text); + let mut results = Vec::new(); + let mut exact_spans = Vec::<(u32, u32)>::new(); + + for found in matches { + let Some(local_index) = slice.local_index(found.pattern()) else { + continue; + }; + if data.is_fuzzy.get(local_index).copied().unwrap_or(false) { + continue; + } + + let Some(label) = data.labels.get(local_index) else { + continue; + }; + let extended = try_gazetteer_prefix_extension(full_text, &offsets, found)?; + let (end, text, source_detail) = if let Some(extension) = extended { + extension + } else { + ( + found.end(), + offsets.slice(full_text, found.start(), found.end())?, + None, + ) + }; + + exact_spans.push((found.start(), end)); + let mut entity = PipelineEntity::detected( + found.start(), + end, + label.clone(), + text, + GAZETTEER_EXACT_SCORE, + DetectionSource::Gazetteer, + ); + entity.source_detail = source_detail; + results.push(entity); + } + + for found in matches { + let Some(local_index) = slice.local_index(found.pattern()) else { + continue; + }; + if !data.is_fuzzy.get(local_index).copied().unwrap_or(false) { + continue; + } + if fuzzy_distance(found) == Some(0) { + continue; + } + + let Some(label) = data.labels.get(local_index) else { + continue; + }; + if exact_spans + .iter() + .any(|(start, end)| found.start() < *end && found.end() > *start) + { + continue; + } + + results.push(PipelineEntity::detected( + found.start(), + found.end(), + label.clone(), + offsets.slice(full_text, found.start(), found.end())?, + GAZETTEER_FUZZY_SCORE, + DetectionSource::Gazetteer, + )); + } + + Ok(results) +} + +pub fn process_country_matches( + matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + data: &CountryMatchData, +) -> Result> { + let offsets = Utf16Offsets::new(full_text); + let mut results = Vec::new(); + + 
for found in matches { + let Some(local_index) = slice.local_index(found.pattern()) else { + continue; + }; + let Some(label) = data.labels.get(local_index) else { + continue; + }; + if !starts_as_proper_noun(full_text, &offsets, found.start())? { + continue; + } + + results.push(PipelineEntity::detected( + found.start(), + found.end(), + label.clone(), + offsets.slice(full_text, found.start(), found.end())?, + COUNTRY_SCORE, + DetectionSource::Country, + )); + } + + Ok(results) +} + +fn try_gazetteer_prefix_extension( + full_text: &str, + offsets: &Utf16Offsets, + found: &SearchMatch, +) -> Result)>> { + let full_len = offsets.len()?; + let max_end = found + .end() + .saturating_add(MAX_GAZETTEER_PREFIX_OVERSHOOT) + .min(full_len); + if max_end <= found.end().saturating_add(1) { + return Ok(None); + } + + let after = offsets.slice(full_text, found.end(), max_end)?; + if !after.starts_with(' ') { + return Ok(None); + } + + let suffix_end = next_space_offset_after_initial(&after); + if suffix_end <= 1 { + return Ok(None); + } + + let new_end = found.end().saturating_add(suffix_end); + Ok(Some(( + new_end, + offsets.slice(full_text, found.start(), new_end)?, + Some(SourceDetail::GazetteerExtension), + ))) +} + +fn next_space_offset_after_initial(text: &str) -> u32 { + let mut offset = 0_u32; + + for ch in text.chars() { + let width = u32::try_from(ch.len_utf16()).unwrap_or(u32::MAX); + if offset > 0 && ch == ' ' { + return offset; + } + offset = offset.saturating_add(width); + } + + offset +} + +fn starts_as_proper_noun( + full_text: &str, + offsets: &Utf16Offsets, + start: u32, +) -> Result { + let start_byte = offsets.validate_offset(start)?; + let Some(ch) = full_text + .get(start_byte..) 
+ .and_then(|tail| tail.chars().next()) + else { + return Ok(false); + }; + + let upper = ch.to_uppercase().to_string(); + let lower = ch.to_lowercase().to_string(); + if upper == lower { + return Ok(true); + } + + Ok(ch.to_string() == upper) +} + +const fn fuzzy_distance(found: &SearchMatch) -> Option { + let SearchMatch::Fuzzy { distance, .. } = found else { + return None; + }; + Some(*distance) +} diff --git a/crates/anonymize-core/src/types.rs b/crates/anonymize-core/src/types.rs index cd194cd2..95f97a7d 100644 --- a/crates/anonymize-core/src/types.rs +++ b/crates/anonymize-core/src/types.rs @@ -26,6 +26,9 @@ pub enum Error { PatternIndexOutOfRange { index: usize, }, + UnsupportedRegexValidation { + pattern: u32, + }, } impl fmt::Display for Error { @@ -55,6 +58,12 @@ impl fmt::Display for Error { Self::PatternIndexOutOfRange { index } => { write!(formatter, "Search pattern index exceeds u32 range: {index}") } + Self::UnsupportedRegexValidation { pattern } => { + write!( + formatter, + "Regex pattern {pattern} requires validation that is not available in core" + ) + } } } } diff --git a/crates/anonymize-core/tests/processors.rs b/crates/anonymize-core/tests/processors.rs new file mode 100644 index 00000000..53774c56 --- /dev/null +++ b/crates/anonymize-core/tests/processors.rs @@ -0,0 +1,197 @@ +#![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] + +use stella_anonymize_core::{ + CountryMatchData, DetectionSource, GazetteerMatchData, PatternSlice, + PipelineEntity, RegexMatchMeta, SearchMatch, SourceDetail, + process_country_matches, process_gazetteer_matches, process_regex_matches, +}; + +#[test] +fn regex_processor_filters_slice_and_short_phone_matches() { + let matches = vec![ + SearchMatch::Regex { + pattern: 0, + start: 0, + end: 5, + }, + SearchMatch::Regex { + pattern: 1, + start: 13, + end: 18, + }, + SearchMatch::Regex { + pattern: 2, + start: 20, + end: 32, + }, + ]; + let meta = vec![ + RegexMatchMeta::new("person", 0.8), + 
RegexMatchMeta::new("phone number", 0.8), + ]; + + let entities = process_regex_matches( + &matches, + PatternSlice { start: 0, end: 2 }, + "Alice called 12345 then 123456789012", + &meta, + ) + .unwrap(); + + assert_eq!( + entities, + vec![PipelineEntity::detected( + 0, + 5, + "person", + "Alice", + 0.8, + DetectionSource::Regex + )] + ); +} + +#[test] +fn regex_processor_rejects_unported_validators() { + let matches = vec![SearchMatch::Regex { + pattern: 7, + start: 0, + end: 5, + }]; + let meta = vec![RegexMatchMeta { + label: String::from("tax identification number"), + score: 0.9, + source_detail: None, + requires_validation: true, + }]; + + let err = process_regex_matches( + &matches, + PatternSlice { start: 7, end: 8 }, + "12345", + &meta, + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Regex pattern 7 requires validation that is not available in core" + ); +} + +#[test] +fn regex_processor_preserves_custom_regex_source_detail() { + let matches = vec![SearchMatch::Regex { + pattern: 0, + start: 0, + end: 5, + }]; + let meta = vec![RegexMatchMeta { + label: String::from("matter id"), + score: 0.7, + source_detail: Some(SourceDetail::CustomRegex), + requires_validation: false, + }]; + + let entities = process_regex_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + "A-123", + &meta, + ) + .unwrap(); + + assert_eq!(entities[0].source_detail, Some(SourceDetail::CustomRegex)); +} + +#[test] +fn gazetteer_processor_extends_exact_matches_and_drops_overlapping_fuzzy() { + let matches = vec![ + SearchMatch::Literal { + pattern: 10, + start: 0, + end: 4, + }, + SearchMatch::Fuzzy { + pattern: 11, + start: 0, + end: 4, + distance: 1, + }, + ]; + let data = GazetteerMatchData { + labels: vec![String::from("organization"), String::from("organization")], + is_fuzzy: vec![false, true], + }; + + let entities = process_gazetteer_matches( + &matches, + PatternSlice { start: 10, end: 12 }, + "Acme s.r.o. 
signed", + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Acme s.r.o."); + assert_eq!( + entities[0].source_detail, + Some(SourceDetail::GazetteerExtension) + ); +} + +#[test] +fn gazetteer_processor_emits_non_overlapping_fuzzy_matches() { + let matches = vec![SearchMatch::Fuzzy { + pattern: 2, + start: 10, + end: 15, + distance: 1, + }]; + let data = GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![true], + }; + + let entities = process_gazetteer_matches( + &matches, + PatternSlice { start: 2, end: 3 }, + "Signed by Akmee today", + &data, + ) + .unwrap(); + + assert_eq!(entities[0].text, "Akmee"); + assert_eq!(entities[0].score.to_bits(), 0.85_f64.to_bits()); +} + +#[test] +fn country_processor_requires_uppercase_letter_start() { + let matches = vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 6, + }, + SearchMatch::Literal { + pattern: 0, + start: 11, + end: 17, + }, + ]; + let data = CountryMatchData { + labels: vec![String::from("country")], + }; + + let entities = process_country_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + "turkey and Turkey", + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Turkey"); + assert_eq!(entities[0].source, DetectionSource::Country); +} From 6c3b7e21515679d37a650851b923c53cb9b025fe Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 10:46:09 +0200 Subject: [PATCH 012/130] feat: add core search normalization --- crates/anonymize-core/src/lib.rs | 1 + crates/anonymize-core/src/normalize.rs | 29 ++++++++++++++++++++++++ crates/anonymize-core/tests/normalize.rs | 20 ++++++++++++++++ 3 files changed, 50 insertions(+) create mode 100644 crates/anonymize-core/tests/normalize.rs diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index d04be26c..4e8cd601 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -11,6 
+11,7 @@ mod search; mod types; pub(crate) mod utf16; +pub use normalize::normalize_for_search; pub use placeholders::build_placeholder_map; pub use processors::{ CountryMatchData, GazetteerMatchData, PatternSlice, RegexMatchMeta, diff --git a/crates/anonymize-core/src/normalize.rs b/crates/anonymize-core/src/normalize.rs index 55ea6f6b..99b92a8b 100644 --- a/crates/anonymize-core/src/normalize.rs +++ b/crates/anonymize-core/src/normalize.rs @@ -2,6 +2,26 @@ const PHONE_NOISE: [char; 3] = ['(', ')', '-']; const ID_SEPARATORS: [char; 3] = ['-', '/', '.']; const IDENTIFIER_CUES: &str = include_str!("../data/identifier-cues.txt"); +#[must_use] +pub fn normalize_for_search(text: &str) -> String { + let mut has_replacement = false; + for ch in text.chars() { + if replacement_char(ch) != ch { + has_replacement = true; + break; + } + } + if !has_replacement { + return text.to_owned(); + } + + let mut output = String::with_capacity(text.len()); + for ch in text.chars() { + output.push(replacement_char(ch)); + } + output +} + // Normalization decides placeholder identity. 
pub(crate) fn label_key(label: &str) -> String { let uppercase = uppercase(label); @@ -407,3 +427,12 @@ fn contains_word(text: &str, word: &str) -> bool { false } + +const fn replacement_char(ch: char) -> char { + match ch { + '\u{00a0}' | '\u{2007}' | '\u{202f}' => ' ', + '\u{2013}' | '\u{2014}' => '-', + '\u{201c}' | '\u{201d}' => '"', + _ => ch, + } +} diff --git a/crates/anonymize-core/tests/normalize.rs b/crates/anonymize-core/tests/normalize.rs new file mode 100644 index 00000000..7d24ee4c --- /dev/null +++ b/crates/anonymize-core/tests/normalize.rs @@ -0,0 +1,20 @@ +use stella_anonymize_core::normalize_for_search; + +#[test] +fn normalize_for_search_matches_ts_replacements() { + assert_eq!(normalize_for_search("hello\u{00a0}world"), "hello world"); + assert_eq!(normalize_for_search("1\u{2007}000"), "1 000"); + assert_eq!(normalize_for_search("a\u{202f}b"), "a b"); + assert_eq!(normalize_for_search("2020\u{2013}2024"), "2020-2024"); + assert_eq!(normalize_for_search("a\u{2014}b"), "a-b"); + assert_eq!(normalize_for_search("\u{201c}hello\u{201d}"), "\"hello\""); +} + +#[test] +fn normalize_for_search_preserves_utf16_width() { + let input = "a\u{00a0}\u{1f600}\u{2013}b"; + let output = normalize_for_search(input); + + assert_eq!(output, "a \u{1f600}-b"); + assert_eq!(output.encode_utf16().count(), input.encode_utf16().count()); +} From 9c534abee11621c7b2e15f5badd18a7919794d21 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 10:46:09 +0200 Subject: [PATCH 013/130] feat: support literal pattern options --- crates/anonymize-core/src/search.rs | 124 +++++++++++++++++++------- crates/anonymize-core/tests/search.rs | 82 +++++++++++++++++ 2 files changed, 176 insertions(+), 30 deletions(-) diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs index 8b2f2f11..4b165196 100644 --- a/crates/anonymize-core/src/search.rs +++ b/crates/anonymize-core/src/search.rs @@ -1,3 +1,4 @@ +use std::collections::BTreeMap; use 
stella_aho_corasick_core as literal_core; use stella_fuzzy_search_core as fuzzy_core; use stella_regex_set_core as regex_core; @@ -8,6 +9,11 @@ use crate::types::{Error, Result, SearchEngine, SearchMatch}; #[derive(Clone, Debug, Eq, PartialEq)] pub enum SearchPattern { Literal(String), + LiteralWithOptions { + pattern: String, + case_insensitive: Option, + whole_words: Option, + }, Regex(String), Fuzzy { pattern: String, @@ -22,7 +28,7 @@ pub struct SearchOptions { pub fuzzy: FuzzySearchOptions, } -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, Default, Eq, Ord, PartialEq, PartialOrd)] pub struct LiteralSearchOptions { pub case_insensitive: bool, pub whole_words: bool, @@ -51,21 +57,27 @@ impl Default for FuzzySearchOptions { } pub struct SearchIndex { - literal: Option, - literal_pattern_indexes: Vec, + literal: Vec, regex: Option, regex_pattern_indexes: Vec, fuzzy: Option, fuzzy_pattern_indexes: Vec, } +struct LiteralSlot { + engine: literal_core::AhoCorasick, + pattern_indexes: Vec, +} + +type LiteralPatternGroup = (Vec, Vec); + impl SearchIndex { pub fn new( patterns: Vec, options: SearchOptions, ) -> Result { - let mut literal_patterns = Vec::::new(); - let mut literal_pattern_indexes = Vec::::new(); + let mut literal_groups = + BTreeMap::::new(); let mut regex_patterns = Vec::::new(); let mut regex_pattern_indexes = Vec::::new(); let mut fuzzy_patterns = Vec::::new(); @@ -75,8 +87,28 @@ impl SearchIndex { let pattern_index = pattern_index(index)?; match entry { SearchPattern::Literal(value) => { - literal_patterns.push(value); - literal_pattern_indexes.push(pattern_index); + push_literal_pattern( + &mut literal_groups, + options.literal, + value, + pattern_index, + ); + } + SearchPattern::LiteralWithOptions { + pattern, + case_insensitive, + whole_words, + } => { + push_literal_pattern( + &mut literal_groups, + LiteralSearchOptions { + case_insensitive: case_insensitive + .unwrap_or(options.literal.case_insensitive), + 
whole_words: whole_words.unwrap_or(options.literal.whole_words), + }, + pattern, + pattern_index, + ); } SearchPattern::Regex(value) => { regex_patterns.push(value); @@ -95,13 +127,12 @@ impl SearchIndex { } } - let literal = build_literal(literal_patterns, options)?; - let regex = build_regex(regex_patterns, options)?; - let fuzzy = build_fuzzy(fuzzy_patterns, options)?; + let literal = build_literal_slots(literal_groups)?; + let regex = build_regex(regex_patterns, options.regex)?; + let fuzzy = build_fuzzy(fuzzy_patterns, options.fuzzy)?; Ok(Self { literal, - literal_pattern_indexes, regex, regex_pattern_indexes, fuzzy, @@ -112,13 +143,14 @@ impl SearchIndex { pub fn find_iter(&self, haystack: &str) -> Result> { let mut matches = Vec::new(); - if let Some(literal) = &self.literal { + for slot in &self.literal { // Downstream merge priority chooses among overlaps. extend_triple_matches( &mut matches, SearchEngine::Literal, - &self.literal_pattern_indexes, - &literal + &slot.pattern_indexes, + &slot + .engine .find_overlapping_iter_packed(haystack) .map_err(|err| Error::Search { engine: SearchEngine::Literal, @@ -175,13 +207,17 @@ impl SearchIndex { } pub fn is_match(&self, haystack: &str) -> Result { - if let Some(literal) = &self.literal - && literal.is_match(haystack).map_err(|err| Error::Search { - engine: SearchEngine::Literal, - reason: err.to_string(), - })? - { - return Ok(true); + for slot in &self.literal { + if slot + .engine + .is_match(haystack) + .map_err(|err| Error::Search { + engine: SearchEngine::Literal, + reason: err.to_string(), + })? 
+ { + return Ok(true); + } } if let Some(regex) = &self.regex @@ -203,9 +239,37 @@ impl SearchIndex { } } +fn push_literal_pattern( + groups: &mut BTreeMap, + options: LiteralSearchOptions, + pattern: String, + pattern_index: u32, +) { + let (patterns, pattern_indexes) = groups.entry(options).or_default(); + patterns.push(pattern); + pattern_indexes.push(pattern_index); +} + +fn build_literal_slots( + groups: BTreeMap, +) -> Result> { + let mut slots = Vec::new(); + + for (options, (patterns, pattern_indexes)) in groups { + if let Some(engine) = build_literal(patterns, options)? { + slots.push(LiteralSlot { + engine, + pattern_indexes, + }); + } + } + + Ok(slots) +} + fn build_literal( patterns: Vec, - options: SearchOptions, + options: LiteralSearchOptions, ) -> Result> { if patterns.is_empty() { return Ok(None); @@ -215,9 +279,9 @@ fn build_literal( patterns, literal_core::Options { match_kind: literal_core::MatchKind::LeftmostFirst, - case_insensitive: options.literal.case_insensitive, + case_insensitive: options.case_insensitive, dfa: false, - whole_words: options.literal.whole_words, + whole_words: options.whole_words, }, ) .map(Some) @@ -229,7 +293,7 @@ fn build_literal( fn build_regex( patterns: Vec, - options: SearchOptions, + options: RegexSearchOptions, ) -> Result> { if patterns.is_empty() { return Ok(None); @@ -238,7 +302,7 @@ fn build_regex( regex_core::RegexSet::new( patterns, regex_core::Options { - whole_words: options.regex.whole_words, + whole_words: options.whole_words, unicode_boundaries: true, }, ) @@ -251,7 +315,7 @@ fn build_regex( fn build_fuzzy( patterns: Vec, - options: SearchOptions, + options: FuzzySearchOptions, ) -> Result> { if patterns.is_empty() { return Ok(None); @@ -261,10 +325,10 @@ fn build_fuzzy( patterns, fuzzy_core::Options { metric: fuzzy_core::Metric::Levenshtein, - normalize_diacritics: options.fuzzy.normalize_diacritics, + normalize_diacritics: options.normalize_diacritics, unicode_boundaries: true, - whole_words: 
options.fuzzy.whole_words, - case_insensitive: options.fuzzy.case_insensitive, + whole_words: options.whole_words, + case_insensitive: options.case_insensitive, }, ) .map(Some) diff --git a/crates/anonymize-core/tests/search.rs b/crates/anonymize-core/tests/search.rs index 52eb8e50..a8e37f18 100644 --- a/crates/anonymize-core/tests/search.rs +++ b/crates/anonymize-core/tests/search.rs @@ -127,6 +127,88 @@ fn search_index_returns_overlapping_literal_matches() { ); } +#[test] +fn search_index_supports_per_pattern_literal_word_boundaries() { + let index = SearchIndex::new( + vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("he"), + case_insensitive: None, + whole_words: Some(true), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("s.r.o."), + case_insensitive: None, + whole_words: Some(false), + }, + ], + SearchOptions::default(), + ) + .unwrap(); + + let matches = index.find_iter("shell Acme s.r.o. he").unwrap(); + + assert_eq!( + matches, + vec![ + SearchMatch::Literal { + pattern: 1, + start: 11, + end: 17, + }, + SearchMatch::Literal { + pattern: 0, + start: 18, + end: 20, + }, + ] + ); +} + +#[test] +fn search_index_supports_per_pattern_literal_case_sensitivity() { + let index = SearchIndex::new( + vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("alice"), + case_insensitive: Some(true), + whole_words: None, + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("bob"), + case_insensitive: Some(false), + whole_words: None, + }, + ], + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: true, + }, + ..SearchOptions::default() + }, + ) + .unwrap(); + + let matches = index.find_iter("Alice Bob bob").unwrap(); + + assert_eq!( + matches, + vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 5, + }, + SearchMatch::Literal { + pattern: 1, + start: 10, + end: 13, + }, + ] + ); +} + #[test] fn search_index_reports_match_presence_across_engines() { let 
index = SearchIndex::new( From 7af815bcb4fccc28cfd39342674ce17fa27acfc0 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 10:50:05 +0200 Subject: [PATCH 014/130] feat: add prepared core search --- crates/anonymize-core/src/lib.rs | 5 + crates/anonymize-core/src/prepared.rs | 140 ++++++++++++++++++++++++ crates/anonymize-core/src/processors.rs | 2 +- crates/anonymize-core/tests/prepared.rs | 118 ++++++++++++++++++++ 4 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 crates/anonymize-core/src/prepared.rs create mode 100644 crates/anonymize-core/tests/prepared.rs diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index 4e8cd601..75315cec 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -4,6 +4,7 @@ pub(crate) mod normalize; mod placeholders; +mod prepared; mod processors; mod redact; mod resolution; @@ -13,6 +14,10 @@ pub(crate) mod utf16; pub use normalize::normalize_for_search; pub use placeholders::build_placeholder_map; +pub use prepared::{ + PreparedSearch, PreparedSearchConfig, PreparedSearchMatches, + PreparedSearchSlices, StaticDetectionResult, +}; pub use processors::{ CountryMatchData, GazetteerMatchData, PatternSlice, RegexMatchMeta, process_country_matches, process_gazetteer_matches, process_regex_matches, diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs new file mode 100644 index 00000000..2f9c9b0c --- /dev/null +++ b/crates/anonymize-core/src/prepared.rs @@ -0,0 +1,140 @@ +use crate::normalize::normalize_for_search; +use crate::processors::{ + CountryMatchData, GazetteerMatchData, PatternSlice, RegexMatchMeta, + process_country_matches, process_gazetteer_matches, process_regex_matches, +}; +use crate::resolution::PipelineEntity; +use crate::search::{SearchIndex, SearchOptions, SearchPattern}; +use crate::types::{Result, SearchMatch}; + +pub struct PreparedSearch { + regex: SearchIndex, + custom_regex: 
SearchIndex, + literals: SearchIndex, + slices: PreparedSearchSlices, + regex_meta: Vec, + custom_regex_meta: Vec, + gazetteer_data: Option, + country_data: Option, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct PreparedSearchSlices { + pub regex: PatternSlice, + pub custom_regex: PatternSlice, + pub legal_forms: PatternSlice, + pub triggers: PatternSlice, + pub deny_list: PatternSlice, + pub street_types: PatternSlice, + pub gazetteer: PatternSlice, + pub countries: PatternSlice, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct PreparedSearchConfig { + pub regex_patterns: Vec, + pub custom_regex_patterns: Vec, + pub literal_patterns: Vec, + pub regex_options: SearchOptions, + pub custom_regex_options: SearchOptions, + pub literal_options: SearchOptions, + pub slices: PreparedSearchSlices, + pub regex_meta: Vec, + pub custom_regex_meta: Vec, + pub gazetteer_data: Option, + pub country_data: Option, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct PreparedSearchMatches { + pub regex: Vec, + pub custom_regex: Vec, + pub literal: Vec, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct StaticDetectionResult { + pub matches: PreparedSearchMatches, + pub regex_entities: Vec, + pub custom_regex_entities: Vec, + pub gazetteer_entities: Vec, + pub country_entities: Vec, +} + +impl PreparedSearch { + pub fn new(config: PreparedSearchConfig) -> Result { + Ok(Self { + regex: SearchIndex::new(config.regex_patterns, config.regex_options)?, + custom_regex: SearchIndex::new( + config.custom_regex_patterns, + config.custom_regex_options, + )?, + literals: SearchIndex::new( + config.literal_patterns, + config.literal_options, + )?, + slices: config.slices, + regex_meta: config.regex_meta, + custom_regex_meta: config.custom_regex_meta, + gazetteer_data: config.gazetteer_data, + country_data: config.country_data, + }) + } + + pub fn find_matches(&self, full_text: &str) -> Result { + let normalized = normalize_for_search(full_text); + + 
Ok(PreparedSearchMatches { + regex: self.regex.find_iter(full_text)?, + custom_regex: self.custom_regex.find_iter(full_text)?, + literal: self.literals.find_iter(&normalized)?, + }) + } + + pub fn detect_static_entities( + &self, + full_text: &str, + ) -> Result { + let matches = self.find_matches(full_text)?; + let regex_entities = process_regex_matches( + &matches.regex, + self.slices.regex, + full_text, + &self.regex_meta, + )?; + let custom_regex_entities = process_regex_matches( + &matches.custom_regex, + self.slices.custom_regex, + full_text, + &self.custom_regex_meta, + )?; + let gazetteer_entities = if let Some(data) = &self.gazetteer_data { + process_gazetteer_matches( + &matches.literal, + self.slices.gazetteer, + full_text, + data, + )? + } else { + Vec::new() + }; + let country_entities = if let Some(data) = &self.country_data { + process_country_matches( + &matches.literal, + self.slices.countries, + full_text, + data, + )? + } else { + Vec::new() + }; + + Ok(StaticDetectionResult { + matches, + regex_entities, + custom_regex_entities, + gazetteer_entities, + country_entities, + }) + } +} diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index 89782832..01d54f90 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -8,7 +8,7 @@ const GAZETTEER_FUZZY_SCORE: f64 = 0.85; const COUNTRY_SCORE: f64 = 0.95; const MAX_GAZETTEER_PREFIX_OVERSHOOT: u32 = 7; -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] pub struct PatternSlice { pub start: u32, pub end: u32, diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs new file mode 100644 index 00000000..7da9febe --- /dev/null +++ b/crates/anonymize-core/tests/prepared.rs @@ -0,0 +1,118 @@ +#![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] + +use stella_anonymize_core::{ + CountryMatchData, DetectionSource, 
FuzzySearchOptions, GazetteerMatchData, + LiteralSearchOptions, PatternSlice, PreparedSearch, PreparedSearchConfig, + PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchOptions, + SearchPattern, SourceDetail, +}; + +#[test] +fn prepared_search_runs_normalized_literal_pass() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![], + custom_regex_patterns: vec![], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Acme Corp"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions::default(), + slices: PreparedSearchSlices { + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![], + custom_regex_meta: vec![], + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![false], + }), + country_data: None, + }) + .unwrap(); + + let result = prepared + .detect_static_entities("Acme\u{00a0}Corp. 
signed") + .unwrap(); + + assert_eq!(result.gazetteer_entities.len(), 1); + assert_eq!(result.gazetteer_entities[0].text, "Acme\u{00a0}Corp"); +} + +#[test] +fn prepared_search_emits_static_detector_entities() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\b[A-Z]{2}\d{4}\b", + ))], + custom_regex_patterns: vec![SearchPattern::Regex(String::from( + r"\bMAT-\d{3}\b", + ))], + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("Acme"), + case_insensitive: Some(true), + whole_words: Some(false), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Turkey"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + regex_options: SearchOptions { + regex: RegexSearchOptions { whole_words: false }, + ..SearchOptions::default() + }, + custom_regex_options: SearchOptions { + regex: RegexSearchOptions { whole_words: false }, + ..SearchOptions::default() + }, + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + fuzzy: FuzzySearchOptions::default(), + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + custom_regex: PatternSlice { start: 0, end: 1 }, + gazetteer: PatternSlice { start: 0, end: 1 }, + countries: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], + custom_regex_meta: vec![RegexMatchMeta { + label: String::from("matter id"), + score: 1.0, + source_detail: Some(SourceDetail::CustomRegex), + requires_validation: false, + }], + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![false], + }), + country_data: Some(CountryMatchData { + labels: vec![String::from("country")], + }), + }) + .unwrap(); + + let result = prepared + .detect_static_entities("Acme 
s.r.o. filed AB1234 in Turkey under MAT-123") + .unwrap(); + + assert_eq!(result.regex_entities[0].label, "registration number"); + assert_eq!(result.custom_regex_entities[0].label, "matter id"); + assert_eq!( + result.custom_regex_entities[0].source_detail, + Some(SourceDetail::CustomRegex) + ); + assert_eq!(result.gazetteer_entities[0].text, "Acme s.r.o."); + assert_eq!(result.country_entities[0].source, DetectionSource::Country); +} From c792979f004c7a793fe04f112a180d3b7ec8ad81 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 10:54:16 +0200 Subject: [PATCH 015/130] feat: add static core redaction --- .../data/legal-period-suffixes.txt | 6 ++ crates/anonymize-core/src/lib.rs | 2 +- crates/anonymize-core/src/prepared.rs | 76 ++++++++++++++++++- crates/anonymize-core/tests/prepared.rs | 70 ++++++++++++++++- 4 files changed, 148 insertions(+), 6 deletions(-) diff --git a/crates/anonymize-core/data/legal-period-suffixes.txt b/crates/anonymize-core/data/legal-period-suffixes.txt index a4055448..81f05910 100644 --- a/crates/anonymize-core/data/legal-period-suffixes.txt +++ b/crates/anonymize-core/data/legal-period-suffixes.txt @@ -4,4 +4,10 @@ Corp. N.A. Kft. S.A. +a.s. a. s. +s.r.o. +spol. s r.o. +Pty Ltd. +Ltda. +S.a.s. 
diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index 75315cec..2e79c9c5 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -16,7 +16,7 @@ pub use normalize::normalize_for_search; pub use placeholders::build_placeholder_map; pub use prepared::{ PreparedSearch, PreparedSearchConfig, PreparedSearchMatches, - PreparedSearchSlices, StaticDetectionResult, + PreparedSearchSlices, StaticDetectionResult, StaticRedactionResult, }; pub use processors::{ CountryMatchData, GazetteerMatchData, PatternSlice, RegexMatchMeta, diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 2f9c9b0c..738399a5 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -3,9 +3,15 @@ use crate::processors::{ CountryMatchData, GazetteerMatchData, PatternSlice, RegexMatchMeta, process_country_matches, process_gazetteer_matches, process_regex_matches, }; -use crate::resolution::PipelineEntity; +use crate::redact::redact_text; +use crate::resolution::{ + PipelineEntity, enforce_boundary_consistency, merge_and_dedup, + sanitize_entities, +}; use crate::search::{SearchIndex, SearchOptions, SearchPattern}; -use crate::types::{Result, SearchMatch}; +use crate::types::{ + Entity, EntityKind, OperatorConfig, RedactionResult, Result, SearchMatch, +}; pub struct PreparedSearch { regex: SearchIndex, @@ -61,6 +67,13 @@ pub struct StaticDetectionResult { pub country_entities: Vec, } +#[derive(Clone, Debug, PartialEq)] +pub struct StaticRedactionResult { + pub detections: StaticDetectionResult, + pub resolved_entities: Vec, + pub redaction: RedactionResult, +} + impl PreparedSearch { pub fn new(config: PreparedSearchConfig) -> Result { Ok(Self { @@ -137,4 +150,63 @@ impl PreparedSearch { country_entities, }) } + + pub fn redact_static_entities( + &self, + full_text: &str, + operators: &OperatorConfig, + ) -> Result { + let detections = 
self.detect_static_entities(full_text)?; + let raw_entities = detections.all_entities(); + let merged = merge_and_dedup(&raw_entities); + let consistent = enforce_boundary_consistency(&merged, full_text)?; + let resolved_entities = sanitize_entities(&consistent); + let redaction_entities = resolved_entities + .iter() + .map(to_redaction_entity) + .collect::>(); + let redaction = redact_text(full_text, &redaction_entities, operators)?; + + Ok(StaticRedactionResult { + detections, + resolved_entities, + redaction, + }) + } +} + +impl StaticDetectionResult { + #[must_use] + pub fn all_entities(&self) -> Vec { + let capacity = self + .regex_entities + .len() + .saturating_add(self.custom_regex_entities.len()) + .saturating_add(self.gazetteer_entities.len()) + .saturating_add(self.country_entities.len()); + let mut entities = Vec::with_capacity(capacity); + entities.extend(self.regex_entities.iter().cloned()); + entities.extend(self.custom_regex_entities.iter().cloned()); + entities.extend(self.gazetteer_entities.iter().cloned()); + entities.extend(self.country_entities.iter().cloned()); + entities + } +} + +fn to_redaction_entity(entity: &PipelineEntity) -> Entity { + match &entity.kind { + EntityKind::Detected => Entity::detected( + entity.start, + entity.end, + entity.label.clone(), + entity.text.clone(), + ), + EntityKind::Coreference { source_text } => Entity::coreference( + entity.start, + entity.end, + entity.label.clone(), + entity.text.clone(), + source_text.clone(), + ), + } } diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 7da9febe..5c2fb4bb 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -2,9 +2,9 @@ use stella_anonymize_core::{ CountryMatchData, DetectionSource, FuzzySearchOptions, GazetteerMatchData, - LiteralSearchOptions, PatternSlice, PreparedSearch, PreparedSearchConfig, - PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchOptions, - 
SearchPattern, SourceDetail, + LiteralSearchOptions, OperatorConfig, PatternSlice, PreparedSearch, + PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, + RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, }; #[test] @@ -116,3 +116,67 @@ fn prepared_search_emits_static_detector_entities() { assert_eq!(result.gazetteer_entities[0].text, "Acme s.r.o."); assert_eq!(result.country_entities[0].source, DetectionSource::Country); } + +#[test] +fn prepared_search_redacts_static_entities_end_to_end() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\b[A-Z]{2}\d{4}\b", + ))], + custom_regex_patterns: vec![], + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("Acme"), + case_insensitive: Some(true), + whole_words: Some(false), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Turkey"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + regex_options: SearchOptions { + regex: RegexSearchOptions { whole_words: false }, + ..SearchOptions::default() + }, + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + gazetteer: PatternSlice { start: 0, end: 1 }, + countries: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], + custom_regex_meta: vec![], + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![false], + }), + country_data: Some(CountryMatchData { + labels: vec![String::from("country")], + }), + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Acme s.r.o. 
filed AB1234 in Turkey.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!( + result.redaction.redacted_text, + "[ORGANIZATION_1] filed [REGISTRATION_NUMBER_1] in [COUNTRY_1]." + ); + assert_eq!(result.redaction.entity_count, 3); + assert_eq!(result.resolved_entities.len(), 3); +} From 93cf0e6b7d5cdd0964a105dba8a63c4d1640272e Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 11:03:50 +0200 Subject: [PATCH 016/130] feat: add core language bindings --- Cargo.lock | 417 +++++++++++++++++++++++++++++++ Cargo.toml | 6 +- crates/anonymize-napi/Cargo.toml | 25 ++ crates/anonymize-napi/build.rs | 3 + crates/anonymize-napi/src/lib.rs | 389 ++++++++++++++++++++++++++++ crates/anonymize-py/Cargo.toml | 21 ++ crates/anonymize-py/src/lib.rs | 410 ++++++++++++++++++++++++++++++ 7 files changed, 1270 insertions(+), 1 deletion(-) create mode 100644 crates/anonymize-napi/Cargo.toml create mode 100644 crates/anonymize-napi/build.rs create mode 100644 crates/anonymize-napi/src/lib.rs create mode 100644 crates/anonymize-py/Cargo.toml create mode 100644 crates/anonymize-py/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 4f08eb01..5cf8d7d0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,33 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" +[[package]] +name = "bitflags" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "convert_case" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "affbf0190ed2caf063e3def54ff444b449371d55c58e513a95ab98eca50adb49" 
+dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "ctor" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01334b89b69ff726750c5ce5073fc8bd860e99aa9a8fc5ca11b04730e3aee97a" + [[package]] name = "fancy-regex" version = "0.18.0" @@ -37,12 +64,286 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "libloading" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "memchr" version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" +[[package]] +name = "napi" +version = "3.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b41bda2ac390efb5e8d22025d925ccc3f3807d8c1bea6d19b36127247c4b8f83" +dependencies = [ + "bitflags", + "ctor", + "futures", + 
"napi-build", + "napi-sys", + "nohash-hasher", + "rustc-hash", + "serde", + "serde_json", +] + +[[package]] +name = "napi-build" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9c366d2c8c60b86fa632df75f745509b52f9128f91a6bad4c796e44abb505e1" + +[[package]] +name = "napi-derive" +version = "3.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61d66f70256ad5aef58659966064471d0ad90e2897bc36a5a5e0389c85aabc1e" +dependencies = [ + "convert_case", + "ctor", + "napi-derive-backend", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "napi-derive-backend" +version = "5.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81b4b08f15eed7a2a20c3f4c6314013fc3ac890a3afa9892b594485299ebdb2d" +dependencies = [ + "convert_case", + "proc-macro2", + "quote", + "semver", + "syn", +] + +[[package]] +name = "napi-sys" +version = "3.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f5bcdf71abd3a50d00b49c1c2c75251cb3c913777d6139cd37dabc093a5e400" +dependencies = [ + "libloading", +] + +[[package]] +name = "nohash-hasher" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "proc-macro2" +version 
= "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd274650b21d4bfc26a0a47587962c1edb425f69287324355cd040c3ea66071c" +dependencies = [ + "libc", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", +] + +[[package]] +name = "pyo3-build-config" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5e2a7d2f0d013342f295c048ad19237add5154a55b1c5a254c0ec93d4109078" +dependencies = [ + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca85c467da1bbc8d866eea5deff9cf29ea5f7785054a17da36e65bda9c05845b" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ac53762fd065daa3194dd09337a38bd793a188100fd1a9304c4ab312d901771" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca3a1557399783172dc5bf39cfca835157732532cba56b71d2292161e53b362" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbc457d0c7a0759a614551b11a6409e5951f6c7537be1f1b7682b9ae9230368" +dependencies = [ + "proc-macro2", +] + [[package]] name = "regex" version = "1.12.4" @@ -72,6 +373,67 @@ version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" +[[package]] +name = "rustc-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + [[package]] name = "stella-aho-corasick-core" version = "1.0.4" @@ -90,6 +452,26 @@ dependencies = [ "stella-regex-set-core", ] +[[package]] +name = "stella-anonymize-napi" +version = "1.5.0" +dependencies = [ + "napi", + "napi-build", + "napi-derive", + "stella-anonymize-core", +] + +[[package]] +name 
= "stella-anonymize-py" +version = "1.5.0" +dependencies = [ + "pyo3", + "serde", + "serde_json", + "stella-anonymize-core", +] + [[package]] name = "stella-fuzzy-search-core" version = "1.1.2" @@ -112,6 +494,23 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "syn" +version = "2.0.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" + [[package]] name = "tinyvec" version = "1.11.0" @@ -133,6 +532,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4e9026503b74f3207a4c04e6bf4ea735daa8edf6c0bbfa044cae597bb947a9db" +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + [[package]] name = "unicode-normalization" version = "0.1.25" @@ -147,3 +552,15 @@ name = "unicode-segmentation" version = "1.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index 2bc63b08..502300e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,9 @@ [workspace] -members = ["crates/anonymize-core"] 
+members = [ + "crates/anonymize-core", + "crates/anonymize-napi", + "crates/anonymize-py", +] resolver = "3" [workspace.package] diff --git a/crates/anonymize-napi/Cargo.toml b/crates/anonymize-napi/Cargo.toml new file mode 100644 index 00000000..6cab0467 --- /dev/null +++ b/crates/anonymize-napi/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "stella-anonymize-napi" +version.workspace = true +edition.workspace = true +description = "Native bindings for Stella anonymization core" +license.workspace = true +publish.workspace = true +repository.workspace = true + +[lib] +crate-type = ["cdylib"] + +[dependencies] +napi = { version = "3", default-features = false, features = [ + "napi9", + "serde-json", +] } +napi-derive = "3" +stella-anonymize-core = { path = "../anonymize-core" } + +[build-dependencies] +napi-build = "2" + +[lints] +workspace = true diff --git a/crates/anonymize-napi/build.rs b/crates/anonymize-napi/build.rs new file mode 100644 index 00000000..bbfc9e4b --- /dev/null +++ b/crates/anonymize-napi/build.rs @@ -0,0 +1,3 @@ +fn main() { + napi_build::setup(); +} diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs new file mode 100644 index 00000000..e1b1c230 --- /dev/null +++ b/crates/anonymize-napi/src/lib.rs @@ -0,0 +1,389 @@ +use std::collections::BTreeMap; + +use napi::bindgen_prelude::*; +use napi_derive::napi; +use stella_anonymize_core::{ + CountryMatchData, DetectionSource, FuzzySearchOptions, GazetteerMatchData, + LiteralSearchOptions, OperatorConfig, OperatorType, PatternSlice, + PreparedSearch, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, + RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, + StaticRedactionResult, +}; + +#[napi(object)] +pub struct JsSearchPattern { + pub kind: String, + pub pattern: String, + pub distance: Option, + pub case_insensitive: Option, + pub whole_words: Option, +} + +#[napi(object)] +pub struct JsSearchOptions { + pub literal_case_insensitive: Option, + pub 
literal_whole_words: Option, + pub regex_whole_words: Option, + pub fuzzy_case_insensitive: Option, + pub fuzzy_whole_words: Option, + pub fuzzy_normalize_diacritics: Option, +} + +#[napi(object)] +pub struct JsPatternSlice { + pub start: u32, + pub end: u32, +} + +#[napi(object)] +pub struct JsPreparedSearchSlices { + pub regex: Option, + pub custom_regex: Option, + pub legal_forms: Option, + pub triggers: Option, + pub deny_list: Option, + pub street_types: Option, + pub gazetteer: Option, + pub countries: Option, +} + +#[napi(object)] +pub struct JsRegexMatchMeta { + pub label: String, + pub score: f64, + pub source_detail: Option, + pub requires_validation: Option, +} + +#[napi(object)] +pub struct JsGazetteerMatchData { + pub labels: Vec, + pub is_fuzzy: Vec, +} + +#[napi(object)] +pub struct JsCountryMatchData { + pub labels: Vec, +} + +#[napi(object)] +pub struct JsPreparedSearchConfig { + pub regex_patterns: Vec, + pub custom_regex_patterns: Vec, + pub literal_patterns: Vec, + pub regex_options: Option, + pub custom_regex_options: Option, + pub literal_options: Option, + pub slices: JsPreparedSearchSlices, + pub regex_meta: Vec, + pub custom_regex_meta: Vec, + pub gazetteer_data: Option, + pub country_data: Option, +} + +#[napi(object)] +pub struct JsOperatorConfig { + pub operators: Option>, + pub redact_string: Option, +} + +#[napi(object)] +pub struct JsRedactionEntry { + pub placeholder: String, + pub original: String, +} + +#[napi(object)] +pub struct JsOperatorEntry { + pub placeholder: String, + pub operator: String, +} + +#[napi(object)] +pub struct JsRedactionResult { + pub redacted_text: String, + pub redaction_map: Vec, + pub operator_map: Vec, + pub entity_count: u32, +} + +#[napi(object)] +pub struct JsPipelineEntity { + pub start: u32, + pub end: u32, + pub label: String, + pub text: String, + pub score: f64, + pub source: String, + pub source_detail: Option, +} + +#[napi(object)] +pub struct JsStaticRedactionResult { + pub resolved_entities: 
Vec, + pub redaction: JsRedactionResult, +} + +#[napi] +#[must_use] +#[allow(clippy::needless_pass_by_value)] +pub fn normalize_for_search(text: String) -> String { + stella_anonymize_core::normalize_for_search(&text) +} + +#[napi] +pub struct NativePreparedSearch { + inner: PreparedSearch, +} + +#[napi] +impl NativePreparedSearch { + #[napi(constructor)] + pub fn new(config: JsPreparedSearchConfig) -> Result { + PreparedSearch::new(to_prepared_search_config(config)?) + .map(|inner| Self { inner }) + .map_err(|error| to_napi_error(&error)) + } + + #[napi] + #[allow(clippy::needless_pass_by_value)] + pub fn redact_static_entities( + &self, + full_text: String, + operators: Option, + ) -> Result { + let operator_config = to_operator_config(operators)?; + self + .inner + .redact_static_entities(&full_text, &operator_config) + .map(to_static_redaction_result) + .map_err(|error| to_napi_error(&error)) + } +} + +fn to_prepared_search_config( + config: JsPreparedSearchConfig, +) -> Result { + Ok(PreparedSearchConfig { + regex_patterns: to_search_patterns(config.regex_patterns)?, + custom_regex_patterns: to_search_patterns(config.custom_regex_patterns)?, + literal_patterns: to_search_patterns(config.literal_patterns)?, + regex_options: to_search_options(config.regex_options), + custom_regex_options: to_search_options(config.custom_regex_options), + literal_options: to_search_options(config.literal_options), + slices: to_slices(config.slices), + regex_meta: to_regex_meta(config.regex_meta), + custom_regex_meta: to_regex_meta(config.custom_regex_meta), + gazetteer_data: config.gazetteer_data.map(|data| GazetteerMatchData { + labels: data.labels, + is_fuzzy: data.is_fuzzy, + }), + country_data: config.country_data.map(|data| CountryMatchData { + labels: data.labels, + }), + }) +} + +fn to_search_patterns( + patterns: Vec, +) -> Result> { + patterns + .into_iter() + .map(|pattern| match pattern.kind.as_str() { + "literal" => Ok(SearchPattern::Literal(pattern.pattern)), + 
"literal-with-options" => Ok(SearchPattern::LiteralWithOptions { + pattern: pattern.pattern, + case_insensitive: pattern.case_insensitive, + whole_words: pattern.whole_words, + }), + "regex" => Ok(SearchPattern::Regex(pattern.pattern)), + "fuzzy" => Ok(SearchPattern::Fuzzy { + pattern: pattern.pattern, + distance: pattern + .distance + .map(|distance| { + u8::try_from(distance).map_err(|_| { + Error::from_reason(format!( + "Fuzzy distance exceeds u8 range: {distance}" + )) + }) + }) + .transpose()?, + }), + _ => Err(Error::from_reason(format!( + "Unsupported search pattern kind: {}", + pattern.kind + ))), + }) + .collect() +} + +fn to_search_options(options: Option) -> SearchOptions { + let Some(options) = options else { + return SearchOptions::default(); + }; + + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: options.literal_case_insensitive.unwrap_or(false), + whole_words: options.literal_whole_words.unwrap_or(false), + }, + regex: RegexSearchOptions { + whole_words: options.regex_whole_words.unwrap_or(false), + }, + fuzzy: FuzzySearchOptions { + case_insensitive: options.fuzzy_case_insensitive.unwrap_or(false), + whole_words: options.fuzzy_whole_words.unwrap_or(true), + normalize_diacritics: options.fuzzy_normalize_diacritics.unwrap_or(false), + }, + } +} + +fn to_slices(slices: JsPreparedSearchSlices) -> PreparedSearchSlices { + PreparedSearchSlices { + regex: to_slice(slices.regex), + custom_regex: to_slice(slices.custom_regex), + legal_forms: to_slice(slices.legal_forms), + triggers: to_slice(slices.triggers), + deny_list: to_slice(slices.deny_list), + street_types: to_slice(slices.street_types), + gazetteer: to_slice(slices.gazetteer), + countries: to_slice(slices.countries), + } +} + +fn to_slice(slice: Option) -> PatternSlice { + slice.map_or_else(PatternSlice::default, |slice| PatternSlice { + start: slice.start, + end: slice.end, + }) +} + +fn to_regex_meta(meta: Vec) -> Vec { + meta + .into_iter() + .map(|entry| RegexMatchMeta { + 
label: entry.label, + score: entry.score, + source_detail: entry.source_detail.as_deref().and_then(to_source_detail), + requires_validation: entry.requires_validation.unwrap_or(false), + }) + .collect() +} + +fn to_source_detail(value: &str) -> Option { + match value { + "custom-deny-list" => Some(SourceDetail::CustomDenyList), + "custom-regex" => Some(SourceDetail::CustomRegex), + "gazetteer-extension" => Some(SourceDetail::GazetteerExtension), + _ => None, + } +} + +fn to_operator_config( + config: Option, +) -> Result { + let Some(config) = config else { + return Ok(OperatorConfig::default()); + }; + + let mut operators = BTreeMap::new(); + for (label, value) in config.operators.unwrap_or_default() { + operators.insert(label, to_operator_type(&value)?); + } + + Ok(OperatorConfig { + operators, + redact_string: config + .redact_string + .unwrap_or_else(|| String::from("[REDACTED]")), + }) +} + +fn to_operator_type(value: &str) -> Result { + match value { + "replace" => Ok(OperatorType::Replace), + "redact" => Ok(OperatorType::Redact), + _ => Err(Error::from_reason(format!( + "Unsupported anonymization operator: {value}" + ))), + } +} + +fn to_static_redaction_result( + result: StaticRedactionResult, +) -> JsStaticRedactionResult { + JsStaticRedactionResult { + resolved_entities: result + .resolved_entities + .into_iter() + .map(|entity| JsPipelineEntity { + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: detection_source_name(entity.source), + source_detail: entity.source_detail.map(source_detail_name), + }) + .collect(), + redaction: JsRedactionResult { + redacted_text: result.redaction.redacted_text, + redaction_map: result + .redaction + .redaction_map + .into_iter() + .map(|entry| JsRedactionEntry { + placeholder: entry.placeholder, + original: entry.original, + }) + .collect(), + operator_map: result + .redaction + .operator_map + .into_iter() + .map(|entry| JsOperatorEntry { + placeholder: 
entry.placeholder, + operator: operator_name(entry.operator), + }) + .collect(), + entity_count: u32::try_from(result.redaction.entity_count) + .unwrap_or(u32::MAX), + }, + } +} + +fn detection_source_name(source: DetectionSource) -> String { + match source { + DetectionSource::Trigger => "trigger", + DetectionSource::Regex => "regex", + DetectionSource::DenyList => "deny-list", + DetectionSource::LegalForm => "legal-form", + DetectionSource::Gazetteer => "gazetteer", + DetectionSource::Country => "country", + DetectionSource::Ner => "ner", + DetectionSource::Coreference => "coreference", + } + .to_owned() +} + +fn source_detail_name(detail: SourceDetail) -> String { + match detail { + SourceDetail::CustomDenyList => "custom-deny-list", + SourceDetail::CustomRegex => "custom-regex", + SourceDetail::GazetteerExtension => "gazetteer-extension", + } + .to_owned() +} + +fn operator_name(operator: OperatorType) -> String { + match operator { + OperatorType::Replace => "replace", + OperatorType::Redact => "redact", + } + .to_owned() +} + +fn to_napi_error(error: &stella_anonymize_core::Error) -> Error { + Error::from_reason(error.to_string()) +} diff --git a/crates/anonymize-py/Cargo.toml b/crates/anonymize-py/Cargo.toml new file mode 100644 index 00000000..1dd9bc66 --- /dev/null +++ b/crates/anonymize-py/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "stella-anonymize-py" +version.workspace = true +edition.workspace = true +description = "Python bindings for Stella anonymization core" +license.workspace = true +publish.workspace = true +repository.workspace = true + +[lib] +name = "stella_anonymize_core_py" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.29", features = ["extension-module", "abi3-py311"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +stella-anonymize-core = { path = "../anonymize-core" } + +[lints] +workspace = true diff --git a/crates/anonymize-py/src/lib.rs b/crates/anonymize-py/src/lib.rs new file mode 100644 
index 00000000..1d195504 --- /dev/null +++ b/crates/anonymize-py/src/lib.rs @@ -0,0 +1,410 @@ +use std::collections::BTreeMap; + +use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; +use serde::Deserialize; +use stella_anonymize_core::{ + CountryMatchData, DetectionSource, FuzzySearchOptions, GazetteerMatchData, + LiteralSearchOptions, OperatorConfig, OperatorType, PatternSlice, + PreparedSearch as CorePreparedSearch, PreparedSearchConfig, + PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchOptions, + SearchPattern, SourceDetail, StaticRedactionResult, +}; + +#[derive(Deserialize)] +struct SearchPatternDto { + kind: String, + pattern: String, + distance: Option, + case_insensitive: Option, + whole_words: Option, +} + +#[derive(Deserialize)] +struct SearchOptionsDto { + literal_case_insensitive: Option, + literal_whole_words: Option, + regex_whole_words: Option, + fuzzy_case_insensitive: Option, + fuzzy_whole_words: Option, + fuzzy_normalize_diacritics: Option, +} + +#[derive(Deserialize)] +struct PatternSliceDto { + start: u32, + end: u32, +} + +#[derive(Deserialize)] +struct PreparedSearchSlicesDto { + regex: Option, + custom_regex: Option, + legal_forms: Option, + triggers: Option, + deny_list: Option, + street_types: Option, + gazetteer: Option, + countries: Option, +} + +#[derive(Deserialize)] +struct RegexMatchMetaDto { + label: String, + score: f64, + source_detail: Option, + requires_validation: Option, +} + +#[derive(Deserialize)] +struct GazetteerMatchDataDto { + labels: Vec, + is_fuzzy: Vec, +} + +#[derive(Deserialize)] +struct CountryMatchDataDto { + labels: Vec, +} + +#[derive(Deserialize)] +struct PreparedSearchConfigDto { + regex_patterns: Vec, + custom_regex_patterns: Vec, + literal_patterns: Vec, + regex_options: Option, + custom_regex_options: Option, + literal_options: Option, + slices: PreparedSearchSlicesDto, + regex_meta: Vec, + custom_regex_meta: Vec, + gazetteer_data: Option, + country_data: Option, +} + 
+#[derive(Default, Deserialize)] +struct OperatorConfigDto { + operators: Option>, + redact_string: Option, +} + +#[pyclass(name = "RedactionEntry", get_all, skip_from_py_object)] +#[derive(Clone)] +pub struct PyRedactionEntry { + placeholder: String, + original: String, +} + +#[pyclass(name = "OperatorEntry", get_all, skip_from_py_object)] +#[derive(Clone)] +pub struct PyOperatorEntry { + placeholder: String, + operator: String, +} + +#[pyclass(name = "RedactionResult", get_all, skip_from_py_object)] +#[derive(Clone)] +pub struct PyRedactionResult { + redacted_text: String, + redaction_map: Vec, + operator_map: Vec, + entity_count: usize, +} + +#[pyclass(name = "PipelineEntity", get_all, skip_from_py_object)] +#[derive(Clone)] +pub struct PyPipelineEntity { + start: u32, + end: u32, + label: String, + text: String, + score: f64, + source: String, + source_detail: Option, +} + +#[pyclass(name = "StaticRedactionResult", get_all, skip_from_py_object)] +#[derive(Clone)] +pub struct PyStaticRedactionResult { + resolved_entities: Vec, + redaction: PyRedactionResult, +} + +#[pyclass(name = "PreparedSearch")] +pub struct PyPreparedSearch { + inner: CorePreparedSearch, +} + +#[pymethods] +impl PyPreparedSearch { + #[new] + fn new(config_json: &str) -> PyResult { + let config: PreparedSearchConfigDto = serde_json::from_str(config_json) + .map_err(|error| to_py_value_error(&error))?; + let inner = CorePreparedSearch::new(to_prepared_search_config(config)?) 
+ .map_err(|error| to_py_core_error(&error))?; + Ok(Self { inner }) + } + + fn redact_static_entities( + &self, + full_text: &str, + operators_json: Option<&str>, + ) -> PyResult { + let operators = to_operator_config(operators_json)?; + self + .inner + .redact_static_entities(full_text, &operators) + .map(to_static_redaction_result) + .map_err(|error| to_py_core_error(&error)) + } +} + +#[pyfunction] +fn normalize_for_search(text: &str) -> String { + stella_anonymize_core::normalize_for_search(text) +} + +fn to_prepared_search_config( + config: PreparedSearchConfigDto, +) -> PyResult { + Ok(PreparedSearchConfig { + regex_patterns: to_search_patterns(config.regex_patterns)?, + custom_regex_patterns: to_search_patterns(config.custom_regex_patterns)?, + literal_patterns: to_search_patterns(config.literal_patterns)?, + regex_options: to_search_options(config.regex_options), + custom_regex_options: to_search_options(config.custom_regex_options), + literal_options: to_search_options(config.literal_options), + slices: to_slices(config.slices), + regex_meta: to_regex_meta(config.regex_meta), + custom_regex_meta: to_regex_meta(config.custom_regex_meta), + gazetteer_data: config.gazetteer_data.map(|data| GazetteerMatchData { + labels: data.labels, + is_fuzzy: data.is_fuzzy, + }), + country_data: config.country_data.map(|data| CountryMatchData { + labels: data.labels, + }), + }) +} + +fn to_search_patterns( + patterns: Vec, +) -> PyResult> { + patterns + .into_iter() + .map(|pattern| match pattern.kind.as_str() { + "literal" => Ok(SearchPattern::Literal(pattern.pattern)), + "literal-with-options" => Ok(SearchPattern::LiteralWithOptions { + pattern: pattern.pattern, + case_insensitive: pattern.case_insensitive, + whole_words: pattern.whole_words, + }), + "regex" => Ok(SearchPattern::Regex(pattern.pattern)), + "fuzzy" => Ok(SearchPattern::Fuzzy { + pattern: pattern.pattern, + distance: pattern + .distance + .map(|distance| { + u8::try_from(distance).map_err(|_| { + 
PyValueError::new_err(format!( + "Fuzzy distance exceeds u8 range: {distance}" + )) + }) + }) + .transpose()?, + }), + _ => Err(PyValueError::new_err(format!( + "Unsupported search pattern kind: {}", + pattern.kind + ))), + }) + .collect() +} + +fn to_search_options(options: Option) -> SearchOptions { + let Some(options) = options else { + return SearchOptions::default(); + }; + + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: options.literal_case_insensitive.unwrap_or(false), + whole_words: options.literal_whole_words.unwrap_or(false), + }, + regex: RegexSearchOptions { + whole_words: options.regex_whole_words.unwrap_or(false), + }, + fuzzy: FuzzySearchOptions { + case_insensitive: options.fuzzy_case_insensitive.unwrap_or(false), + whole_words: options.fuzzy_whole_words.unwrap_or(true), + normalize_diacritics: options.fuzzy_normalize_diacritics.unwrap_or(false), + }, + } +} + +fn to_slices(slices: PreparedSearchSlicesDto) -> PreparedSearchSlices { + PreparedSearchSlices { + regex: to_slice(slices.regex), + custom_regex: to_slice(slices.custom_regex), + legal_forms: to_slice(slices.legal_forms), + triggers: to_slice(slices.triggers), + deny_list: to_slice(slices.deny_list), + street_types: to_slice(slices.street_types), + gazetteer: to_slice(slices.gazetteer), + countries: to_slice(slices.countries), + } +} + +fn to_slice(slice: Option) -> PatternSlice { + slice.map_or_else(PatternSlice::default, |slice| PatternSlice { + start: slice.start, + end: slice.end, + }) +} + +fn to_regex_meta(meta: Vec) -> Vec { + meta + .into_iter() + .map(|entry| RegexMatchMeta { + label: entry.label, + score: entry.score, + source_detail: entry.source_detail.as_deref().and_then(to_source_detail), + requires_validation: entry.requires_validation.unwrap_or(false), + }) + .collect() +} + +fn to_source_detail(value: &str) -> Option { + match value { + "custom-deny-list" => Some(SourceDetail::CustomDenyList), + "custom-regex" => Some(SourceDetail::CustomRegex), + 
"gazetteer-extension" => Some(SourceDetail::GazetteerExtension), + _ => None, + } +} + +fn to_operator_config( + operators_json: Option<&str>, +) -> PyResult { + let Some(operators_json) = operators_json else { + return Ok(OperatorConfig::default()); + }; + let config: OperatorConfigDto = serde_json::from_str(operators_json) + .map_err(|error| to_py_value_error(&error))?; + + let mut operators = BTreeMap::new(); + for (label, value) in config.operators.unwrap_or_default() { + operators.insert(label, to_operator_type(&value)?); + } + + Ok(OperatorConfig { + operators, + redact_string: config + .redact_string + .unwrap_or_else(|| String::from("[REDACTED]")), + }) +} + +fn to_operator_type(value: &str) -> PyResult { + match value { + "replace" => Ok(OperatorType::Replace), + "redact" => Ok(OperatorType::Redact), + _ => Err(PyValueError::new_err(format!( + "Unsupported anonymization operator: {value}" + ))), + } +} + +fn to_static_redaction_result( + result: StaticRedactionResult, +) -> PyStaticRedactionResult { + PyStaticRedactionResult { + resolved_entities: result + .resolved_entities + .into_iter() + .map(|entity| PyPipelineEntity { + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: detection_source_name(entity.source), + source_detail: entity.source_detail.map(source_detail_name), + }) + .collect(), + redaction: PyRedactionResult { + redacted_text: result.redaction.redacted_text, + redaction_map: result + .redaction + .redaction_map + .into_iter() + .map(|entry| PyRedactionEntry { + placeholder: entry.placeholder, + original: entry.original, + }) + .collect(), + operator_map: result + .redaction + .operator_map + .into_iter() + .map(|entry| PyOperatorEntry { + placeholder: entry.placeholder, + operator: operator_name(entry.operator), + }) + .collect(), + entity_count: result.redaction.entity_count, + }, + } +} + +fn detection_source_name(source: DetectionSource) -> String { + match source { + 
DetectionSource::Trigger => "trigger", + DetectionSource::Regex => "regex", + DetectionSource::DenyList => "deny-list", + DetectionSource::LegalForm => "legal-form", + DetectionSource::Gazetteer => "gazetteer", + DetectionSource::Country => "country", + DetectionSource::Ner => "ner", + DetectionSource::Coreference => "coreference", + } + .to_owned() +} + +fn source_detail_name(detail: SourceDetail) -> String { + match detail { + SourceDetail::CustomDenyList => "custom-deny-list", + SourceDetail::CustomRegex => "custom-regex", + SourceDetail::GazetteerExtension => "gazetteer-extension", + } + .to_owned() +} + +fn operator_name(operator: OperatorType) -> String { + match operator { + OperatorType::Replace => "replace", + OperatorType::Redact => "redact", + } + .to_owned() +} + +fn to_py_core_error(error: &stella_anonymize_core::Error) -> PyErr { + PyValueError::new_err(error.to_string()) +} + +fn to_py_value_error(error: &serde_json::Error) -> PyErr { + PyValueError::new_err(error.to_string()) +} + +#[pymodule] +fn stella_anonymize_core_py(module: &Bound<'_, PyModule>) -> PyResult<()> { + module.add_class::()?; + module.add_class::()?; + module.add_class::()?; + module.add_class::()?; + module.add_class::()?; + module.add_class::()?; + module.add_function(wrap_pyfunction!(normalize_for_search, module)?)?; + Ok(()) +} From 0e2cd08777ff27d65df082a4bbee32fe3861ffb0 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 11:38:15 +0200 Subject: [PATCH 017/130] feat: share native adapter contract --- .github/tools/sync-runtime-version.mjs | 99 ++++ .github/workflows/ci.yml | 6 + .github/workflows/dependency-review.yml | 1 + Cargo.lock | 14 +- Cargo.toml | 1 + crates/anonymize-adapter-contract/Cargo.toml | 18 + .../examples/native_adapter_perf.rs | 70 +++ crates/anonymize-adapter-contract/src/lib.rs | 412 +++++++++++++++++ crates/anonymize-napi/Cargo.toml | 2 + crates/anonymize-napi/src/lib.rs | 345 +++++++------- crates/anonymize-py/Cargo.toml | 5 +- 
crates/anonymize-py/build.rs | 3 + crates/anonymize-py/pyproject.toml | 20 + crates/anonymize-py/src/lib.rs | 430 ++++++------------ packages/anonymize/package.json | 1 + .../anonymize/scripts/native-adapter-perf.mjs | 304 +++++++++++++ .../__test__/native-adapter-parity.test.ts | 418 +++++++++++++++++ 17 files changed, 1680 insertions(+), 469 deletions(-) create mode 100644 crates/anonymize-adapter-contract/Cargo.toml create mode 100644 crates/anonymize-adapter-contract/examples/native_adapter_perf.rs create mode 100644 crates/anonymize-adapter-contract/src/lib.rs create mode 100644 crates/anonymize-py/build.rs create mode 100644 crates/anonymize-py/pyproject.toml create mode 100644 packages/anonymize/scripts/native-adapter-perf.mjs create mode 100644 packages/anonymize/src/__test__/native-adapter-parity.test.ts diff --git a/.github/tools/sync-runtime-version.mjs b/.github/tools/sync-runtime-version.mjs index 75b7cfad..b6e07cda 100644 --- a/.github/tools/sync-runtime-version.mjs +++ b/.github/tools/sync-runtime-version.mjs @@ -10,7 +10,16 @@ const PACKAGE_FILES = [ "packages/cli/package.json", ]; +const CARGO_WORKSPACE_MANIFEST = "Cargo.toml"; +const CARGO_LOCKED_PACKAGES = [ + "stella-anonymize-adapter-contract", + "stella-anonymize-core", + "stella-anonymize-napi", + "stella-anonymize-py", +]; +const PYPROJECT_FILES = ["crates/anonymize-py/pyproject.toml"]; const LOCK_FILE = "bun.lock"; +const CARGO_LOCK_FILE = "Cargo.lock"; const checkOnly = process.argv.includes("--check"); const version = readFileSync("VERSION", "utf8").trim(); @@ -30,6 +39,29 @@ const SYNCED_DEPENDENCY_RANGE_RE = /("@stll\/anonymize": "\^)([^"]+)(")/g; const escapeRegExp = (value) => value.replaceAll(/[.*+?^${}()|[\]\\]/g, "\\$&"); +const syncTextVersion = ({ file, label, re }) => { + const text = readFileSync(file, "utf8"); + const match = text.match(re); + if (!match) { + console.error(`${file} has no ${label} version entry`); + hasMismatch = true; + return; + } + const current = match[2]; 
+ if (current === version) { + return; + } + if (checkOnly) { + console.error( + `${file} has ${label} version ${current}; expected ${version}`, + ); + hasMismatch = true; + return; + } + writeFileSync(file, text.replace(re, `$1${version}$3`)); + console.log(`Updated ${file} ${label} version to ${version}`); +}; + for (const file of PACKAGE_FILES) { const pkg = JSON.parse(readFileSync(file, "utf8")); const wantedRange = `^${version}`; @@ -61,6 +93,34 @@ for (const file of PACKAGE_FILES) { console.log(`Updated ${file} to ${version}`); } +syncTextVersion({ + file: CARGO_WORKSPACE_MANIFEST, + label: "Cargo workspace", + re: /(\[workspace\.package\][\s\S]*?\nversion\s*=\s*")([^"]+)(")/, +}); + +for (const file of PYPROJECT_FILES) { + const text = readFileSync(file, "utf8"); + const explicitVersion = text.match(/^version\s*=\s*"([^"]+)"/m); + if (explicitVersion) { + syncTextVersion({ + file, + label: "Python project", + re: /(^version\s*=\s*")([^"]+)(")/m, + }); + continue; + } + + if (/\bdynamic\s*=\s*\[[^\]]*"version"[^\]]*\]/m.test(text)) { + continue; + } + + console.error( + `${file} must either derive version dynamically from Cargo or match VERSION`, + ); + hasMismatch = true; +} + const lockText = readFileSync(LOCK_FILE, "utf8"); let lockChanged = false; let syncedLockText = lockText.replaceAll( @@ -116,6 +176,45 @@ if (lockChanged) { ); } +const cargoLockText = readFileSync(CARGO_LOCK_FILE, "utf8"); +let cargoLockChanged = false; +let syncedCargoLockText = cargoLockText; + +for (const packageName of CARGO_LOCKED_PACKAGES) { + const packageVersionRe = new RegExp( + `(\\[\\[package\\]\\]\\nname = "${escapeRegExp(packageName)}"\\nversion = ")([^"]+)(")`, + ); + const match = syncedCargoLockText.match(packageVersionRe); + if (!match) { + console.error(`${CARGO_LOCK_FILE} has no package entry for ${packageName}`); + hasMismatch = true; + continue; + } + const lockedVersion = match[2]; + if (lockedVersion === version) { + continue; + } + if (checkOnly) { + 
console.error( + `${CARGO_LOCK_FILE} package ${packageName} has version ${lockedVersion}; expected ${version}`, + ); + hasMismatch = true; + continue; + } + syncedCargoLockText = syncedCargoLockText.replace( + packageVersionRe, + `$1${version}$3`, + ); + cargoLockChanged = true; +} + +if (cargoLockChanged) { + writeFileSync(CARGO_LOCK_FILE, syncedCargoLockText); + console.log( + `Updated ${CARGO_LOCK_FILE} local package versions to ${version}`, + ); +} + if (hasMismatch) { process.exit(1); } diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1320484a..a6e2f1a8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -87,3 +87,9 @@ jobs: # noisy); promote to a hard gate once a stable baseline is set. continue-on-error: true run: bun run --cwd packages/anonymize perf:contracts + + - name: Native adapter performance (informational) + # Emits aggregate TS/NAPI and Python/PyO3 timings for the shared + # Rust-backed static adapter contract. + continue-on-error: true + run: bun run --cwd packages/anonymize perf:native-adapters diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 2dc53ebb..6f2c5b4d 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -28,6 +28,7 @@ jobs: allow-licenses: >- MIT, Apache-2.0, + Apache-2.0 WITH LLVM-exception, BSD-2-Clause, BSD-3-Clause, ISC, diff --git a/Cargo.lock b/Cargo.lock index 5cf8d7d0..a0b687aa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -443,6 +443,15 @@ dependencies = [ "unicode-case-mapping", ] +[[package]] +name = "stella-anonymize-adapter-contract" +version = "1.5.0" +dependencies = [ + "serde", + "serde_json", + "stella-anonymize-core", +] + [[package]] name = "stella-anonymize-core" version = "1.5.0" @@ -459,6 +468,8 @@ dependencies = [ "napi", "napi-build", "napi-derive", + "serde_json", + "stella-anonymize-adapter-contract", "stella-anonymize-core", ] @@ -467,8 +478,9 @@ name = 
"stella-anonymize-py" version = "1.5.0" dependencies = [ "pyo3", - "serde", + "pyo3-build-config", "serde_json", + "stella-anonymize-adapter-contract", "stella-anonymize-core", ] diff --git a/Cargo.toml b/Cargo.toml index 502300e2..233582b2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,6 @@ [workspace] members = [ + "crates/anonymize-adapter-contract", "crates/anonymize-core", "crates/anonymize-napi", "crates/anonymize-py", diff --git a/crates/anonymize-adapter-contract/Cargo.toml b/crates/anonymize-adapter-contract/Cargo.toml new file mode 100644 index 00000000..14a109fe --- /dev/null +++ b/crates/anonymize-adapter-contract/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "stella-anonymize-adapter-contract" +version.workspace = true +edition.workspace = true +description = "Shared adapter contract for Stella anonymization bindings" +license.workspace = true +publish.workspace = true +repository.workspace = true + +[dependencies] +serde = { version = "1", features = ["derive"] } +stella-anonymize-core = { path = "../anonymize-core" } + +[dev-dependencies] +serde_json = "1" + +[lints] +workspace = true diff --git a/crates/anonymize-adapter-contract/examples/native_adapter_perf.rs b/crates/anonymize-adapter-contract/examples/native_adapter_perf.rs new file mode 100644 index 00000000..6d0a204e --- /dev/null +++ b/crates/anonymize-adapter-contract/examples/native_adapter_perf.rs @@ -0,0 +1,70 @@ +#![allow(clippy::print_stdout)] + +use std::env; +use std::io::Write; +use std::time::Instant; + +use serde::Deserialize; +use serde_json::json; +use stella_anonymize_adapter_contract::{ + BindingOperatorConfig, BindingPreparedSearchConfig, + operator_config_from_binding, prepared_search_config_from_binding, +}; +use stella_anonymize_core::PreparedSearch; + +#[derive(Deserialize)] +struct Payload { + config_json: String, + iterations: usize, + cases: Vec, +} + +#[derive(Deserialize)] +struct Case { + text: String, + operators_json: Option, +} + +fn main() -> Result<(), 
Box> { + let payload = env::var("STELLA_ANONYMIZE_PERF_PAYLOAD")?; + let payload = serde_json::from_str::(&payload)?; + let config = + serde_json::from_str::(&payload.config_json)?; + + let prepare_start = Instant::now(); + let prepared = + PreparedSearch::new(prepared_search_config_from_binding(config)?)?; + let prepare_ms = elapsed_ms(prepare_start); + + let run_start = Instant::now(); + let mut entity_count = 0_usize; + for _ in 0..payload.iterations { + for item in &payload.cases { + let operators = item + .operators_json + .as_deref() + .map(serde_json::from_str::) + .transpose()?; + let operators = operator_config_from_binding(operators)?; + let result = prepared.redact_static_entities(&item.text, &operators)?; + entity_count = entity_count.saturating_add(result.redaction.entity_count); + } + } + let run_ms = elapsed_ms(run_start); + + let mut stdout = std::io::stdout().lock(); + writeln!( + stdout, + "{}", + json!({ + "prepareMs": prepare_ms, + "runMs": run_ms, + "entityCount": entity_count, + }) + )?; + Ok(()) +} + +fn elapsed_ms(start: Instant) -> f64 { + start.elapsed().as_secs_f64() * 1_000.0 +} diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs new file mode 100644 index 00000000..5108c331 --- /dev/null +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -0,0 +1,412 @@ +use std::collections::BTreeMap; + +use serde::{Deserialize, Serialize}; +use stella_anonymize_core::{ + CountryMatchData, DetectionSource, FuzzySearchOptions, GazetteerMatchData, + LiteralSearchOptions, OperatorConfig, OperatorType, PatternSlice, + PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, + RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, + StaticRedactionResult, +}; + +pub type Result = std::result::Result; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ContractError { + FuzzyDistanceOutOfRange { distance: u32 }, + UnsupportedOperator { value: String }, + UnsupportedSearchPatternKind { kind: 
String }, + UnsupportedSourceDetail { value: String }, +} + +impl std::fmt::Display for ContractError { + fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::FuzzyDistanceOutOfRange { distance } => { + write!(formatter, "Fuzzy distance exceeds u8 range: {distance}") + } + Self::UnsupportedOperator { value } => { + write!(formatter, "Unsupported anonymization operator: {value}") + } + Self::UnsupportedSearchPatternKind { kind } => { + write!(formatter, "Unsupported search pattern kind: {kind}") + } + Self::UnsupportedSourceDetail { value } => { + write!(formatter, "Unsupported source detail: {value}") + } + } + } +} + +impl std::error::Error for ContractError {} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingSearchPattern { + pub kind: String, + pub pattern: String, + pub distance: Option, + pub case_insensitive: Option, + pub whole_words: Option, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingSearchOptions { + pub literal_case_insensitive: Option, + pub literal_whole_words: Option, + pub regex_whole_words: Option, + pub fuzzy_case_insensitive: Option, + pub fuzzy_whole_words: Option, + pub fuzzy_normalize_diacritics: Option, +} + +#[derive( + Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize, +)] +pub struct BindingPatternSlice { + pub start: u32, + pub end: u32, +} + +#[derive( + Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize, +)] +pub struct BindingPreparedSearchSlices { + pub regex: Option, + pub custom_regex: Option, + pub legal_forms: Option, + pub triggers: Option, + pub deny_list: Option, + pub street_types: Option, + pub gazetteer: Option, + pub countries: Option, +} + +#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] +pub struct BindingRegexMatchMeta { + pub label: String, + pub score: f64, + pub source_detail: Option, + pub requires_validation: Option, +} + 
+#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingGazetteerMatchData { + pub labels: Vec, + pub is_fuzzy: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingCountryMatchData { + pub labels: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] +pub struct BindingPreparedSearchConfig { + #[serde(default)] + pub regex_patterns: Vec, + #[serde(default)] + pub custom_regex_patterns: Vec, + #[serde(default)] + pub literal_patterns: Vec, + #[serde(default)] + pub regex_options: Option, + #[serde(default)] + pub custom_regex_options: Option, + #[serde(default)] + pub literal_options: Option, + #[serde(default)] + pub slices: BindingPreparedSearchSlices, + #[serde(default)] + pub regex_meta: Vec, + #[serde(default)] + pub custom_regex_meta: Vec, + #[serde(default)] + pub gazetteer_data: Option, + #[serde(default)] + pub country_data: Option, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingOperatorConfig { + pub operators: Option>, + pub redact_string: Option, +} + +#[derive(Clone, Debug, Eq, PartialEq, Serialize)] +pub struct BindingRedactionEntry { + pub placeholder: String, + pub original: String, +} + +#[derive(Clone, Debug, Eq, PartialEq, Serialize)] +pub struct BindingOperatorEntry { + pub placeholder: String, + pub operator: String, +} + +#[derive(Clone, Debug, Eq, PartialEq, Serialize)] +pub struct BindingRedactionResult { + pub redacted_text: String, + pub redaction_map: Vec, + pub operator_map: Vec, + pub entity_count: usize, +} + +#[derive(Clone, Debug, PartialEq, Serialize)] +pub struct BindingPipelineEntity { + pub start: u32, + pub end: u32, + pub label: String, + pub text: String, + pub score: f64, + pub source: String, + pub source_detail: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize)] +pub struct BindingStaticRedactionResult { + pub resolved_entities: Vec, + pub redaction: 
BindingRedactionResult, +} + +pub fn prepared_search_config_from_binding( + config: BindingPreparedSearchConfig, +) -> Result { + Ok(PreparedSearchConfig { + regex_patterns: search_patterns_from_binding(config.regex_patterns)?, + custom_regex_patterns: search_patterns_from_binding( + config.custom_regex_patterns, + )?, + literal_patterns: search_patterns_from_binding(config.literal_patterns)?, + regex_options: search_options_from_binding(config.regex_options), + custom_regex_options: search_options_from_binding( + config.custom_regex_options, + ), + literal_options: search_options_from_binding(config.literal_options), + slices: slices_from_binding(&config.slices), + regex_meta: regex_meta_from_binding(config.regex_meta)?, + custom_regex_meta: regex_meta_from_binding(config.custom_regex_meta)?, + gazetteer_data: config.gazetteer_data.map(|data| GazetteerMatchData { + labels: data.labels, + is_fuzzy: data.is_fuzzy, + }), + country_data: config.country_data.map(|data| CountryMatchData { + labels: data.labels, + }), + }) +} + +pub fn operator_config_from_binding( + config: Option, +) -> Result { + let Some(config) = config else { + return Ok(OperatorConfig::default()); + }; + + let mut operators = BTreeMap::new(); + for (label, value) in config.operators.unwrap_or_default() { + operators.insert(label, operator_type_from_binding(&value)?); + } + + Ok(OperatorConfig { + operators, + redact_string: config + .redact_string + .unwrap_or_else(|| String::from("[REDACTED]")), + }) +} + +#[must_use] +pub fn static_redaction_result_to_binding( + result: StaticRedactionResult, +) -> BindingStaticRedactionResult { + BindingStaticRedactionResult { + resolved_entities: result + .resolved_entities + .into_iter() + .map(|entity| BindingPipelineEntity { + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: detection_source_name(entity.source), + source_detail: entity.source_detail.map(source_detail_name), + }) + 
.collect(), + redaction: BindingRedactionResult { + redacted_text: result.redaction.redacted_text, + redaction_map: result + .redaction + .redaction_map + .into_iter() + .map(|entry| BindingRedactionEntry { + placeholder: entry.placeholder, + original: entry.original, + }) + .collect(), + operator_map: result + .redaction + .operator_map + .into_iter() + .map(|entry| BindingOperatorEntry { + placeholder: entry.placeholder, + operator: operator_name(entry.operator), + }) + .collect(), + entity_count: result.redaction.entity_count, + }, + } +} + +fn search_patterns_from_binding( + patterns: Vec, +) -> Result> { + patterns + .into_iter() + .map(search_pattern_from_binding) + .collect() +} + +fn search_pattern_from_binding( + pattern: BindingSearchPattern, +) -> Result { + match pattern.kind.as_str() { + "literal" => Ok(SearchPattern::Literal(pattern.pattern)), + "literal-with-options" => Ok(SearchPattern::LiteralWithOptions { + pattern: pattern.pattern, + case_insensitive: pattern.case_insensitive, + whole_words: pattern.whole_words, + }), + "regex" => Ok(SearchPattern::Regex(pattern.pattern)), + "fuzzy" => Ok(SearchPattern::Fuzzy { + pattern: pattern.pattern, + distance: pattern + .distance + .map(|distance| { + u8::try_from(distance) + .map_err(|_| ContractError::FuzzyDistanceOutOfRange { distance }) + }) + .transpose()?, + }), + _ => { + Err(ContractError::UnsupportedSearchPatternKind { kind: pattern.kind }) + } + } +} + +fn search_options_from_binding( + options: Option, +) -> SearchOptions { + let Some(options) = options else { + return SearchOptions::default(); + }; + + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: options.literal_case_insensitive.unwrap_or(false), + whole_words: options.literal_whole_words.unwrap_or(false), + }, + regex: RegexSearchOptions { + whole_words: options.regex_whole_words.unwrap_or(false), + }, + fuzzy: FuzzySearchOptions { + case_insensitive: options.fuzzy_case_insensitive.unwrap_or(false), + whole_words: 
options.fuzzy_whole_words.unwrap_or(true), + normalize_diacritics: options.fuzzy_normalize_diacritics.unwrap_or(false), + }, + } +} + +fn slices_from_binding( + slices: &BindingPreparedSearchSlices, +) -> PreparedSearchSlices { + PreparedSearchSlices { + regex: slice_from_binding(slices.regex), + custom_regex: slice_from_binding(slices.custom_regex), + legal_forms: slice_from_binding(slices.legal_forms), + triggers: slice_from_binding(slices.triggers), + deny_list: slice_from_binding(slices.deny_list), + street_types: slice_from_binding(slices.street_types), + gazetteer: slice_from_binding(slices.gazetteer), + countries: slice_from_binding(slices.countries), + } +} + +fn slice_from_binding(slice: Option) -> PatternSlice { + slice.map_or_else(PatternSlice::default, |slice| PatternSlice { + start: slice.start, + end: slice.end, + }) +} + +fn regex_meta_from_binding( + meta: Vec, +) -> Result> { + meta + .into_iter() + .map(|entry| { + Ok(RegexMatchMeta { + label: entry.label, + score: entry.score, + source_detail: entry + .source_detail + .map(|value| source_detail_from_binding(&value)) + .transpose()?, + requires_validation: entry.requires_validation.unwrap_or(false), + }) + }) + .collect() +} + +fn source_detail_from_binding(value: &str) -> Result { + match value { + "custom-deny-list" => Ok(SourceDetail::CustomDenyList), + "custom-regex" => Ok(SourceDetail::CustomRegex), + "gazetteer-extension" => Ok(SourceDetail::GazetteerExtension), + _ => Err(ContractError::UnsupportedSourceDetail { + value: value.to_owned(), + }), + } +} + +fn operator_type_from_binding(value: &str) -> Result { + match value { + "replace" => Ok(OperatorType::Replace), + "redact" => Ok(OperatorType::Redact), + _ => Err(ContractError::UnsupportedOperator { + value: value.to_owned(), + }), + } +} + +fn detection_source_name(source: DetectionSource) -> String { + match source { + DetectionSource::Trigger => "trigger", + DetectionSource::Regex => "regex", + DetectionSource::DenyList => "deny-list", 
+ DetectionSource::LegalForm => "legal-form", + DetectionSource::Gazetteer => "gazetteer", + DetectionSource::Country => "country", + DetectionSource::Ner => "ner", + DetectionSource::Coreference => "coreference", + } + .to_owned() +} + +fn source_detail_name(detail: SourceDetail) -> String { + match detail { + SourceDetail::CustomDenyList => "custom-deny-list", + SourceDetail::CustomRegex => "custom-regex", + SourceDetail::GazetteerExtension => "gazetteer-extension", + } + .to_owned() +} + +fn operator_name(operator: OperatorType) -> String { + match operator { + OperatorType::Replace => "replace", + OperatorType::Redact => "redact", + } + .to_owned() +} diff --git a/crates/anonymize-napi/Cargo.toml b/crates/anonymize-napi/Cargo.toml index 6cab0467..0964fdf7 100644 --- a/crates/anonymize-napi/Cargo.toml +++ b/crates/anonymize-napi/Cargo.toml @@ -16,6 +16,8 @@ napi = { version = "3", default-features = false, features = [ "serde-json", ] } napi-derive = "3" +serde_json = "1" +stella-anonymize-adapter-contract = { path = "../anonymize-adapter-contract" } stella-anonymize-core = { path = "../anonymize-core" } [build-dependencies] diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index e1b1c230..a2c86487 100644 --- a/crates/anonymize-napi/src/lib.rs +++ b/crates/anonymize-napi/src/lib.rs @@ -2,13 +2,15 @@ use std::collections::BTreeMap; use napi::bindgen_prelude::*; use napi_derive::napi; -use stella_anonymize_core::{ - CountryMatchData, DetectionSource, FuzzySearchOptions, GazetteerMatchData, - LiteralSearchOptions, OperatorConfig, OperatorType, PatternSlice, - PreparedSearch, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, - RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, - StaticRedactionResult, +use stella_anonymize_adapter_contract::{ + BindingCountryMatchData, BindingGazetteerMatchData, BindingOperatorConfig, + BindingOperatorEntry, BindingPatternSlice, BindingPreparedSearchConfig, + 
BindingPreparedSearchSlices, BindingRedactionResult, BindingRegexMatchMeta, + BindingSearchOptions, BindingSearchPattern, BindingStaticRedactionResult, + ContractError, operator_config_from_binding, + prepared_search_config_from_binding, static_redaction_result_to_binding, }; +use stella_anonymize_core::PreparedSearch; #[napi(object)] pub struct JsSearchPattern { @@ -131,6 +133,38 @@ pub fn normalize_for_search(text: String) -> String { stella_anonymize_core::normalize_for_search(&text) } +#[napi] +#[allow(clippy::needless_pass_by_value)] +pub fn redact_static_entities_json( + config_json: String, + full_text: String, + operators_json: Option, +) -> Result { + let config = + serde_json::from_str::(&config_json) + .map_err(|error| to_napi_serde_error(&error))?; + let operators = operators_json + .as_deref() + .map(serde_json::from_str::) + .transpose() + .map_err(|error| to_napi_serde_error(&error))?; + let prepared = PreparedSearch::new( + prepared_search_config_from_binding(config) + .map_err(|error| to_napi_contract_error(&error))?, + ) + .map_err(|error| to_napi_core_error(&error))?; + let result = prepared + .redact_static_entities( + &full_text, + &operator_config_from_binding(operators) + .map_err(|error| to_napi_contract_error(&error))?, + ) + .map(static_redaction_result_to_binding) + .map_err(|error| to_napi_core_error(&error))?; + + serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) +} + #[napi] pub struct NativePreparedSearch { inner: PreparedSearch, @@ -140,9 +174,11 @@ pub struct NativePreparedSearch { impl NativePreparedSearch { #[napi(constructor)] pub fn new(config: JsPreparedSearchConfig) -> Result { - PreparedSearch::new(to_prepared_search_config(config)?) 
+ let config = prepared_search_config_from_binding(to_binding_config(config)) + .map_err(|error| to_napi_contract_error(&error))?; + PreparedSearch::new(config) .map(|inner| Self { inner }) - .map_err(|error| to_napi_error(&error)) + .map_err(|error| to_napi_core_error(&error)) } #[napi] @@ -152,168 +188,121 @@ impl NativePreparedSearch { full_text: String, operators: Option, ) -> Result { - let operator_config = to_operator_config(operators)?; + let operators = + operator_config_from_binding(operators.map(to_binding_operator_config)) + .map_err(|error| to_napi_contract_error(&error))?; self .inner - .redact_static_entities(&full_text, &operator_config) - .map(to_static_redaction_result) - .map_err(|error| to_napi_error(&error)) + .redact_static_entities(&full_text, &operators) + .map(static_redaction_result_to_binding) + .map(to_js_static_redaction_result) + .map_err(|error| to_napi_core_error(&error))? } } -fn to_prepared_search_config( +fn to_binding_config( config: JsPreparedSearchConfig, -) -> Result { - Ok(PreparedSearchConfig { - regex_patterns: to_search_patterns(config.regex_patterns)?, - custom_regex_patterns: to_search_patterns(config.custom_regex_patterns)?, - literal_patterns: to_search_patterns(config.literal_patterns)?, - regex_options: to_search_options(config.regex_options), - custom_regex_options: to_search_options(config.custom_regex_options), - literal_options: to_search_options(config.literal_options), - slices: to_slices(config.slices), - regex_meta: to_regex_meta(config.regex_meta), - custom_regex_meta: to_regex_meta(config.custom_regex_meta), - gazetteer_data: config.gazetteer_data.map(|data| GazetteerMatchData { - labels: data.labels, - is_fuzzy: data.is_fuzzy, +) -> BindingPreparedSearchConfig { + BindingPreparedSearchConfig { + regex_patterns: to_binding_patterns(config.regex_patterns), + custom_regex_patterns: to_binding_patterns(config.custom_regex_patterns), + literal_patterns: to_binding_patterns(config.literal_patterns), + 
regex_options: config.regex_options.as_ref().map(to_binding_options), + custom_regex_options: config + .custom_regex_options + .as_ref() + .map(to_binding_options), + literal_options: config.literal_options.as_ref().map(to_binding_options), + slices: to_binding_slices(&config.slices), + regex_meta: to_binding_regex_meta(config.regex_meta), + custom_regex_meta: to_binding_regex_meta(config.custom_regex_meta), + gazetteer_data: config.gazetteer_data.map(|data| { + BindingGazetteerMatchData { + labels: data.labels, + is_fuzzy: data.is_fuzzy, + } }), - country_data: config.country_data.map(|data| CountryMatchData { + country_data: config.country_data.map(|data| BindingCountryMatchData { labels: data.labels, }), - }) + } } -fn to_search_patterns( +fn to_binding_patterns( patterns: Vec, -) -> Result> { +) -> Vec { patterns .into_iter() - .map(|pattern| match pattern.kind.as_str() { - "literal" => Ok(SearchPattern::Literal(pattern.pattern)), - "literal-with-options" => Ok(SearchPattern::LiteralWithOptions { - pattern: pattern.pattern, - case_insensitive: pattern.case_insensitive, - whole_words: pattern.whole_words, - }), - "regex" => Ok(SearchPattern::Regex(pattern.pattern)), - "fuzzy" => Ok(SearchPattern::Fuzzy { - pattern: pattern.pattern, - distance: pattern - .distance - .map(|distance| { - u8::try_from(distance).map_err(|_| { - Error::from_reason(format!( - "Fuzzy distance exceeds u8 range: {distance}" - )) - }) - }) - .transpose()?, - }), - _ => Err(Error::from_reason(format!( - "Unsupported search pattern kind: {}", - pattern.kind - ))), + .map(|pattern| BindingSearchPattern { + kind: pattern.kind, + pattern: pattern.pattern, + distance: pattern.distance, + case_insensitive: pattern.case_insensitive, + whole_words: pattern.whole_words, }) .collect() } -fn to_search_options(options: Option) -> SearchOptions { - let Some(options) = options else { - return SearchOptions::default(); - }; - - SearchOptions { - literal: LiteralSearchOptions { - case_insensitive: 
options.literal_case_insensitive.unwrap_or(false), - whole_words: options.literal_whole_words.unwrap_or(false), - }, - regex: RegexSearchOptions { - whole_words: options.regex_whole_words.unwrap_or(false), - }, - fuzzy: FuzzySearchOptions { - case_insensitive: options.fuzzy_case_insensitive.unwrap_or(false), - whole_words: options.fuzzy_whole_words.unwrap_or(true), - normalize_diacritics: options.fuzzy_normalize_diacritics.unwrap_or(false), - }, +const fn to_binding_options(options: &JsSearchOptions) -> BindingSearchOptions { + BindingSearchOptions { + literal_case_insensitive: options.literal_case_insensitive, + literal_whole_words: options.literal_whole_words, + regex_whole_words: options.regex_whole_words, + fuzzy_case_insensitive: options.fuzzy_case_insensitive, + fuzzy_whole_words: options.fuzzy_whole_words, + fuzzy_normalize_diacritics: options.fuzzy_normalize_diacritics, } } -fn to_slices(slices: JsPreparedSearchSlices) -> PreparedSearchSlices { - PreparedSearchSlices { - regex: to_slice(slices.regex), - custom_regex: to_slice(slices.custom_regex), - legal_forms: to_slice(slices.legal_forms), - triggers: to_slice(slices.triggers), - deny_list: to_slice(slices.deny_list), - street_types: to_slice(slices.street_types), - gazetteer: to_slice(slices.gazetteer), - countries: to_slice(slices.countries), +fn to_binding_slices( + slices: &JsPreparedSearchSlices, +) -> BindingPreparedSearchSlices { + BindingPreparedSearchSlices { + regex: slices.regex.as_ref().map(to_binding_slice), + custom_regex: slices.custom_regex.as_ref().map(to_binding_slice), + legal_forms: slices.legal_forms.as_ref().map(to_binding_slice), + triggers: slices.triggers.as_ref().map(to_binding_slice), + deny_list: slices.deny_list.as_ref().map(to_binding_slice), + street_types: slices.street_types.as_ref().map(to_binding_slice), + gazetteer: slices.gazetteer.as_ref().map(to_binding_slice), + countries: slices.countries.as_ref().map(to_binding_slice), } } -fn to_slice(slice: Option) -> 
PatternSlice { - slice.map_or_else(PatternSlice::default, |slice| PatternSlice { +const fn to_binding_slice(slice: &JsPatternSlice) -> BindingPatternSlice { + BindingPatternSlice { start: slice.start, end: slice.end, - }) + } } -fn to_regex_meta(meta: Vec) -> Vec { +fn to_binding_regex_meta( + meta: Vec, +) -> Vec { meta .into_iter() - .map(|entry| RegexMatchMeta { + .map(|entry| BindingRegexMatchMeta { label: entry.label, score: entry.score, - source_detail: entry.source_detail.as_deref().and_then(to_source_detail), - requires_validation: entry.requires_validation.unwrap_or(false), + source_detail: entry.source_detail, + requires_validation: entry.requires_validation, }) .collect() } -fn to_source_detail(value: &str) -> Option { - match value { - "custom-deny-list" => Some(SourceDetail::CustomDenyList), - "custom-regex" => Some(SourceDetail::CustomRegex), - "gazetteer-extension" => Some(SourceDetail::GazetteerExtension), - _ => None, - } -} - -fn to_operator_config( - config: Option, -) -> Result { - let Some(config) = config else { - return Ok(OperatorConfig::default()); - }; - - let mut operators = BTreeMap::new(); - for (label, value) in config.operators.unwrap_or_default() { - operators.insert(label, to_operator_type(&value)?); - } - - Ok(OperatorConfig { - operators, - redact_string: config - .redact_string - .unwrap_or_else(|| String::from("[REDACTED]")), - }) -} - -fn to_operator_type(value: &str) -> Result { - match value { - "replace" => Ok(OperatorType::Replace), - "redact" => Ok(OperatorType::Redact), - _ => Err(Error::from_reason(format!( - "Unsupported anonymization operator: {value}" - ))), +fn to_binding_operator_config( + config: JsOperatorConfig, +) -> BindingOperatorConfig { + BindingOperatorConfig { + operators: config.operators, + redact_string: config.redact_string, } } -fn to_static_redaction_result( - result: StaticRedactionResult, -) -> JsStaticRedactionResult { - JsStaticRedactionResult { +fn to_js_static_redaction_result( + result: 
BindingStaticRedactionResult, +) -> Result { + Ok(JsStaticRedactionResult { resolved_entities: result .resolved_entities .into_iter() @@ -323,67 +312,57 @@ fn to_static_redaction_result( label: entity.label, text: entity.text, score: entity.score, - source: detection_source_name(entity.source), - source_detail: entity.source_detail.map(source_detail_name), + source: entity.source, + source_detail: entity.source_detail, }) .collect(), - redaction: JsRedactionResult { - redacted_text: result.redaction.redacted_text, - redaction_map: result - .redaction - .redaction_map - .into_iter() - .map(|entry| JsRedactionEntry { - placeholder: entry.placeholder, - original: entry.original, - }) - .collect(), - operator_map: result - .redaction - .operator_map - .into_iter() - .map(|entry| JsOperatorEntry { - placeholder: entry.placeholder, - operator: operator_name(entry.operator), - }) - .collect(), - entity_count: u32::try_from(result.redaction.entity_count) - .unwrap_or(u32::MAX), - }, - } + redaction: to_js_redaction_result(result.redaction)?, + }) } -fn detection_source_name(source: DetectionSource) -> String { - match source { - DetectionSource::Trigger => "trigger", - DetectionSource::Regex => "regex", - DetectionSource::DenyList => "deny-list", - DetectionSource::LegalForm => "legal-form", - DetectionSource::Gazetteer => "gazetteer", - DetectionSource::Country => "country", - DetectionSource::Ner => "ner", - DetectionSource::Coreference => "coreference", - } - .to_owned() +fn to_js_redaction_result( + result: BindingRedactionResult, +) -> Result { + Ok(JsRedactionResult { + redacted_text: result.redacted_text, + redaction_map: result + .redaction_map + .into_iter() + .map(|entry| JsRedactionEntry { + placeholder: entry.placeholder, + original: entry.original, + }) + .collect(), + operator_map: to_js_operator_entries(result.operator_map), + entity_count: u32::try_from(result.entity_count).map_err(|_| { + Error::from_reason(format!( + "Entity count exceeds u32 range: {}", 
+ result.entity_count + )) + })?, + }) } -fn source_detail_name(detail: SourceDetail) -> String { - match detail { - SourceDetail::CustomDenyList => "custom-deny-list", - SourceDetail::CustomRegex => "custom-regex", - SourceDetail::GazetteerExtension => "gazetteer-extension", - } - .to_owned() +fn to_js_operator_entries( + entries: Vec, +) -> Vec { + entries + .into_iter() + .map(|entry| JsOperatorEntry { + placeholder: entry.placeholder, + operator: entry.operator, + }) + .collect() } -fn operator_name(operator: OperatorType) -> String { - match operator { - OperatorType::Replace => "replace", - OperatorType::Redact => "redact", - } - .to_owned() +fn to_napi_core_error(error: &stella_anonymize_core::Error) -> Error { + Error::from_reason(error.to_string()) +} + +fn to_napi_contract_error(error: &ContractError) -> Error { + Error::from_reason(error.to_string()) } -fn to_napi_error(error: &stella_anonymize_core::Error) -> Error { +fn to_napi_serde_error(error: &serde_json::Error) -> Error { Error::from_reason(error.to_string()) } diff --git a/crates/anonymize-py/Cargo.toml b/crates/anonymize-py/Cargo.toml index 1dd9bc66..ece0b065 100644 --- a/crates/anonymize-py/Cargo.toml +++ b/crates/anonymize-py/Cargo.toml @@ -13,9 +13,12 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.29", features = ["extension-module", "abi3-py311"] } -serde = { version = "1", features = ["derive"] } serde_json = "1" +stella-anonymize-adapter-contract = { path = "../anonymize-adapter-contract" } stella-anonymize-core = { path = "../anonymize-core" } +[build-dependencies] +pyo3-build-config = { version = "0.29", features = ["extension-module"] } + [lints] workspace = true diff --git a/crates/anonymize-py/build.rs b/crates/anonymize-py/build.rs new file mode 100644 index 00000000..a781ce15 --- /dev/null +++ b/crates/anonymize-py/build.rs @@ -0,0 +1,3 @@ +fn main() { + pyo3_build_config::add_extension_module_link_args(); +} diff --git a/crates/anonymize-py/pyproject.toml 
b/crates/anonymize-py/pyproject.toml new file mode 100644 index 00000000..d8962e21 --- /dev/null +++ b/crates/anonymize-py/pyproject.toml @@ -0,0 +1,20 @@ +[build-system] +requires = ["maturin>=1.14,<2"] +build-backend = "maturin" + +[project] +name = "stella-anonymize-core" +dynamic = ["version"] +description = "Python bindings for Stella anonymization core" +readme = "../../README.md" +requires-python = ">=3.11" +license = "MIT" +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Rust", + "Typing :: Typed", +] + +[tool.maturin] +manifest-path = "Cargo.toml" +module-name = "stella_anonymize_core_py" diff --git a/crates/anonymize-py/src/lib.rs b/crates/anonymize-py/src/lib.rs index 1d195504..abd6a00d 100644 --- a/crates/anonymize-py/src/lib.rs +++ b/crates/anonymize-py/src/lib.rs @@ -1,92 +1,12 @@ -use std::collections::BTreeMap; - use pyo3::exceptions::PyValueError; use pyo3::prelude::*; -use serde::Deserialize; -use stella_anonymize_core::{ - CountryMatchData, DetectionSource, FuzzySearchOptions, GazetteerMatchData, - LiteralSearchOptions, OperatorConfig, OperatorType, PatternSlice, - PreparedSearch as CorePreparedSearch, PreparedSearchConfig, - PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchOptions, - SearchPattern, SourceDetail, StaticRedactionResult, +use stella_anonymize_adapter_contract::{ + BindingOperatorConfig, BindingOperatorEntry, BindingPipelineEntity, + BindingPreparedSearchConfig, BindingRedactionEntry, BindingRedactionResult, + BindingStaticRedactionResult, ContractError, operator_config_from_binding, + prepared_search_config_from_binding, static_redaction_result_to_binding, }; - -#[derive(Deserialize)] -struct SearchPatternDto { - kind: String, - pattern: String, - distance: Option, - case_insensitive: Option, - whole_words: Option, -} - -#[derive(Deserialize)] -struct SearchOptionsDto { - literal_case_insensitive: Option, - literal_whole_words: Option, - regex_whole_words: Option, - 
fuzzy_case_insensitive: Option, - fuzzy_whole_words: Option, - fuzzy_normalize_diacritics: Option, -} - -#[derive(Deserialize)] -struct PatternSliceDto { - start: u32, - end: u32, -} - -#[derive(Deserialize)] -struct PreparedSearchSlicesDto { - regex: Option, - custom_regex: Option, - legal_forms: Option, - triggers: Option, - deny_list: Option, - street_types: Option, - gazetteer: Option, - countries: Option, -} - -#[derive(Deserialize)] -struct RegexMatchMetaDto { - label: String, - score: f64, - source_detail: Option, - requires_validation: Option, -} - -#[derive(Deserialize)] -struct GazetteerMatchDataDto { - labels: Vec, - is_fuzzy: Vec, -} - -#[derive(Deserialize)] -struct CountryMatchDataDto { - labels: Vec, -} - -#[derive(Deserialize)] -struct PreparedSearchConfigDto { - regex_patterns: Vec, - custom_regex_patterns: Vec, - literal_patterns: Vec, - regex_options: Option, - custom_regex_options: Option, - literal_options: Option, - slices: PreparedSearchSlicesDto, - regex_meta: Vec, - custom_regex_meta: Vec, - gazetteer_data: Option, - country_data: Option, -} - -#[derive(Default, Deserialize)] -struct OperatorConfigDto { - operators: Option>, - redact_string: Option, -} +use stella_anonymize_core::PreparedSearch as CorePreparedSearch; #[pyclass(name = "RedactionEntry", get_all, skip_from_py_object)] #[derive(Clone)] @@ -139,10 +59,12 @@ pub struct PyPreparedSearch { impl PyPreparedSearch { #[new] fn new(config_json: &str) -> PyResult { - let config: PreparedSearchConfigDto = serde_json::from_str(config_json) - .map_err(|error| to_py_value_error(&error))?; - let inner = CorePreparedSearch::new(to_prepared_search_config(config)?) 
- .map_err(|error| to_py_core_error(&error))?; + let config = parse_prepared_search_config(config_json)?; + let inner = CorePreparedSearch::new( + prepared_search_config_from_binding(config) + .map_err(|error| to_py_contract_error(&error))?, + ) + .map_err(|error| to_py_core_error(&error))?; Ok(Self { inner }) } @@ -151,13 +73,38 @@ impl PyPreparedSearch { full_text: &str, operators_json: Option<&str>, ) -> PyResult { - let operators = to_operator_config(operators_json)?; + let operators = parse_operator_config(operators_json)?; self .inner - .redact_static_entities(full_text, &operators) - .map(to_static_redaction_result) + .redact_static_entities( + full_text, + &operator_config_from_binding(operators) + .map_err(|error| to_py_contract_error(&error))?, + ) + .map(static_redaction_result_to_binding) + .map(to_py_static_redaction_result) .map_err(|error| to_py_core_error(&error)) } + + fn redact_static_entities_json( + &self, + full_text: &str, + operators_json: Option<&str>, + ) -> PyResult { + let result = self.redact_static_entities(full_text, operators_json)?; + serde_json::to_string(&to_binding_static_redaction_result(result)) + .map_err(|error| to_py_serde_error(&error)) + } +} + +#[pyfunction] +fn redact_static_entities_json( + config_json: &str, + full_text: &str, + operators_json: Option<&str>, +) -> PyResult { + let prepared = PyPreparedSearch::new(config_json)?; + prepared.redact_static_entities_json(full_text, operators_json) } #[pyfunction] @@ -165,239 +112,152 @@ fn normalize_for_search(text: &str) -> String { stella_anonymize_core::normalize_for_search(text) } -fn to_prepared_search_config( - config: PreparedSearchConfigDto, -) -> PyResult { - Ok(PreparedSearchConfig { - regex_patterns: to_search_patterns(config.regex_patterns)?, - custom_regex_patterns: to_search_patterns(config.custom_regex_patterns)?, - literal_patterns: to_search_patterns(config.literal_patterns)?, - regex_options: to_search_options(config.regex_options), - custom_regex_options: 
to_search_options(config.custom_regex_options), - literal_options: to_search_options(config.literal_options), - slices: to_slices(config.slices), - regex_meta: to_regex_meta(config.regex_meta), - custom_regex_meta: to_regex_meta(config.custom_regex_meta), - gazetteer_data: config.gazetteer_data.map(|data| GazetteerMatchData { - labels: data.labels, - is_fuzzy: data.is_fuzzy, - }), - country_data: config.country_data.map(|data| CountryMatchData { - labels: data.labels, - }), - }) +fn parse_prepared_search_config( + config_json: &str, +) -> PyResult { + serde_json::from_str(config_json).map_err(|error| to_py_serde_error(&error)) } -fn to_search_patterns( - patterns: Vec, -) -> PyResult> { - patterns - .into_iter() - .map(|pattern| match pattern.kind.as_str() { - "literal" => Ok(SearchPattern::Literal(pattern.pattern)), - "literal-with-options" => Ok(SearchPattern::LiteralWithOptions { - pattern: pattern.pattern, - case_insensitive: pattern.case_insensitive, - whole_words: pattern.whole_words, - }), - "regex" => Ok(SearchPattern::Regex(pattern.pattern)), - "fuzzy" => Ok(SearchPattern::Fuzzy { - pattern: pattern.pattern, - distance: pattern - .distance - .map(|distance| { - u8::try_from(distance).map_err(|_| { - PyValueError::new_err(format!( - "Fuzzy distance exceeds u8 range: {distance}" - )) - }) - }) - .transpose()?, - }), - _ => Err(PyValueError::new_err(format!( - "Unsupported search pattern kind: {}", - pattern.kind - ))), - }) - .collect() +fn parse_operator_config( + operators_json: Option<&str>, +) -> PyResult> { + operators_json + .map(serde_json::from_str::) + .transpose() + .map_err(|error| to_py_serde_error(&error)) } -fn to_search_options(options: Option) -> SearchOptions { - let Some(options) = options else { - return SearchOptions::default(); - }; - - SearchOptions { - literal: LiteralSearchOptions { - case_insensitive: options.literal_case_insensitive.unwrap_or(false), - whole_words: options.literal_whole_words.unwrap_or(false), - }, - regex: 
RegexSearchOptions { - whole_words: options.regex_whole_words.unwrap_or(false), - }, - fuzzy: FuzzySearchOptions { - case_insensitive: options.fuzzy_case_insensitive.unwrap_or(false), - whole_words: options.fuzzy_whole_words.unwrap_or(true), - normalize_diacritics: options.fuzzy_normalize_diacritics.unwrap_or(false), - }, +fn to_py_static_redaction_result( + result: BindingStaticRedactionResult, +) -> PyStaticRedactionResult { + PyStaticRedactionResult { + resolved_entities: result + .resolved_entities + .into_iter() + .map(to_py_pipeline_entity) + .collect(), + redaction: to_py_redaction_result(result.redaction), } } -fn to_slices(slices: PreparedSearchSlicesDto) -> PreparedSearchSlices { - PreparedSearchSlices { - regex: to_slice(slices.regex), - custom_regex: to_slice(slices.custom_regex), - legal_forms: to_slice(slices.legal_forms), - triggers: to_slice(slices.triggers), - deny_list: to_slice(slices.deny_list), - street_types: to_slice(slices.street_types), - gazetteer: to_slice(slices.gazetteer), - countries: to_slice(slices.countries), +fn to_py_pipeline_entity(entity: BindingPipelineEntity) -> PyPipelineEntity { + PyPipelineEntity { + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: entity.source, + source_detail: entity.source_detail, } } -fn to_slice(slice: Option) -> PatternSlice { - slice.map_or_else(PatternSlice::default, |slice| PatternSlice { - start: slice.start, - end: slice.end, - }) -} - -fn to_regex_meta(meta: Vec) -> Vec { - meta - .into_iter() - .map(|entry| RegexMatchMeta { - label: entry.label, - score: entry.score, - source_detail: entry.source_detail.as_deref().and_then(to_source_detail), - requires_validation: entry.requires_validation.unwrap_or(false), - }) - .collect() -} - -fn to_source_detail(value: &str) -> Option { - match value { - "custom-deny-list" => Some(SourceDetail::CustomDenyList), - "custom-regex" => Some(SourceDetail::CustomRegex), - "gazetteer-extension" 
=> Some(SourceDetail::GazetteerExtension), - _ => None, +fn to_py_redaction_result(result: BindingRedactionResult) -> PyRedactionResult { + PyRedactionResult { + redacted_text: result.redacted_text, + redaction_map: result + .redaction_map + .into_iter() + .map(to_py_redaction_entry) + .collect(), + operator_map: result + .operator_map + .into_iter() + .map(to_py_operator_entry) + .collect(), + entity_count: result.entity_count, } } -fn to_operator_config( - operators_json: Option<&str>, -) -> PyResult { - let Some(operators_json) = operators_json else { - return Ok(OperatorConfig::default()); - }; - let config: OperatorConfigDto = serde_json::from_str(operators_json) - .map_err(|error| to_py_value_error(&error))?; - - let mut operators = BTreeMap::new(); - for (label, value) in config.operators.unwrap_or_default() { - operators.insert(label, to_operator_type(&value)?); +fn to_py_redaction_entry(entry: BindingRedactionEntry) -> PyRedactionEntry { + PyRedactionEntry { + placeholder: entry.placeholder, + original: entry.original, } - - Ok(OperatorConfig { - operators, - redact_string: config - .redact_string - .unwrap_or_else(|| String::from("[REDACTED]")), - }) } -fn to_operator_type(value: &str) -> PyResult { - match value { - "replace" => Ok(OperatorType::Replace), - "redact" => Ok(OperatorType::Redact), - _ => Err(PyValueError::new_err(format!( - "Unsupported anonymization operator: {value}" - ))), +fn to_py_operator_entry(entry: BindingOperatorEntry) -> PyOperatorEntry { + PyOperatorEntry { + placeholder: entry.placeholder, + operator: entry.operator, } } -fn to_static_redaction_result( - result: StaticRedactionResult, -) -> PyStaticRedactionResult { - PyStaticRedactionResult { +fn to_binding_static_redaction_result( + result: PyStaticRedactionResult, +) -> BindingStaticRedactionResult { + BindingStaticRedactionResult { resolved_entities: result .resolved_entities .into_iter() - .map(|entity| PyPipelineEntity { - start: entity.start, - end: entity.end, - label: 
entity.label, - text: entity.text, - score: entity.score, - source: detection_source_name(entity.source), - source_detail: entity.source_detail.map(source_detail_name), - }) + .map(to_binding_pipeline_entity) .collect(), - redaction: PyRedactionResult { - redacted_text: result.redaction.redacted_text, - redaction_map: result - .redaction - .redaction_map - .into_iter() - .map(|entry| PyRedactionEntry { - placeholder: entry.placeholder, - original: entry.original, - }) - .collect(), - operator_map: result - .redaction - .operator_map - .into_iter() - .map(|entry| PyOperatorEntry { - placeholder: entry.placeholder, - operator: operator_name(entry.operator), - }) - .collect(), - entity_count: result.redaction.entity_count, - }, + redaction: to_binding_redaction_result(result.redaction), + } +} + +fn to_binding_pipeline_entity( + entity: PyPipelineEntity, +) -> BindingPipelineEntity { + BindingPipelineEntity { + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: entity.source, + source_detail: entity.source_detail, } } -fn detection_source_name(source: DetectionSource) -> String { - match source { - DetectionSource::Trigger => "trigger", - DetectionSource::Regex => "regex", - DetectionSource::DenyList => "deny-list", - DetectionSource::LegalForm => "legal-form", - DetectionSource::Gazetteer => "gazetteer", - DetectionSource::Country => "country", - DetectionSource::Ner => "ner", - DetectionSource::Coreference => "coreference", +fn to_binding_redaction_result( + result: PyRedactionResult, +) -> BindingRedactionResult { + BindingRedactionResult { + redacted_text: result.redacted_text, + redaction_map: result + .redaction_map + .into_iter() + .map(to_binding_redaction_entry) + .collect(), + operator_map: result + .operator_map + .into_iter() + .map(to_binding_operator_entry) + .collect(), + entity_count: result.entity_count, } - .to_owned() } -fn source_detail_name(detail: SourceDetail) -> String { - match 
detail { - SourceDetail::CustomDenyList => "custom-deny-list", - SourceDetail::CustomRegex => "custom-regex", - SourceDetail::GazetteerExtension => "gazetteer-extension", +fn to_binding_redaction_entry( + entry: PyRedactionEntry, +) -> BindingRedactionEntry { + BindingRedactionEntry { + placeholder: entry.placeholder, + original: entry.original, } - .to_owned() } -fn operator_name(operator: OperatorType) -> String { - match operator { - OperatorType::Replace => "replace", - OperatorType::Redact => "redact", +fn to_binding_operator_entry(entry: PyOperatorEntry) -> BindingOperatorEntry { + BindingOperatorEntry { + placeholder: entry.placeholder, + operator: entry.operator, } - .to_owned() } fn to_py_core_error(error: &stella_anonymize_core::Error) -> PyErr { PyValueError::new_err(error.to_string()) } -fn to_py_value_error(error: &serde_json::Error) -> PyErr { +fn to_py_contract_error(error: &ContractError) -> PyErr { + PyValueError::new_err(error.to_string()) +} + +fn to_py_serde_error(error: &serde_json::Error) -> PyErr { PyValueError::new_err(error.to_string()) } -#[pymodule] +#[pymodule(gil_used = false)] fn stella_anonymize_core_py(module: &Bound<'_, PyModule>) -> PyResult<()> { module.add_class::()?; module.add_class::()?; @@ -405,6 +265,8 @@ fn stella_anonymize_core_py(module: &Bound<'_, PyModule>) -> PyResult<()> { module.add_class::()?; module.add_class::()?; module.add_class::()?; + module + .add_function(wrap_pyfunction!(redact_static_entities_json, module)?)?; module.add_function(wrap_pyfunction!(normalize_for_search, module)?)?; Ok(()) } diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index 3e90db75..b23fbb7f 100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -34,6 +34,7 @@ "test": "bun test --preload ./src/__test__/setup.ts --timeout 15000", "test:fast": "bun run test src/__test__/*.test.ts", "perf:contracts": "bun scripts/contract-perf.mjs", + "perf:native-adapters": "bun 
scripts/native-adapter-perf.mjs", "smoke:dist": "bun scripts/dist-smoke.mjs", "format": "oxfmt ." }, diff --git a/packages/anonymize/scripts/native-adapter-perf.mjs b/packages/anonymize/scripts/native-adapter-perf.mjs new file mode 100644 index 00000000..bdb41d8f --- /dev/null +++ b/packages/anonymize/scripts/native-adapter-perf.mjs @@ -0,0 +1,304 @@ +import { spawnSync } from "node:child_process"; +import { copyFileSync, mkdtempSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { createRequire } from "node:module"; + +const ROOT_DIR = join(import.meta.dir, "..", "..", ".."); +const ITERATIONS = Number(process.env.ANONYMIZE_NATIVE_PERF_ITERATIONS ?? 100); + +const configJson = JSON.stringify({ + regex_patterns: [{ kind: "regex", pattern: "\\b[A-Z]{2}\\d{4}\\b" }], + custom_regex_patterns: [{ kind: "regex", pattern: "\\bMAT-\\d{3}\\b" }], + literal_patterns: [ + { + kind: "literal-with-options", + pattern: "Acme", + case_insensitive: true, + whole_words: false, + }, + { kind: "fuzzy", pattern: "Fuzztown", distance: 1 }, + { + kind: "literal-with-options", + pattern: "Turkey", + case_insensitive: true, + whole_words: true, + }, + ], + regex_options: { regex_whole_words: false }, + custom_regex_options: { regex_whole_words: false }, + literal_options: { + literal_case_insensitive: true, + literal_whole_words: false, + fuzzy_case_insensitive: true, + fuzzy_whole_words: true, + fuzzy_normalize_diacritics: true, + }, + slices: { + regex: { start: 0, end: 1 }, + custom_regex: { start: 0, end: 1 }, + gazetteer: { start: 0, end: 2 }, + countries: { start: 2, end: 3 }, + }, + regex_meta: [{ label: "registration number", score: 0.9 }], + custom_regex_meta: [ + { label: "matter id", score: 1, source_detail: "custom-regex" }, + ], + gazetteer_data: { + labels: ["organization", "address"], + is_fuzzy: [false, true], + }, + country_data: { labels: ["country"] }, +}); + +const pythonScript = ` +import importlib.util +import json 
+import os +import pathlib +import time + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +spec = importlib.util.spec_from_file_location( + "stella_anonymize_core_py", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(os.environ["STELLA_ANONYMIZE_PERF_PAYLOAD"]) +prepare_start = time.perf_counter_ns() +prepared = module.PreparedSearch(payload["config_json"]) +prepare_ms = (time.perf_counter_ns() - prepare_start) / 1_000_000 +start = time.perf_counter_ns() +for _ in range(payload["iterations"]): + for item in payload["cases"]: + prepared.redact_static_entities( + item["text"], + item.get("operators_json"), + ) +elapsed_ms = (time.perf_counter_ns() - start) / 1_000_000 +print(json.dumps({"prepareMs": prepare_ms, "runMs": elapsed_ms})) +`; + +runCommand("cargo", [ + "build", + "-p", + "stella-anonymize-napi", + "-p", + "stella-anonymize-py", + "--release", + "--locked", +]); + +const tempDir = mkdtempSync(join(tmpdir(), "stella-anonymize-perf-")); +const napiPath = join(tempDir, "stella_anonymize_napi.node"); +const pythonModulePath = join(tempDir, "stella_anonymize_core_py.so"); +copyFileSync(nativeLibraryPath("stella_anonymize_napi"), napiPath); +copyFileSync(nativeLibraryPath("stella_anonymize_core_py"), pythonModulePath); + +const native = createRequire(import.meta.url)(napiPath); +const cases = buildCases(); +const payload = { + config_json: configJson, + iterations: ITERATIONS, + cases: cases.map(({ text, operatorsJson }) => ({ + text, + operators_json: operatorsJson, + })), +}; + +const rustOutput = runCommand( + "cargo", + [ + "run", + "-p", + "stella-anonymize-adapter-contract", + "--example", + "native_adapter_perf", + "--release", + "--locked", + ], + { + STELLA_ANONYMIZE_PERF_PAYLOAD: JSON.stringify(payload), + }, +); +const rustSummary = JSON.parse(rustOutput); +printSummary("rust-core", rustSummary, cases.length, ITERATIONS); + +const tsPrepareStart = 
Bun.nanoseconds(); +const prepared = new native.NativePreparedSearch( + toNapiConfig(JSON.parse(configJson)), +); +const tsPrepareMs = elapsedMs(tsPrepareStart); +const tsStart = Bun.nanoseconds(); +for (let iteration = 0; iteration < ITERATIONS; iteration += 1) { + for (const item of cases) { + prepared.redactStaticEntities( + item.text, + item.operatorsJson === undefined + ? undefined + : JSON.parse(item.operatorsJson), + ); + } +} +const tsRunMs = elapsedMs(tsStart); +printSummary( + "ts-napi", + { prepareMs: tsPrepareMs, runMs: tsRunMs }, + cases.length, + ITERATIONS, +); + +const pyOutput = runCommand("python3", ["-c", pythonScript], { + STELLA_ANONYMIZE_PERF_PAYLOAD: JSON.stringify(payload), + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, +}); +const pySummary = JSON.parse(pyOutput); +printSummary("python-pyo3", pySummary, cases.length, ITERATIONS); + +function buildCases() { + const places = ["Fuzztovn", "Fuzztawn", "Fuzztowm"]; + const operators = [ + undefined, + JSON.stringify({ operators: { country: "redact" } }), + JSON.stringify({ operators: { address: "redact", country: "redact" } }), + ]; + const fixtureCases = []; + + for (let index = 0; index < 24; index += 1) { + const registration = `AB${String(index).padStart(4, "0")}`; + const matter = `MAT-${String(index % 1_000).padStart(3, "0")}`; + const place = places[index % places.length]; + fixtureCases.push({ + text: + `Reference ${registration} for Acme s.r.o. 
near ` + + `${place}, Turkey, matter ${matter}.`, + operatorsJson: operators[index % operators.length], + }); + } + + return fixtureCases; +} + +function toNapiConfig(config) { + return { + regexPatterns: config.regex_patterns.map(toNapiPattern), + customRegexPatterns: config.custom_regex_patterns.map(toNapiPattern), + literalPatterns: config.literal_patterns.map(toNapiPattern), + regexOptions: toNapiOptions(config.regex_options), + customRegexOptions: toNapiOptions(config.custom_regex_options), + literalOptions: toNapiOptions(config.literal_options), + slices: { + regex: config.slices.regex, + customRegex: config.slices.custom_regex, + legalForms: config.slices.legal_forms, + triggers: config.slices.triggers, + denyList: config.slices.deny_list, + streetTypes: config.slices.street_types, + gazetteer: config.slices.gazetteer, + countries: config.slices.countries, + }, + regexMeta: config.regex_meta.map(toNapiRegexMeta), + customRegexMeta: config.custom_regex_meta.map(toNapiRegexMeta), + gazetteerData: + config.gazetteer_data === undefined + ? 
undefined + : { + labels: config.gazetteer_data.labels, + isFuzzy: config.gazetteer_data.is_fuzzy, + }, + countryData: config.country_data, + }; +} + +function toNapiPattern(pattern) { + return { + kind: pattern.kind, + pattern: pattern.pattern, + distance: pattern.distance, + caseInsensitive: pattern.case_insensitive, + wholeWords: pattern.whole_words, + }; +} + +function toNapiOptions(options) { + if (options === undefined) { + return undefined; + } + return { + literalCaseInsensitive: options.literal_case_insensitive, + literalWholeWords: options.literal_whole_words, + regexWholeWords: options.regex_whole_words, + fuzzyCaseInsensitive: options.fuzzy_case_insensitive, + fuzzyWholeWords: options.fuzzy_whole_words, + fuzzyNormalizeDiacritics: options.fuzzy_normalize_diacritics, + }; +} + +function toNapiRegexMeta(meta) { + return { + label: meta.label, + score: meta.score, + sourceDetail: meta.source_detail, + requiresValidation: meta.requires_validation, + }; +} + +function nativeLibraryPath(name) { + if (process.platform === "darwin") { + return join(ROOT_DIR, "target", "release", `lib${name}.dylib`); + } + if (process.platform === "linux") { + return join(ROOT_DIR, "target", "release", `lib${name}.so`); + } + return join(ROOT_DIR, "target", "release", `${name}.dll`); +} + +function printSummary(adapter, summary, fixtureCount, iterations) { + const calls = fixtureCount * iterations; + const runMs = Number(summary.runMs); + const prepareMs = Number(summary.prepareMs); + console.log( + JSON.stringify({ + event: "native-adapter-perf", + adapter, + fixtureCount, + iterations, + calls, + prepareMs: roundMs(prepareMs), + runMs: roundMs(runMs), + totalMs: roundMs(prepareMs + runMs), + avgCallMs: roundMs(runMs / calls), + }), + ); +} + +function elapsedMs(start) { + return (Bun.nanoseconds() - start) / 1_000_000; +} + +function roundMs(ms) { + return Math.round(ms * 1_000) / 1_000; +} + +function runCommand(command, args, env = {}) { + const result = spawnSync(command, 
args, { + cwd: ROOT_DIR, + encoding: "utf8", + env: { ...process.env, ...env }, + }); + + if (result.status === 0) { + return result.stdout; + } + + throw new Error( + [ + `${command} ${args.join(" ")} failed with status ${result.status}`, + result.stdout, + result.stderr, + ] + .filter(Boolean) + .join("\n"), + ); +} diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts new file mode 100644 index 00000000..880cd192 --- /dev/null +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -0,0 +1,418 @@ +import { spawnSync } from "node:child_process"; +import { copyFileSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { createRequire } from "node:module"; +import { describe, expect, setDefaultTimeout, test } from "bun:test"; +import fc from "fast-check"; + +setDefaultTimeout(120_000); + +type NativeAdapter = { + normalizeForSearch: (text: string) => string; + redactStaticEntitiesJson: ( + configJson: string, + fullText: string, + operatorsJson?: string, + ) => string; +}; + +type RedactionEntry = { + placeholder: string; + original: string; +}; + +type StaticRedactionResult = { + resolved_entities: Array<{ + start: number; + end: number; + label: string; + text: string; + score: number; + source: string; + source_detail?: string | null; + }>; + redaction: { + redacted_text: string; + redaction_map: RedactionEntry[]; + operator_map: Array<{ + placeholder: string; + operator: string; + }>; + entity_count: number; + }; +}; + +type GeneratedNativeCase = { + text: string; + operators: Record | null; + sensitiveValues: string[]; +}; + +const ROOT_DIR = join(import.meta.dir, "..", "..", "..", ".."); +const TARGET_DIR = join(ROOT_DIR, "target", "debug"); +const CONFIG_JSON = JSON.stringify({ + regex_patterns: [{ kind: "regex", pattern: "\\b[A-Z]{2}\\d{4}\\b" }], + custom_regex_patterns: [{ 
kind: "regex", pattern: "\\bMAT-\\d{3}\\b" }], + literal_patterns: [ + { + kind: "literal-with-options", + pattern: "Acme", + case_insensitive: true, + whole_words: false, + }, + { kind: "fuzzy", pattern: "Fuzztown", distance: 1 }, + { + kind: "literal-with-options", + pattern: "Turkey", + case_insensitive: true, + whole_words: true, + }, + ], + regex_options: { regex_whole_words: false }, + custom_regex_options: { regex_whole_words: false }, + literal_options: { + literal_case_insensitive: true, + literal_whole_words: false, + fuzzy_case_insensitive: true, + fuzzy_whole_words: true, + fuzzy_normalize_diacritics: true, + }, + slices: { + regex: { start: 0, end: 1 }, + custom_regex: { start: 0, end: 1 }, + gazetteer: { start: 0, end: 2 }, + countries: { start: 2, end: 3 }, + }, + regex_meta: [{ label: "registration number", score: 0.9 }], + custom_regex_meta: [ + { label: "matter id", score: 1, source_detail: "custom-regex" }, + ], + gazetteer_data: { + labels: ["organization", "address"], + is_fuzzy: [false, true], + }, + country_data: { labels: ["country"] }, +}); + +const PYTHON_ADAPTER_SCRIPT = ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +spec = importlib.util.spec_from_file_location( + "stella_anonymize_core_py", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +results = [ + json.loads( + module.redact_static_entities_json( + payload["config_json"], + item["text"], + item.get("operators_json"), + ) + ) + for item in payload["cases"] +] +print(json.dumps(results)) +`; + +let loadedAdapters: { + native: NativeAdapter; + pythonModulePath: string; + tempDir: string; +} | null = null; + +const gapArb = fc + .array( + fc.constantFrom( + " ", + "\t", + "\n", + ".", + ",", + ";", + ":", + "(", + ")", + "a", + 
"e", + "n", + "r", + "s", + "t", + "č", + "ř", + "á", + "ü", + ), + { maxLength: 12 }, + ) + .map((chars) => chars.join("")); + +const registrationArb = fc + .record({ + prefix: fc.tuple( + fc.constantFrom("A", "B", "C", "D", "E", "F"), + fc.constantFrom("G", "H", "I", "J", "K", "L"), + ), + serial: fc.integer({ min: 0, max: 9999 }), + }) + .map( + ({ prefix, serial }) => + `${prefix.join("")}${String(serial).padStart(4, "0")}`, + ); + +const matterArb = fc + .integer({ min: 0, max: 999 }) + .map((value) => `MAT-${String(value).padStart(3, "0")}`); + +const fuzzyPlaceArb = fc.constantFrom("Fuzztovn", "Fuzztawn", "Fuzztowm"); + +const operatorsArb = fc.option( + fc.constantFrom( + { country: "redact" }, + { address: "redact", country: "redact" }, + { "matter id": "redact" }, + ), + { nil: null }, +); + +const generatedCaseArb: fc.Arbitrary = fc + .record({ + left: gapArb, + middle: gapArb, + right: gapArb, + registration: registrationArb, + matter: matterArb, + fuzzyPlace: fuzzyPlaceArb, + operators: operatorsArb, + }) + .map( + ({ left, middle, right, registration, matter, fuzzyPlace, operators }) => { + const text = + `${left}Reference ${registration} for Acme s.r.o. 
near ` + + `${fuzzyPlace}, Turkey, matter ${matter}.${middle}${right}`; + return { + text, + operators, + sensitiveValues: [ + registration, + "Acme s.r.o.", + fuzzyPlace, + "Turkey", + matter, + ], + }; + }, + ); + +describe("native adapter parity", () => { + test("normalization is identical through TS and Python adapters", () => { + const adapters = getAdapters(); + const text = "Číslo\u00a0PAS - 1234 / Fuzztovn"; + + expect(callPythonNormalize(adapters.pythonModulePath, text)).toBe( + adapters.native.normalizeForSearch(text), + ); + }); + + test("generated static-redaction fixtures match exactly", () => { + const adapters = getAdapters(); + + fc.assert( + fc.property( + fc.array(generatedCaseArb, { minLength: 10, maxLength: 40 }), + (cases) => { + const tsResults = cases.map(({ text, operators }) => + runTsAdapter(adapters.native, text, operators), + ); + const pyResults = runPythonAdapters( + adapters.pythonModulePath, + cases, + adapters.tempDir, + ); + + expect(pyResults).toEqual(tsResults); + for (const [index, item] of cases.entries()) { + const result = tsResults.at(index); + expect(result).toBeDefined(); + expect(result?.redaction.entity_count).toBe(5); + for (const value of item.sensitiveValues) { + expect(result?.redaction.redacted_text).not.toContain(value); + } + } + }, + ), + { numRuns: 5, seed: 20_260_624 }, + ); + }); +}); + +const getAdapters = () => { + if (loadedAdapters !== null) { + return loadedAdapters; + } + + runCommand("cargo", [ + "build", + "-p", + "stella-anonymize-napi", + "-p", + "stella-anonymize-py", + "--locked", + ]); + + const tempDir = mkdtempSync(join(tmpdir(), "stella-anonymize-native-")); + const napiPath = join(tempDir, "stella_anonymize_napi.node"); + const pythonModulePath = join(tempDir, "stella_anonymize_core_py.so"); + copyFileSync(nativeLibraryPath("stella_anonymize_napi"), napiPath); + copyFileSync(nativeLibraryPath("stella_anonymize_core_py"), pythonModulePath); + + const native = loadNativeAdapter(napiPath); + 
loadedAdapters = { native, pythonModulePath, tempDir }; + return loadedAdapters; +}; + +const nativeLibraryPath = (name: string): string => { + if (process.platform === "darwin") { + return join(TARGET_DIR, `lib${name}.dylib`); + } + if (process.platform === "linux") { + return join(TARGET_DIR, `lib${name}.so`); + } + return join(TARGET_DIR, `${name}.dll`); +}; + +const loadNativeAdapter = (nativePath: string): NativeAdapter => { + const nativeRequire = createRequire(import.meta.url); + const loaded: unknown = nativeRequire(nativePath); + const normalizeForSearch = Reflect.get(Object(loaded), "normalizeForSearch"); + const redactStaticEntitiesJson = Reflect.get( + Object(loaded), + "redactStaticEntitiesJson", + ); + if ( + typeof normalizeForSearch !== "function" || + typeof redactStaticEntitiesJson !== "function" + ) { + throw new TypeError("Native anonymize adapter exports are incomplete"); + } + return { normalizeForSearch, redactStaticEntitiesJson }; +}; + +const runTsAdapter = ( + adapter: NativeAdapter, + text: string, + operators: Record | null, +): StaticRedactionResult => { + const operatorsJson = operatorConfigJson(operators); + return JSON.parse( + adapter.redactStaticEntitiesJson(CONFIG_JSON, text, operatorsJson), + ); +}; + +const runPythonAdapters = ( + pythonModulePath: string, + cases: GeneratedNativeCase[], + tempDir: string, +): StaticRedactionResult[] => { + const payloadPath = join(tempDir, "payload.json"); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + cases: cases.map(({ text, operators }) => ({ + text, + operators_json: operatorConfigJson(operators), + })), + }), + ); + + const output = runCommand("python3", ["-c", PYTHON_ADAPTER_SCRIPT], { + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }); + return JSON.parse(output); +}; + +const callPythonNormalize = ( + pythonModulePath: string, + text: string, +): string => { + const payloadDir = mkdtempSync(join(tmpdir(), 
"stella-anonymize-normalize-")); + const payloadPath = join(payloadDir, "payload.json"); + writeFileSync(payloadPath, JSON.stringify({ text })); + try { + return runCommand( + "python3", + [ + "-c", + ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +spec = importlib.util.spec_from_file_location( + "stella_anonymize_core_py", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +print(module.normalize_for_search(payload["text"])) +`, + ], + { + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }, + ).trimEnd(); + } finally { + rmSync(payloadDir, { recursive: true, force: true }); + } +}; + +const operatorConfigJson = ( + operators: Record | null, +): string | undefined => { + if (operators === null) { + return undefined; + } + return JSON.stringify({ operators }); +}; + +const runCommand = ( + command: string, + args: string[], + env: Record = {}, +): string => { + const result = spawnSync(command, args, { + cwd: ROOT_DIR, + encoding: "utf8", + env: { ...process.env, ...env }, + }); + + if (result.status === 0) { + return result.stdout; + } + + throw new Error( + [ + `${command} ${args.join(" ")} failed with status ${result.status}`, + result.stdout, + result.stderr, + ] + .filter(Boolean) + .join("\n"), + ); +}; From a41fe2b635fcde3ee811e687f00e3d7666624906 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 11:47:12 +0200 Subject: [PATCH 018/130] fix: tighten native redaction contracts --- crates/anonymize-core/src/prepared.rs | 23 ++++++- crates/anonymize-core/src/processors.rs | 5 ++ crates/anonymize-core/src/redact.rs | 47 +++++++------- .../anonymize-core/src/resolution/boundary.rs | 42 +++++++++++- crates/anonymize-core/src/types.rs | 9 +++ 
crates/anonymize-core/tests/prepared.rs | 65 ++++++++++++++++++- crates/anonymize-core/tests/redaction.rs | 28 +++++++- crates/anonymize-core/tests/resolution.rs | 47 ++++++++++++++ 8 files changed, 237 insertions(+), 29 deletions(-) diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 738399a5..816231d8 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -10,7 +10,8 @@ use crate::resolution::{ }; use crate::search::{SearchIndex, SearchOptions, SearchPattern}; use crate::types::{ - Entity, EntityKind, OperatorConfig, RedactionResult, Result, SearchMatch, + Entity, EntityKind, Error, OperatorConfig, RedactionResult, Result, + SearchMatch, }; pub struct PreparedSearch { @@ -76,6 +77,8 @@ pub struct StaticRedactionResult { impl PreparedSearch { pub fn new(config: PreparedSearchConfig) -> Result { + validate_supported_slices(&config.slices)?; + Ok(Self { regex: SearchIndex::new(config.regex_patterns, config.regex_options)?, custom_regex: SearchIndex::new( @@ -175,6 +178,24 @@ impl PreparedSearch { } } +fn validate_supported_slices(slices: &PreparedSearchSlices) -> Result<()> { + reject_unsupported_slice(slices.legal_forms, "legal_forms")?; + reject_unsupported_slice(slices.triggers, "triggers")?; + reject_unsupported_slice(slices.deny_list, "deny_list")?; + reject_unsupported_slice(slices.street_types, "street_types") +} + +const fn reject_unsupported_slice( + slice: PatternSlice, + name: &'static str, +) -> Result<()> { + if slice.is_empty() { + return Ok(()); + } + + Err(Error::UnsupportedStaticSlice { slice: name }) +} + impl StaticDetectionResult { #[must_use] pub fn all_entities(&self) -> Vec { diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index 01d54f90..00cac250 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -15,6 +15,11 @@ pub struct PatternSlice { } impl PatternSlice { 
+ #[must_use] + pub const fn is_empty(self) -> bool { + self.start >= self.end + } + #[must_use] pub const fn contains(self, pattern: u32) -> bool { pattern >= self.start && pattern < self.end diff --git a/crates/anonymize-core/src/redact.rs b/crates/anonymize-core/src/redact.rs index 675693ef..e51925b1 100644 --- a/crates/anonymize-core/src/redact.rs +++ b/crates/anonymize-core/src/redact.rs @@ -23,20 +23,17 @@ pub fn redact_text( let offsets = Utf16Offsets::new(full_text); validate_spans(entities, &offsets)?; - // Reversible originals come from the source span, not caller display text. - let entities = entities_with_source_text(full_text, entities, &offsets)?; - - let placeholder_map = build_placeholder_map(&entities, full_text); - let mut sorted = entities; - sorted.sort_by_key(|entity| entity.start); + let placeholder_map = build_placeholder_map(entities, full_text); + let mut sorted = redaction_spans(full_text, entities, &offsets)?; + sorted.sort_by_key(|span| span.entity.start); // Existing contract: first accepted span wins overlaps. 
- let mut non_overlapping = Vec::::new(); + let mut non_overlapping = Vec::::new(); let mut last_end = 0; - for entity in sorted { - if entity.start >= last_end { - last_end = entity.end; - non_overlapping.push(entity); + for span in sorted { + if span.entity.start >= last_end { + last_end = span.entity.end; + non_overlapping.push(span); } } @@ -45,7 +42,8 @@ pub fn redact_text( let mut operator_map = Vec::::new(); let mut cursor = 0; - for entity in &non_overlapping { + for span in &non_overlapping { + let entity = &span.entity; if entity.start > cursor { parts.push(offsets.slice(full_text, cursor, entity.start)?); } @@ -67,7 +65,7 @@ pub fn redact_text( { redaction_map.push(RedactionEntry { placeholder: placeholder.clone(), - original: entity_original_text(entity), + original: redaction_original_text(span), }); } @@ -118,18 +116,23 @@ fn validate_spans(entities: &[Entity], offsets: &Utf16Offsets) -> Result<()> { Ok(()) } -fn entities_with_source_text( +struct RedactionSpan { + entity: Entity, + source_text: String, +} + +fn redaction_spans( full_text: &str, entities: &[Entity], offsets: &Utf16Offsets, -) -> Result> { +) -> Result> { let mut resolved = Vec::with_capacity(entities.len()); for entity in entities { - let mut resolved_entity = entity.clone(); - resolved_entity.text = - offsets.slice(full_text, entity.start, entity.end)?; - resolved.push(resolved_entity); + resolved.push(RedactionSpan { + entity: entity.clone(), + source_text: offsets.slice(full_text, entity.start, entity.end)?, + }); } Ok(resolved) @@ -172,9 +175,9 @@ fn redaction_value<'a>( .map(|entry| entry.original.as_str()) } -fn entity_original_text(entity: &Entity) -> String { - match &entity.kind { - EntityKind::Detected => entity.text.clone(), +fn redaction_original_text(span: &RedactionSpan) -> String { + match &span.entity.kind { + EntityKind::Detected => span.source_text.clone(), EntityKind::Coreference { source_text } => source_text.clone(), } } diff --git 
a/crates/anonymize-core/src/resolution/boundary.rs b/crates/anonymize-core/src/resolution/boundary.rs index 4aab511c..43e5950f 100644 --- a/crates/anonymize-core/src/resolution/boundary.rs +++ b/crates/anonymize-core/src/resolution/boundary.rs @@ -297,8 +297,8 @@ fn word_boundaries(spans: &[CharSpan]) -> BTreeSet { let mut run_start = None::; let mut run_end = None::; - for span in spans { - if span.ch.is_alphanumeric() { + for (index, span) in spans.iter().enumerate() { + if is_word_body(span.ch) || is_word_connector_between(spans, index) { if run_start.is_none() { run_start = Some(span.start); } @@ -320,6 +320,44 @@ fn word_boundaries(spans: &[CharSpan]) -> BTreeSet { boundaries } +fn is_word_connector_between(spans: &[CharSpan], index: usize) -> bool { + let Some(span) = spans.get(index) else { + return false; + }; + if !is_word_connector(span.ch) { + return false; + } + + let Some(previous) = index.checked_sub(1).and_then(|prev| spans.get(prev)) + else { + return false; + }; + let Some(next) = spans.get(index.saturating_add(1)) else { + return false; + }; + + is_word_body(previous.ch) && is_word_body(next.ch) +} + +const fn is_word_connector(ch: char) -> bool { + matches!(ch, '\'' | '\u{2018}' | '\u{2019}' | '\u{02bc}' | '\u{ff07}') +} + +fn is_word_body(ch: char) -> bool { + ch.is_alphanumeric() || is_combining_mark(ch) +} + +const fn is_combining_mark(ch: char) -> bool { + matches!( + ch, + '\u{0300}'..='\u{036f}' + | '\u{1ab0}'..='\u{1aff}' + | '\u{1dc0}'..='\u{1dff}' + | '\u{20d0}'..='\u{20ff}' + | '\u{fe20}'..='\u{fe2f}' + ) +} + fn word_start_at( position: u32, boundaries: &BTreeSet, diff --git a/crates/anonymize-core/src/types.rs b/crates/anonymize-core/src/types.rs index 95f97a7d..d72a1fff 100644 --- a/crates/anonymize-core/src/types.rs +++ b/crates/anonymize-core/src/types.rs @@ -29,6 +29,9 @@ pub enum Error { UnsupportedRegexValidation { pattern: u32, }, + UnsupportedStaticSlice { + slice: &'static str, + }, } impl fmt::Display for Error { @@ -64,6 
+67,12 @@ impl fmt::Display for Error { "Regex pattern {pattern} requires validation that is not available in core" ) } + Self::UnsupportedStaticSlice { slice } => { + write!( + formatter, + "Static slice '{slice}' is configured but not supported by native core" + ) + } } } } diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 5c2fb4bb..58bc015e 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -1,12 +1,28 @@ #![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] use stella_anonymize_core::{ - CountryMatchData, DetectionSource, FuzzySearchOptions, GazetteerMatchData, - LiteralSearchOptions, OperatorConfig, PatternSlice, PreparedSearch, - PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, + CountryMatchData, DetectionSource, Error, FuzzySearchOptions, + GazetteerMatchData, LiteralSearchOptions, OperatorConfig, PatternSlice, + PreparedSearch, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, }; +fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { + PreparedSearchConfig { + regex_patterns: vec![], + custom_regex_patterns: vec![], + literal_patterns: vec![], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions::default(), + slices, + regex_meta: vec![], + custom_regex_meta: vec![], + gazetteer_data: None, + country_data: None, + } +} + #[test] fn prepared_search_runs_normalized_literal_pass() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -180,3 +196,46 @@ fn prepared_search_redacts_static_entities_end_to_end() { assert_eq!(result.redaction.entity_count, 3); assert_eq!(result.resolved_entities.len(), 3); } + +#[test] +fn prepared_search_rejects_unsupported_static_slices() { + let unsupported = PatternSlice { start: 0, end: 1 }; + let cases = [ + ( + 
"legal_forms", + PreparedSearchSlices { + legal_forms: unsupported, + ..PreparedSearchSlices::default() + }, + ), + ( + "triggers", + PreparedSearchSlices { + triggers: unsupported, + ..PreparedSearchSlices::default() + }, + ), + ( + "deny_list", + PreparedSearchSlices { + deny_list: unsupported, + ..PreparedSearchSlices::default() + }, + ), + ( + "street_types", + PreparedSearchSlices { + street_types: unsupported, + ..PreparedSearchSlices::default() + }, + ), + ]; + + for (slice, slices) in cases { + let error = PreparedSearch::new(empty_config(slices)) + .err() + .expect("unsupported slice should be rejected"); + + assert_eq!(error, Error::UnsupportedStaticSlice { slice }); + } +} diff --git a/crates/anonymize-core/tests/redaction.rs b/crates/anonymize-core/tests/redaction.rs index ffe8f113..8dea3889 100644 --- a/crates/anonymize-core/tests/redaction.rs +++ b/crates/anonymize-core/tests/redaction.rs @@ -10,6 +10,15 @@ use stella_anonymize_core::{ }; fn entity(text: &str, label: &str, value: &str) -> Entity { + entity_with_display_text(text, label, value, value) +} + +fn entity_with_display_text( + text: &str, + label: &str, + value: &str, + display_text: &str, +) -> Entity { let byte_start = text .find(value) .unwrap_or_else(|| panic!("missing fixture value: {value}")); @@ -18,7 +27,7 @@ fn entity(text: &str, label: &str, value: &str) -> Entity { .unwrap_or_else(|| panic!("invalid fixture boundary: {byte_start}")); let start = utf16_len(prefix); let end = start.saturating_add(utf16_len(value)); - Entity::detected(start, end, label, value) + Entity::detected(start, end, label, display_text) } fn utf16_len(text: &str) -> u32 { @@ -255,6 +264,23 @@ fn detected_original_uses_redacted_source_span() { ); } +#[test] +fn detected_placeholder_identity_uses_sanitized_text() { + let text = "Dates: 21.\nMärz 1968 and 21. März 1968."; + let normalized = "21. 
März 1968"; + let entities = vec![ + entity_with_display_text(text, "date", "21.\nMärz 1968", normalized), + entity(text, "date", normalized), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redacted_text, "Dates: [DATE_1] and [DATE_1]."); + assert_eq!(result.redaction_map.len(), 1); + assert_eq!(result.redaction_map[0].original, "21.\nMärz 1968"); +} + #[test] fn invalid_utf16_boundary_is_rejected() { let text = "A 🦀 Bob"; diff --git a/crates/anonymize-core/tests/resolution.rs b/crates/anonymize-core/tests/resolution.rs index 90681939..996ed924 100644 --- a/crates/anonymize-core/tests/resolution.rs +++ b/crates/anonymize-core/tests/resolution.rs @@ -257,6 +257,53 @@ fn boundary_expands_partial_words() { assert_eq!(person.end, 17); } +#[test] +fn boundary_expands_inside_apostrophe_names() { + let full_text = "Kontaktujte O'Connor prosím."; + let start = utf16_len("Kontaktujte O'"); + let end = start.saturating_add(utf16_len("Connor")); + let result = enforce_boundary_consistency( + &[PipelineEntity::detected( + start, + end, + "person", + "Connor", + 0.9, + DetectionSource::Ner, + )], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 1); + let person = result.first().expect("person"); + assert_eq!(person.start, utf16_len("Kontaktujte ")); + assert_eq!(person.text, "O'Connor"); +} + +#[test] +fn boundary_expands_across_combining_marks() { + let full_text = "Podepsal Cafe\u{0301}."; + let start = utf16_len("Podepsal "); + let end = start.saturating_add(utf16_len("Cafe")); + let result = enforce_boundary_consistency( + &[PipelineEntity::detected( + start, + end, + "organization", + "Cafe", + 0.9, + DetectionSource::Ner, + )], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 1); + let organization = result.first().expect("organization"); + assert_eq!(organization.text, "Cafe\u{0301}"); +} + #[test] fn boundary_clamps_expansion_at_cross_label_neighbors() { let full_text = "JanPraha"; 
From 3dd2a7fd15d21875afa15d761384f28900c6f2b8 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 12:00:39 +0200 Subject: [PATCH 019/130] test: add migration fixture gate --- .github/workflows/ci.yml | 5 + packages/anonymize/package.json | 1 + .../scripts/migration-fixture-perf.mjs | 488 ++++++++++++++++++ 3 files changed, 494 insertions(+) create mode 100644 packages/anonymize/scripts/migration-fixture-perf.mjs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a6e2f1a8..d5bf0670 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -81,6 +81,11 @@ jobs: - name: Smoke test built artifact run: bun run --cwd packages/anonymize smoke:dist + - name: Migration fixture parity and performance + run: | + git fetch origin main --depth=1 + bun run --cwd packages/anonymize perf:migration-fixtures + - name: Contract performance (informational) # Surfaces pipeline-latency regressions against the thresholds in # contract-perf.mjs. Non-blocking for now (shared-runner timing is diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index b23fbb7f..2eb16dd4 100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -34,6 +34,7 @@ "test": "bun test --preload ./src/__test__/setup.ts --timeout 15000", "test:fast": "bun run test src/__test__/*.test.ts", "perf:contracts": "bun scripts/contract-perf.mjs", + "perf:migration-fixtures": "bun scripts/migration-fixture-perf.mjs", "perf:native-adapters": "bun scripts/native-adapter-perf.mjs", "smoke:dist": "bun scripts/dist-smoke.mjs", "format": "oxfmt ." 
diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs new file mode 100644 index 00000000..5824b43f --- /dev/null +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -0,0 +1,488 @@ +import { spawnSync } from "node:child_process"; +import { + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + readdirSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join, relative, resolve } from "node:path"; +import { fileURLToPath, pathToFileURL } from "node:url"; + +const SCRIPT_PATH = fileURLToPath(import.meta.url); +const ROOT_DIR = resolve(join(import.meta.dir, "..", "..", "..")); +const PACKAGE_DIR = join(ROOT_DIR, "packages", "anonymize"); +const FIXTURES_DIR = join( + PACKAGE_DIR, + "src", + "__test__", + "fixtures", + "contracts", +); +const BASELINE_REF = + process.env.ANONYMIZE_MIGRATION_BASELINE_REF ?? "origin/main"; +const COMPARE_BASELINE = + process.env.ANONYMIZE_MIGRATION_COMPARE_BASELINE !== "0"; +const WARM_ITERATIONS = positiveIntegerEnv( + "ANONYMIZE_MIGRATION_WARM_ITERATIONS", + 2, +); + +if (process.env.ANONYMIZE_MIGRATION_WORKER === "1") { + await runWorker(); +} else { + await runCoordinator(); +} + +async function runCoordinator() { + const fixtures = discoverFixtures(FIXTURES_DIR); + if (fixtures.length === 0) { + throw new Error(`No contract fixtures found in ${FIXTURES_DIR}`); + } + + const tempRoot = mkdtempSync(join(tmpdir(), "stella-anonymize-migration-")); + + try { + let baseline = null; + if (COMPARE_BASELINE) { + ensureGitRef(BASELINE_REF); + const baselineRoot = materializeGitRef(BASELINE_REF, tempRoot); + baseline = runVariant({ + name: `baseline:${BASELINE_REF}`, + sourceRoot: baselineRoot, + fixtures, + tempRoot, + }); + printVariantSummary(baseline); + } + + const candidate = runVariant({ + name: "candidate", + sourceRoot: ROOT_DIR, + fixtures, + tempRoot, + }); + printVariantSummary(candidate); + + if (baseline 
!== null) { + const comparison = compareSnapshots(baseline, candidate); + console.log(JSON.stringify(comparison)); + if (!comparison.equal) { + throw new Error( + `Fixture parity failed for ${comparison.mismatches.length} fixture(s)`, + ); + } + } + } finally { + rmSync(tempRoot, { force: true, recursive: true }); + } +} + +function runVariant({ name, sourceRoot, fixtures, tempRoot }) { + const resultPath = join( + tempRoot, + `${name.replaceAll(/[^a-zA-Z0-9_.-]/g, "_")}.json`, + ); + const child = spawnSync(process.execPath, [SCRIPT_PATH], { + cwd: ROOT_DIR, + env: { + ...process.env, + ANONYMIZE_MIGRATION_WORKER: "1", + ANONYMIZE_MIGRATION_SOURCE_ROOT: sourceRoot, + ANONYMIZE_MIGRATION_VARIANT: name, + ANONYMIZE_MIGRATION_FIXTURES_DIR: FIXTURES_DIR, + ANONYMIZE_MIGRATION_FIXTURES: JSON.stringify(fixtures), + ANONYMIZE_MIGRATION_RESULT_PATH: resultPath, + ANONYMIZE_MIGRATION_WARM_ITERATIONS: String(WARM_ITERATIONS), + }, + encoding: "utf8", + maxBuffer: 64 * 1024 * 1024, + }); + + if (child.status !== 0) { + throw new Error( + [ + `Migration fixture worker failed for ${name}`, + child.stdout.trim(), + child.stderr.trim(), + ] + .filter(Boolean) + .join("\n"), + ); + } + + return JSON.parse(readFileSync(resultPath, "utf8")); +} + +async function runWorker() { + const sourceRoot = requiredEnv("ANONYMIZE_MIGRATION_SOURCE_ROOT"); + const variant = requiredEnv("ANONYMIZE_MIGRATION_VARIANT"); + const resultPath = requiredEnv("ANONYMIZE_MIGRATION_RESULT_PATH"); + const fixtures = JSON.parse(requiredEnv("ANONYMIZE_MIGRATION_FIXTURES")); + + const importStart = Bun.nanoseconds(); + const [indexModule, configModule, dictionaryModule] = await Promise.all([ + importSource(sourceRoot, "packages/anonymize/src/index.ts", variant), + importSource( + sourceRoot, + "packages/anonymize/src/__test__/contract-config.ts", + variant, + ), + importSource( + sourceRoot, + "packages/anonymize/src/__test__/load-dictionaries.ts", + variant, + ), + ]); + const importMs = 
elapsedMs(importStart); + + const dictionaryStart = Bun.nanoseconds(); + const dictionaries = await dictionaryModule.loadTestDictionaries(); + const dictionaryMs = elapsedMs(dictionaryStart); + + const config = { + ...configModule.contractTestConfig(`migration-fixtures-${variant}`), + dictionaries, + }; + const context = indexModule.createPipelineContext(); + + const prepareStart = Bun.nanoseconds(); + await indexModule.preparePipelineSearch({ config, context }); + const prepareMs = elapsedMs(prepareStart); + + const coldRun = await runFixtureSweep({ + indexModule, + config, + context, + fixtures, + }); + + const warmRuns = []; + for (let index = 0; index < WARM_ITERATIONS; index += 1) { + warmRuns.push( + await runFixtureSweep({ + indexModule, + config, + context, + fixtures, + }), + ); + } + + const warmRunMs = roundMs(warmRuns.reduce((sum, run) => sum + run.ms, 0)); + const warmAvgMs = + WARM_ITERATIONS === 0 ? 0 : roundMs(warmRunMs / WARM_ITERATIONS); + const snapshots = Object.fromEntries( + coldRun.fixtures.map((fixture) => [fixture.fixture, fixture.snapshot]), + ); + + writeFileSync( + resultPath, + `${JSON.stringify({ + event: "fixture-migration-variant", + variant, + fixtureCount: fixtures.length, + warmIterations: WARM_ITERATIONS, + timings: { + importMs, + dictionaryMs, + prepareMs, + coldRunMs: coldRun.ms, + coldPipelineMs: roundMs(dictionaryMs + prepareMs + coldRun.ms), + coldTotalMs: roundMs(importMs + dictionaryMs + prepareMs + coldRun.ms), + warmRunMs, + warmAvgMs, + }, + fixtures: coldRun.fixtures.map( + ({ fixture, ms, entityCount, redactedTextLength }) => ({ + fixture, + ms, + entityCount, + redactedTextLength, + }), + ), + snapshots, + })}\n`, + ); +} + +async function runFixtureSweep({ indexModule, config, context, fixtures }) { + const sweepStart = Bun.nanoseconds(); + const results = []; + + for (const fixturePath of fixtures) { + const fullText = readFileSync(fixturePath, "utf8").replaceAll("\r\n", "\n"); + const start = Bun.nanoseconds(); + 
const entities = await indexModule.runPipeline({ + fullText, + config, + gazetteerEntries: [], + context, + }); + const ms = elapsedMs(start); + const snapshot = toSnapshot(indexModule, fullText, entities, context); + results.push({ + fixture: relative(FIXTURES_DIR, fixturePath), + ms, + entityCount: snapshot.entityCount, + redactedTextLength: snapshot.redactedText.length, + snapshot, + }); + } + + return { + ms: elapsedMs(sweepStart), + fixtures: results, + }; +} + +function toSnapshot(indexModule, fullText, entities, context) { + const sorted = entities.toSorted( + (left, right) => + left.start - right.start || + left.end - right.end || + left.label.localeCompare(right.label) || + left.text.localeCompare(right.text), + ); + const counts = {}; + for (const entity of sorted) { + counts[entity.label] = (counts[entity.label] ?? 0) + 1; + } + + const redacted = indexModule.redactText(fullText, sorted, undefined, context); + + return { + entityCount: sorted.length, + counts, + entities: sorted.map(({ start, end, label, text, source }) => ({ + start, + end, + label, + text, + source, + })), + redactedText: redacted.redactedText, + }; +} + +function compareSnapshots(baseline, candidate) { + const mismatches = []; + const fixtureNames = new Set([ + ...Object.keys(baseline.snapshots), + ...Object.keys(candidate.snapshots), + ]); + + for (const fixture of [...fixtureNames].sort()) { + const expected = baseline.snapshots[fixture]; + const actual = candidate.snapshots[fixture]; + if (JSON.stringify(expected) === JSON.stringify(actual)) { + continue; + } + mismatches.push(describeMismatch(fixture, expected, actual)); + } + + return { + event: "fixture-migration-parity", + baseline: baseline.variant, + candidate: candidate.variant, + equal: mismatches.length === 0, + fixtureCount: fixtureNames.size, + mismatches, + timingComparison: timingComparison(baseline, candidate), + }; +} + +function describeMismatch(fixture, expected, actual) { + if (expected === undefined || actual === 
undefined) { + return { + fixture, + kind: expected === undefined ? "missing-baseline" : "missing-candidate", + }; + } + + const firstEntityDiff = firstDifferentIndex( + expected.entities, + actual.entities, + ); + + return { + fixture, + kind: "snapshot-mismatch", + entityCount: { + baseline: expected.entityCount, + candidate: actual.entityCount, + }, + counts: { + baseline: expected.counts, + candidate: actual.counts, + }, + redactedTextEqual: expected.redactedText === actual.redactedText, + firstEntityDiff: + firstEntityDiff === -1 + ? null + : { + index: firstEntityDiff, + baseline: expected.entities.at(firstEntityDiff) ?? null, + candidate: actual.entities.at(firstEntityDiff) ?? null, + }, + }; +} + +function timingComparison(baseline, candidate) { + return { + coldPipelineSpeedup: speedup( + baseline.timings.coldPipelineMs, + candidate.timings.coldPipelineMs, + ), + warmAvgSpeedup: speedup( + baseline.timings.warmAvgMs, + candidate.timings.warmAvgMs, + ), + baseline: baseline.timings, + candidate: candidate.timings, + }; +} + +function speedup(baselineMs, candidateMs) { + if (candidateMs <= 0) { + return null; + } + return roundMs(baselineMs / candidateMs); +} + +function printVariantSummary(result) { + console.log( + JSON.stringify({ + event: result.event, + variant: result.variant, + fixtureCount: result.fixtureCount, + warmIterations: result.warmIterations, + timings: result.timings, + fixtures: result.fixtures, + }), + ); +} + +function firstDifferentIndex(left, right) { + const len = Math.max(left.length, right.length); + for (let index = 0; index < len; index += 1) { + if (JSON.stringify(left.at(index)) !== JSON.stringify(right.at(index))) { + return index; + } + } + return -1; +} + +function discoverFixtures(fixturesDir) { + const paths = []; + for (const language of readdirSync(fixturesDir)) { + const languageDir = join(fixturesDir, language); + for (const file of readdirSync(languageDir)) { + if (file.endsWith(".txt")) { + 
paths.push(join(languageDir, file)); + } + } + } + return paths.sort((left, right) => left.localeCompare(right)); +} + +function ensureGitRef(ref) { + const verify = spawnSync("git", ["rev-parse", "--verify", `${ref}^{tree}`], { + cwd: ROOT_DIR, + encoding: "utf8", + }); + if (verify.status === 0) { + return; + } + + if (ref === "origin/main") { + runCommand("git", ["fetch", "origin", "main", "--depth=1"]); + const retry = spawnSync("git", ["rev-parse", "--verify", `${ref}^{tree}`], { + cwd: ROOT_DIR, + encoding: "utf8", + }); + if (retry.status === 0) { + return; + } + } + + throw new Error(`Cannot resolve baseline ref: ${ref}`); +} + +function materializeGitRef(ref, tempRoot) { + const outputDir = join(tempRoot, "baseline"); + mkdirSync(outputDir, { recursive: true }); + + const archive = spawnSync("git", ["archive", "--format=tar", ref], { + cwd: ROOT_DIR, + maxBuffer: 512 * 1024 * 1024, + }); + if (archive.status !== 0) { + throw new Error( + `git archive failed for ${ref}: ${archive.stderr.toString()}`, + ); + } + + const extract = spawnSync("tar", ["-x", "-C", outputDir], { + input: archive.stdout, + maxBuffer: 512 * 1024 * 1024, + }); + if (extract.status !== 0) { + throw new Error(`tar extraction failed: ${extract.stderr.toString()}`); + } + + return outputDir; +} + +function runCommand(command, args) { + const result = spawnSync(command, args, { + cwd: ROOT_DIR, + encoding: "utf8", + stdio: "inherit", + }); + if (result.status !== 0) { + throw new Error(`${command} ${args.join(" ")} failed`); + } +} + +function importSource(sourceRoot, relativePath, variant) { + const path = join(sourceRoot, relativePath); + if (!existsSync(path)) { + throw new Error(`Missing source file for ${variant}: ${path}`); + } + const url = pathToFileURL(path); + url.searchParams.set("migrationVariant", variant); + // eslint-disable-next-line stll/no-dynamic-import-specifier + return import(url.href); +} + +function positiveIntegerEnv(name, fallback) { + const raw = 
process.env[name]; + if (raw === undefined) { + return fallback; + } + const value = Number(raw); + if (!Number.isInteger(value) || value < 0) { + throw new Error(`${name} must be a non-negative integer`); + } + return value; +} + +function requiredEnv(name) { + const value = process.env[name]; + if (value === undefined || value === "") { + throw new Error(`Missing required environment variable: ${name}`); + } + return value; +} + +function elapsedMs(start) { + return roundMs((Bun.nanoseconds() - start) / 1_000_000); +} + +function roundMs(ms) { + return Math.round(ms * 1_000) / 1_000; +} From cafd2d8e17317575988694ca202e1c89e9ea0a0b Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 12:17:15 +0200 Subject: [PATCH 020/130] feat: port custom deny-list slice --- crates/anonymize-adapter-contract/src/lib.rs | 22 ++- crates/anonymize-core/src/lib.rs | 5 +- crates/anonymize-core/src/prepared.rs | 89 ++++++++++-- crates/anonymize-core/src/processors.rs | 131 ++++++++++++++++++ crates/anonymize-core/src/types.rs | 24 ++++ crates/anonymize-core/tests/prepared.rs | 113 ++++++++++++++- crates/anonymize-core/tests/processors.rs | 101 +++++++++++++- crates/anonymize-napi/src/lib.rs | 30 +++- .../anonymize/scripts/native-adapter-perf.mjs | 29 +++- .../__test__/native-adapter-parity.test.ts | 24 +++- 10 files changed, 535 insertions(+), 33 deletions(-) diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 5108c331..bbb7e5db 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -2,9 +2,9 @@ use std::collections::BTreeMap; use serde::{Deserialize, Serialize}; use stella_anonymize_core::{ - CountryMatchData, DetectionSource, FuzzySearchOptions, GazetteerMatchData, - LiteralSearchOptions, OperatorConfig, OperatorType, PatternSlice, - PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, + CountryMatchData, DenyListMatchData, DetectionSource, 
FuzzySearchOptions, + GazetteerMatchData, LiteralSearchOptions, OperatorConfig, OperatorType, + PatternSlice, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, StaticRedactionResult, }; @@ -100,6 +100,14 @@ pub struct BindingCountryMatchData { pub labels: Vec, } +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingDenyListMatchData { + pub labels: Vec>, + pub custom_labels: Vec>, + pub originals: Vec, + pub sources: Vec>, +} + #[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] pub struct BindingPreparedSearchConfig { #[serde(default)] @@ -121,6 +129,8 @@ pub struct BindingPreparedSearchConfig { #[serde(default)] pub custom_regex_meta: Vec, #[serde(default)] + pub deny_list_data: Option, + #[serde(default)] pub gazetteer_data: Option, #[serde(default)] pub country_data: Option, @@ -186,6 +196,12 @@ pub fn prepared_search_config_from_binding( slices: slices_from_binding(&config.slices), regex_meta: regex_meta_from_binding(config.regex_meta)?, custom_regex_meta: regex_meta_from_binding(config.custom_regex_meta)?, + deny_list_data: config.deny_list_data.map(|data| DenyListMatchData { + labels: data.labels, + custom_labels: data.custom_labels, + originals: data.originals, + sources: data.sources, + }), gazetteer_data: config.gazetteer_data.map(|data| GazetteerMatchData { labels: data.labels, is_fuzzy: data.is_fuzzy, diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index 2e79c9c5..f26da0f2 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -19,8 +19,9 @@ pub use prepared::{ PreparedSearchSlices, StaticDetectionResult, StaticRedactionResult, }; pub use processors::{ - CountryMatchData, GazetteerMatchData, PatternSlice, RegexMatchMeta, - process_country_matches, process_gazetteer_matches, process_regex_matches, + CountryMatchData, DenyListMatchData, GazetteerMatchData, 
PatternSlice, + RegexMatchMeta, process_country_matches, process_deny_list_matches, + process_gazetteer_matches, process_regex_matches, }; pub use redact::{deanonymise, redact_text}; pub use resolution::{ diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 816231d8..68534c19 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -1,7 +1,8 @@ use crate::normalize::normalize_for_search; use crate::processors::{ - CountryMatchData, GazetteerMatchData, PatternSlice, RegexMatchMeta, - process_country_matches, process_gazetteer_matches, process_regex_matches, + CountryMatchData, DenyListMatchData, GazetteerMatchData, PatternSlice, + RegexMatchMeta, ensure_custom_deny_list_sources, process_country_matches, + process_deny_list_matches, process_gazetteer_matches, process_regex_matches, }; use crate::redact::redact_text; use crate::resolution::{ @@ -21,6 +22,7 @@ pub struct PreparedSearch { slices: PreparedSearchSlices, regex_meta: Vec, custom_regex_meta: Vec, + deny_list_data: Option, gazetteer_data: Option, country_data: Option, } @@ -48,6 +50,7 @@ pub struct PreparedSearchConfig { pub slices: PreparedSearchSlices, pub regex_meta: Vec, pub custom_regex_meta: Vec, + pub deny_list_data: Option, pub gazetteer_data: Option, pub country_data: Option, } @@ -64,6 +67,7 @@ pub struct StaticDetectionResult { pub matches: PreparedSearchMatches, pub regex_entities: Vec, pub custom_regex_entities: Vec, + pub deny_list_entities: Vec, pub gazetteer_entities: Vec, pub country_entities: Vec, } @@ -77,7 +81,7 @@ pub struct StaticRedactionResult { impl PreparedSearch { pub fn new(config: PreparedSearchConfig) -> Result { - validate_supported_slices(&config.slices)?; + validate_supported_config(&config)?; Ok(Self { regex: SearchIndex::new(config.regex_patterns, config.regex_options)?, @@ -92,6 +96,7 @@ impl PreparedSearch { slices: config.slices, regex_meta: config.regex_meta, custom_regex_meta: 
config.custom_regex_meta, + deny_list_data: config.deny_list_data, gazetteer_data: config.gazetteer_data, country_data: config.country_data, }) @@ -124,6 +129,16 @@ impl PreparedSearch { full_text, &self.custom_regex_meta, )?; + let deny_list_entities = if let Some(data) = &self.deny_list_data { + process_deny_list_matches( + &matches.literal, + self.slices.deny_list, + full_text, + data, + )? + } else { + Vec::new() + }; let gazetteer_entities = if let Some(data) = &self.gazetteer_data { process_gazetteer_matches( &matches.literal, @@ -149,6 +164,7 @@ impl PreparedSearch { matches, regex_entities, custom_regex_entities, + deny_list_entities, gazetteer_entities, country_entities, }) @@ -178,11 +194,11 @@ impl PreparedSearch { } } -fn validate_supported_slices(slices: &PreparedSearchSlices) -> Result<()> { - reject_unsupported_slice(slices.legal_forms, "legal_forms")?; - reject_unsupported_slice(slices.triggers, "triggers")?; - reject_unsupported_slice(slices.deny_list, "deny_list")?; - reject_unsupported_slice(slices.street_types, "street_types") +fn validate_supported_config(config: &PreparedSearchConfig) -> Result<()> { + reject_unsupported_slice(config.slices.legal_forms, "legal_forms")?; + reject_unsupported_slice(config.slices.triggers, "triggers")?; + validate_deny_list_config(config)?; + reject_unsupported_slice(config.slices.street_types, "street_types") } const fn reject_unsupported_slice( @@ -196,6 +212,61 @@ const fn reject_unsupported_slice( Err(Error::UnsupportedStaticSlice { slice: name }) } +fn validate_deny_list_config(config: &PreparedSearchConfig) -> Result<()> { + if config.slices.deny_list.is_empty() { + return Ok(()); + } + + let Some(data) = &config.deny_list_data else { + return Err(Error::UnsupportedStaticSlice { slice: "deny_list" }); + }; + + validate_static_data_length( + "deny_list.labels", + config.slices.deny_list, + data.labels.len(), + )?; + validate_static_data_length( + "deny_list.custom_labels", + config.slices.deny_list, + 
data.custom_labels.len(), + )?; + validate_static_data_length( + "deny_list.originals", + config.slices.deny_list, + data.originals.len(), + )?; + validate_static_data_length( + "deny_list.sources", + config.slices.deny_list, + data.sources.len(), + )?; + ensure_custom_deny_list_sources(data) +} + +fn validate_static_data_length( + field: &'static str, + slice: PatternSlice, + actual: usize, +) -> Result<()> { + let expected = usize::try_from(slice.len()).map_err(|_| { + Error::StaticDataLengthMismatch { + field, + expected: usize::MAX, + actual, + } + })?; + if actual == expected { + return Ok(()); + } + + Err(Error::StaticDataLengthMismatch { + field, + expected, + actual, + }) +} + impl StaticDetectionResult { #[must_use] pub fn all_entities(&self) -> Vec { @@ -203,11 +274,13 @@ impl StaticDetectionResult { .regex_entities .len() .saturating_add(self.custom_regex_entities.len()) + .saturating_add(self.deny_list_entities.len()) .saturating_add(self.gazetteer_entities.len()) .saturating_add(self.country_entities.len()); let mut entities = Vec::with_capacity(capacity); entities.extend(self.regex_entities.iter().cloned()); entities.extend(self.custom_regex_entities.iter().cloned()); + entities.extend(self.deny_list_entities.iter().cloned()); entities.extend(self.gazetteer_entities.iter().cloned()); entities.extend(self.country_entities.iter().cloned()); entities diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index 00cac250..c7170727 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -6,7 +6,9 @@ const MIN_PHONE_LENGTH: usize = 7; const GAZETTEER_EXACT_SCORE: f64 = 0.9; const GAZETTEER_FUZZY_SCORE: f64 = 0.85; const COUNTRY_SCORE: f64 = 0.95; +const DENY_LIST_SCORE: f64 = 0.9; const MAX_GAZETTEER_PREFIX_OVERSHOOT: u32 = 7; +pub(crate) const CUSTOM_DENY_LIST_SOURCE: &str = "custom-deny-list"; #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] pub struct PatternSlice { 
@@ -20,6 +22,11 @@ impl PatternSlice { self.start >= self.end } + #[must_use] + pub const fn len(self) -> u32 { + self.end.saturating_sub(self.start) + } + #[must_use] pub const fn contains(self, pattern: u32) -> bool { pattern >= self.start && pattern < self.end @@ -64,6 +71,14 @@ pub struct CountryMatchData { pub labels: Vec, } +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct DenyListMatchData { + pub labels: Vec>, + pub custom_labels: Vec>, + pub originals: Vec, + pub sources: Vec>, +} + pub fn process_regex_matches( matches: &[SearchMatch], slice: PatternSlice, @@ -108,6 +123,59 @@ pub fn process_regex_matches( Ok(results) } +pub fn process_deny_list_matches( + matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + data: &DenyListMatchData, +) -> Result> { + let offsets = Utf16Offsets::new(full_text); + let mut results = Vec::new(); + + for found in matches { + let Some(local_index) = slice.local_index(found.pattern()) else { + continue; + }; + ensure_custom_deny_list_source( + data.sources.get(local_index).map(Vec::as_slice), + )?; + + let Some(custom_labels) = data.custom_labels.get(local_index) else { + continue; + }; + if custom_labels.is_empty() { + continue; + } + + let pattern = data.originals.get(local_index).map_or("", String::as_str); + if !custom_match_has_valid_edges( + full_text, + &offsets, + found.start(), + found.end(), + pattern, + )? 
{ + continue; + } + + let text = offsets.slice(full_text, found.start(), found.end())?; + for label in custom_labels { + let mut entity = PipelineEntity::detected( + found.start(), + found.end(), + label.clone(), + text.clone(), + DENY_LIST_SCORE, + DetectionSource::DenyList, + ); + entity.source_detail = Some(SourceDetail::CustomDenyList); + results.push(entity); + } + } + + Ok(results) +} + pub fn process_gazetteer_matches( matches: &[SearchMatch], slice: PatternSlice, @@ -220,6 +288,39 @@ pub fn process_country_matches( Ok(results) } +pub(crate) fn ensure_custom_deny_list_sources( + data: &DenyListMatchData, +) -> Result<()> { + for sources in &data.sources { + ensure_custom_deny_list_source(Some(sources))?; + } + + Ok(()) +} + +fn ensure_custom_deny_list_source(sources: Option<&[String]>) -> Result<()> { + let Some(sources) = sources else { + return Err(Error::UnsupportedDenyListSource { + source: String::from(""), + }); + }; + if sources.is_empty() { + return Err(Error::UnsupportedDenyListSource { + source: String::from(""), + }); + } + if let Some(source) = sources + .iter() + .find(|source| source.as_str() != CUSTOM_DENY_LIST_SOURCE) + { + return Err(Error::UnsupportedDenyListSource { + source: source.clone(), + }); + } + + Ok(()) +} + fn try_gazetteer_prefix_extension( full_text: &str, offsets: &Utf16Offsets, @@ -288,6 +389,36 @@ fn starts_as_proper_noun( Ok(ch.to_string() == upper) } +fn custom_match_has_valid_edges( + full_text: &str, + offsets: &Utf16Offsets, + start: u32, + end: u32, + pattern: &str, +) -> Result { + if !pattern.chars().any(char::is_alphanumeric) { + return Ok(true); + } + + let start_byte = offsets.validate_offset(start)?; + let end_byte = offsets.validate_offset(end)?; + let previous = full_text + .get(..start_byte) + .and_then(|prefix| prefix.chars().next_back()); + if previous.is_some_and(char::is_alphanumeric) { + return Ok(false); + } + + let next = full_text + .get(end_byte..) 
+ .and_then(|suffix| suffix.chars().next()); + if next.is_some_and(char::is_alphanumeric) { + return Ok(false); + } + + Ok(true) +} + const fn fuzzy_distance(found: &SearchMatch) -> Option { let SearchMatch::Fuzzy { distance, .. } = found else { return None; diff --git a/crates/anonymize-core/src/types.rs b/crates/anonymize-core/src/types.rs index d72a1fff..38201587 100644 --- a/crates/anonymize-core/src/types.rs +++ b/crates/anonymize-core/src/types.rs @@ -32,6 +32,14 @@ pub enum Error { UnsupportedStaticSlice { slice: &'static str, }, + UnsupportedDenyListSource { + source: String, + }, + StaticDataLengthMismatch { + field: &'static str, + expected: usize, + actual: usize, + }, } impl fmt::Display for Error { @@ -73,6 +81,22 @@ impl fmt::Display for Error { "Static slice '{slice}' is configured but not supported by native core" ) } + Self::UnsupportedDenyListSource { source } => { + write!( + formatter, + "Deny-list source '{source}' is not supported by native core" + ) + } + Self::StaticDataLengthMismatch { + field, + expected, + actual, + } => { + write!( + formatter, + "Static data field '{field}' has {actual} item(s), expected {expected}" + ) + } } } } diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 58bc015e..38ec9990 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -1,10 +1,11 @@ #![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] use stella_anonymize_core::{ - CountryMatchData, DetectionSource, Error, FuzzySearchOptions, - GazetteerMatchData, LiteralSearchOptions, OperatorConfig, PatternSlice, - PreparedSearch, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, - RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, + CountryMatchData, DenyListMatchData, DetectionSource, Error, + FuzzySearchOptions, GazetteerMatchData, LiteralSearchOptions, OperatorConfig, + PatternSlice, PreparedSearch, PreparedSearchConfig, 
PreparedSearchSlices, + RegexMatchMeta, RegexSearchOptions, SearchOptions, SearchPattern, + SourceDetail, }; fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { @@ -18,6 +19,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { slices, regex_meta: vec![], custom_regex_meta: vec![], + deny_list_data: None, gazetteer_data: None, country_data: None, } @@ -42,6 +44,7 @@ fn prepared_search_runs_normalized_literal_pass() { }, regex_meta: vec![], custom_regex_meta: vec![], + deny_list_data: None, gazetteer_data: Some(GazetteerMatchData { labels: vec![String::from("organization")], is_fuzzy: vec![false], @@ -109,6 +112,7 @@ fn prepared_search_emits_static_detector_entities() { source_detail: Some(SourceDetail::CustomRegex), requires_validation: false, }], + deny_list_data: None, gazetteer_data: Some(GazetteerMatchData { labels: vec![String::from("organization")], is_fuzzy: vec![false], @@ -172,6 +176,7 @@ fn prepared_search_redacts_static_entities_end_to_end() { }, regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], custom_regex_meta: vec![], + deny_list_data: None, gazetteer_data: Some(GazetteerMatchData { labels: vec![String::from("organization")], is_fuzzy: vec![false], @@ -197,6 +202,54 @@ fn prepared_search_redacts_static_entities_end_to_end() { assert_eq!(result.resolved_entities.len(), 3); } +#[test] +fn prepared_search_redacts_custom_deny_list_entities() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![], + custom_regex_patterns: vec![], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Secret Code"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: 
PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![], + custom_regex_meta: vec![], + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("matter")]], + custom_labels: vec![vec![String::from("matter")]], + originals: vec![String::from("Secret Code")], + sources: vec![vec![String::from("custom-deny-list")]], + }), + gazetteer_data: None, + country_data: None, + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Secret Code was disclosed.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!(result.detections.deny_list_entities.len(), 1); + assert_eq!(result.redaction.redacted_text, "[MATTER_1] was disclosed."); + assert_eq!(result.redaction.entity_count, 1); +} + #[test] fn prepared_search_rejects_unsupported_static_slices() { let unsupported = PatternSlice { start: 0, end: 1 }; @@ -239,3 +292,55 @@ fn prepared_search_rejects_unsupported_static_slices() { assert_eq!(error, Error::UnsupportedStaticSlice { slice }); } } + +#[test] +fn prepared_search_rejects_curated_deny_list_sources() { + let error = PreparedSearch::new(PreparedSearchConfig { + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]], + custom_labels: vec![vec![]], + originals: vec![String::from("Prague")], + sources: vec![vec![String::from("city")]], + }), + ..empty_config(PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }) + }) + .err() + .expect("curated deny-list source should be rejected"); + + assert_eq!( + error, + Error::UnsupportedDenyListSource { + source: String::from("city") + } + ); +} + +#[test] +fn prepared_search_rejects_truncated_deny_list_data() { + let error = PreparedSearch::new(PreparedSearchConfig { + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("matter")]], + custom_labels: vec![], + originals: vec![String::from("Secret Code")], + 
sources: vec![vec![String::from("custom-deny-list")]], + }), + ..empty_config(PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }) + }) + .err() + .expect("truncated deny-list data should be rejected"); + + assert_eq!( + error, + Error::StaticDataLengthMismatch { + field: "deny_list.custom_labels", + expected: 1, + actual: 0 + } + ); +} diff --git a/crates/anonymize-core/tests/processors.rs b/crates/anonymize-core/tests/processors.rs index 53774c56..9cdee5e6 100644 --- a/crates/anonymize-core/tests/processors.rs +++ b/crates/anonymize-core/tests/processors.rs @@ -1,9 +1,10 @@ #![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] use stella_anonymize_core::{ - CountryMatchData, DetectionSource, GazetteerMatchData, PatternSlice, - PipelineEntity, RegexMatchMeta, SearchMatch, SourceDetail, - process_country_matches, process_gazetteer_matches, process_regex_matches, + CountryMatchData, DenyListMatchData, DetectionSource, Error, + GazetteerMatchData, PatternSlice, PipelineEntity, RegexMatchMeta, + SearchMatch, SourceDetail, process_country_matches, + process_deny_list_matches, process_gazetteer_matches, process_regex_matches, }; #[test] @@ -104,6 +105,100 @@ fn regex_processor_preserves_custom_regex_source_detail() { assert_eq!(entities[0].source_detail, Some(SourceDetail::CustomRegex)); } +#[test] +fn deny_list_processor_emits_custom_labels() { + let matches = vec![SearchMatch::Literal { + pattern: 3, + start: 0, + end: 11, + }]; + let data = DenyListMatchData { + labels: vec![vec![String::from("matter")]], + custom_labels: vec![vec![String::from("matter")]], + originals: vec![String::from("Secret Code")], + sources: vec![vec![String::from("custom-deny-list")]], + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 3, end: 4 }, + "Secret Code filed", + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Secret 
Code"); + assert_eq!(entities[0].source, DetectionSource::DenyList); + assert_eq!( + entities[0].source_detail, + Some(SourceDetail::CustomDenyList) + ); +} + +#[test] +fn deny_list_processor_rejects_embedded_custom_word_matches() { + let matches = vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 6, + }, + SearchMatch::Literal { + pattern: 0, + start: 14, + end: 20, + }, + ]; + let data = DenyListMatchData { + labels: vec![vec![String::from("matter")]], + custom_labels: vec![vec![String::from("matter")]], + originals: vec![String::from("Secret")], + sources: vec![vec![String::from("custom-deny-list")]], + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + "Secret filed xSecret.", + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Secret"); +} + +#[test] +fn deny_list_processor_rejects_curated_sources() { + let matches = vec![SearchMatch::Literal { + pattern: 0, + start: 0, + end: 6, + }]; + let data = DenyListMatchData { + labels: vec![vec![String::from("address")]], + custom_labels: vec![vec![]], + originals: vec![String::from("Prague")], + sources: vec![vec![String::from("city")]], + }; + + let error = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + "Prague", + &data, + ) + .unwrap_err(); + + assert_eq!( + error, + Error::UnsupportedDenyListSource { + source: String::from("city") + } + ); +} + #[test] fn gazetteer_processor_extends_exact_matches_and_drops_overlapping_fuzzy() { let matches = vec![ diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index a2c86487..4936edeb 100644 --- a/crates/anonymize-napi/src/lib.rs +++ b/crates/anonymize-napi/src/lib.rs @@ -3,12 +3,13 @@ use std::collections::BTreeMap; use napi::bindgen_prelude::*; use napi_derive::napi; use stella_anonymize_adapter_contract::{ - BindingCountryMatchData, BindingGazetteerMatchData, BindingOperatorConfig, - BindingOperatorEntry, 
BindingPatternSlice, BindingPreparedSearchConfig, - BindingPreparedSearchSlices, BindingRedactionResult, BindingRegexMatchMeta, - BindingSearchOptions, BindingSearchPattern, BindingStaticRedactionResult, - ContractError, operator_config_from_binding, - prepared_search_config_from_binding, static_redaction_result_to_binding, + BindingCountryMatchData, BindingDenyListMatchData, BindingGazetteerMatchData, + BindingOperatorConfig, BindingOperatorEntry, BindingPatternSlice, + BindingPreparedSearchConfig, BindingPreparedSearchSlices, + BindingRedactionResult, BindingRegexMatchMeta, BindingSearchOptions, + BindingSearchPattern, BindingStaticRedactionResult, ContractError, + operator_config_from_binding, prepared_search_config_from_binding, + static_redaction_result_to_binding, }; use stella_anonymize_core::PreparedSearch; @@ -68,6 +69,14 @@ pub struct JsCountryMatchData { pub labels: Vec, } +#[napi(object)] +pub struct JsDenyListMatchData { + pub labels: Vec>, + pub custom_labels: Vec>, + pub originals: Vec, + pub sources: Vec>, +} + #[napi(object)] pub struct JsPreparedSearchConfig { pub regex_patterns: Vec, @@ -79,6 +88,7 @@ pub struct JsPreparedSearchConfig { pub slices: JsPreparedSearchSlices, pub regex_meta: Vec, pub custom_regex_meta: Vec, + pub deny_list_data: Option, pub gazetteer_data: Option, pub country_data: Option, } @@ -216,6 +226,14 @@ fn to_binding_config( slices: to_binding_slices(&config.slices), regex_meta: to_binding_regex_meta(config.regex_meta), custom_regex_meta: to_binding_regex_meta(config.custom_regex_meta), + deny_list_data: config.deny_list_data.map(|data| { + BindingDenyListMatchData { + labels: data.labels, + custom_labels: data.custom_labels, + originals: data.originals, + sources: data.sources, + } + }), gazetteer_data: config.gazetteer_data.map(|data| { BindingGazetteerMatchData { labels: data.labels, diff --git a/packages/anonymize/scripts/native-adapter-perf.mjs b/packages/anonymize/scripts/native-adapter-perf.mjs index 
bdb41d8f..0fe6e30e 100644 --- a/packages/anonymize/scripts/native-adapter-perf.mjs +++ b/packages/anonymize/scripts/native-adapter-perf.mjs @@ -11,6 +11,12 @@ const configJson = JSON.stringify({ regex_patterns: [{ kind: "regex", pattern: "\\b[A-Z]{2}\\d{4}\\b" }], custom_regex_patterns: [{ kind: "regex", pattern: "\\bMAT-\\d{3}\\b" }], literal_patterns: [ + { + kind: "literal-with-options", + pattern: "Secret Code", + case_insensitive: true, + whole_words: true, + }, { kind: "literal-with-options", pattern: "Acme", @@ -37,13 +43,20 @@ const configJson = JSON.stringify({ slices: { regex: { start: 0, end: 1 }, custom_regex: { start: 0, end: 1 }, - gazetteer: { start: 0, end: 2 }, - countries: { start: 2, end: 3 }, + deny_list: { start: 0, end: 1 }, + gazetteer: { start: 1, end: 3 }, + countries: { start: 3, end: 4 }, }, regex_meta: [{ label: "registration number", score: 0.9 }], custom_regex_meta: [ { label: "matter id", score: 1, source_detail: "custom-regex" }, ], + deny_list_data: { + labels: [["matter"]], + custom_labels: [["matter"]], + originals: ["Secret Code"], + sources: [["custom-deny-list"]], + }, gazetteer_data: { labels: ["organization", "address"], is_fuzzy: [false, true], @@ -162,6 +175,7 @@ function buildCases() { undefined, JSON.stringify({ operators: { country: "redact" } }), JSON.stringify({ operators: { address: "redact", country: "redact" } }), + JSON.stringify({ operators: { matter: "redact" } }), ]; const fixtureCases = []; @@ -172,7 +186,7 @@ function buildCases() { fixtureCases.push({ text: `Reference ${registration} for Acme s.r.o. near ` + - `${place}, Turkey, matter ${matter}.`, + `${place}, Turkey, matter ${matter}, code Secret Code.`, operatorsJson: operators[index % operators.length], }); } @@ -200,6 +214,15 @@ function toNapiConfig(config) { }, regexMeta: config.regex_meta.map(toNapiRegexMeta), customRegexMeta: config.custom_regex_meta.map(toNapiRegexMeta), + denyListData: + config.deny_list_data === undefined + ? 
undefined + : { + labels: config.deny_list_data.labels, + customLabels: config.deny_list_data.custom_labels, + originals: config.deny_list_data.originals, + sources: config.deny_list_data.sources, + }, gazetteerData: config.gazetteer_data === undefined ? undefined diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 880cd192..fc7e038c 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -55,6 +55,12 @@ const CONFIG_JSON = JSON.stringify({ regex_patterns: [{ kind: "regex", pattern: "\\b[A-Z]{2}\\d{4}\\b" }], custom_regex_patterns: [{ kind: "regex", pattern: "\\bMAT-\\d{3}\\b" }], literal_patterns: [ + { + kind: "literal-with-options", + pattern: "Secret Code", + case_insensitive: true, + whole_words: true, + }, { kind: "literal-with-options", pattern: "Acme", @@ -81,13 +87,20 @@ const CONFIG_JSON = JSON.stringify({ slices: { regex: { start: 0, end: 1 }, custom_regex: { start: 0, end: 1 }, - gazetteer: { start: 0, end: 2 }, - countries: { start: 2, end: 3 }, + deny_list: { start: 0, end: 1 }, + gazetteer: { start: 1, end: 3 }, + countries: { start: 3, end: 4 }, }, regex_meta: [{ label: "registration number", score: 0.9 }], custom_regex_meta: [ { label: "matter id", score: 1, source_detail: "custom-regex" }, ], + deny_list_data: { + labels: [["matter"]], + custom_labels: [["matter"]], + originals: ["Secret Code"], + sources: [["custom-deny-list"]], + }, gazetteer_data: { labels: ["organization", "address"], is_fuzzy: [false, true], @@ -180,6 +193,7 @@ const operatorsArb = fc.option( { country: "redact" }, { address: "redact", country: "redact" }, { "matter id": "redact" }, + { matter: "redact" }, ), { nil: null }, ); @@ -198,7 +212,8 @@ const generatedCaseArb: fc.Arbitrary = fc ({ left, middle, right, registration, matter, fuzzyPlace, operators }) => { const text = `${left}Reference 
${registration} for Acme s.r.o. near ` + - `${fuzzyPlace}, Turkey, matter ${matter}.${middle}${right}`; + `${fuzzyPlace}, Turkey, matter ${matter}, code Secret Code.` + + `${middle}${right}`; return { text, operators, @@ -208,6 +223,7 @@ const generatedCaseArb: fc.Arbitrary = fc fuzzyPlace, "Turkey", matter, + "Secret Code", ], }; }, @@ -243,7 +259,7 @@ describe("native adapter parity", () => { for (const [index, item] of cases.entries()) { const result = tsResults.at(index); expect(result).toBeDefined(); - expect(result?.redaction.entity_count).toBe(5); + expect(result?.redaction.entity_count).toBe(6); for (const value of item.sensitiveValues) { expect(result?.redaction.redacted_text).not.toContain(value); } From 11fb186ff1e6c9ffe66154831e4c39f9cd9503f7 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 12:30:37 +0200 Subject: [PATCH 021/130] fix: tighten cli dictionary scope type --- packages/cli/src/dictionary-scope.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/cli/src/dictionary-scope.ts b/packages/cli/src/dictionary-scope.ts index bcb773e4..4e056aac 100644 --- a/packages/cli/src/dictionary-scope.ts +++ b/packages/cli/src/dictionary-scope.ts @@ -34,7 +34,7 @@ const pickKeys = ( }; /** Dictionaries with every section present (possibly empty). 
*/ -export type ScopedDictionaries = Dictionaries & { +export type ScopedDictionaries = { firstNames: Record; surnames: Record; denyList: Record; From 6947fabe24375fe89a3a50c74af168865201007e Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 12:30:41 +0200 Subject: [PATCH 022/130] test: report migration fixture runtime coverage --- .../scripts/migration-fixture-perf.mjs | 200 +++++++++++++++++- 1 file changed, 199 insertions(+), 1 deletion(-) diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 5824b43f..fa1c2323 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -26,6 +26,8 @@ const BASELINE_REF = process.env.ANONYMIZE_MIGRATION_BASELINE_REF ?? "origin/main"; const COMPARE_BASELINE = process.env.ANONYMIZE_MIGRATION_COMPARE_BASELINE !== "0"; +const REQUIRE_NATIVE_PIPELINE = + process.env.ANONYMIZE_MIGRATION_REQUIRE_NATIVE_PIPELINE === "1"; const WARM_ITERATIONS = positiveIntegerEnv( "ANONYMIZE_MIGRATION_WARM_ITERATIONS", 2, @@ -67,6 +69,15 @@ async function runCoordinator() { }); printVariantSummary(candidate); + if ( + REQUIRE_NATIVE_PIPELINE && + !candidate.nativeRewrite.measuredInPipeline + ) { + throw new Error( + "Native pipeline is required, but the candidate run used the TypeScript pipeline", + ); + } + if (baseline !== null) { const comparison = compareSnapshots(baseline, candidate); console.log(JSON.stringify(comparison)); @@ -150,8 +161,9 @@ async function runWorker() { const context = indexModule.createPipelineContext(); const prepareStart = Bun.nanoseconds(); - await indexModule.preparePipelineSearch({ config, context }); + const search = await indexModule.preparePipelineSearch({ config, context }); const prepareMs = elapsedMs(prepareStart); + const nativeRewrite = describeNativeRewrite(config, search); const coldRun = await runFixtureSweep({ indexModule, @@ -175,6 +187,7 @@ async 
function runWorker() { const warmRunMs = roundMs(warmRuns.reduce((sum, run) => sum + run.ms, 0)); const warmAvgMs = WARM_ITERATIONS === 0 ? 0 : roundMs(warmRunMs / WARM_ITERATIONS); + const fixtureTimings = summarizeFixtureTimings(coldRun, warmRuns); const snapshots = Object.fromEntries( coldRun.fixtures.map((fixture) => [fixture.fixture, fixture.snapshot]), ); @@ -184,6 +197,8 @@ async function runWorker() { `${JSON.stringify({ event: "fixture-migration-variant", variant, + pipelineRuntime: "typescript", + nativeRewrite, fixtureCount: fixtures.length, warmIterations: WARM_ITERATIONS, timings: { @@ -193,9 +208,11 @@ async function runWorker() { coldRunMs: coldRun.ms, coldPipelineMs: roundMs(dictionaryMs + prepareMs + coldRun.ms), coldTotalMs: roundMs(importMs + dictionaryMs + prepareMs + coldRun.ms), + warmRunMsByIteration: warmRuns.map((run) => run.ms), warmRunMs, warmAvgMs, }, + fixtureTimings, fixtures: coldRun.fixtures.map( ({ fixture, ms, entityCount, redactedTextLength }) => ({ fixture, @@ -292,6 +309,10 @@ function compareSnapshots(baseline, candidate) { fixtureCount: fixtureNames.size, mismatches, timingComparison: timingComparison(baseline, candidate), + nativeRewrite: { + baseline: baseline.nativeRewrite, + candidate: candidate.nativeRewrite, + }, }; } @@ -358,14 +379,191 @@ function printVariantSummary(result) { JSON.stringify({ event: result.event, variant: result.variant, + pipelineRuntime: result.pipelineRuntime, + nativeRewrite: result.nativeRewrite, fixtureCount: result.fixtureCount, warmIterations: result.warmIterations, timings: result.timings, + fixtureTimings: result.fixtureTimings, fixtures: result.fixtures, }), ); } +function describeNativeRewrite(config, search) { + const sliceLengths = Object.fromEntries( + Object.entries(search.slices).map(([name, slice]) => [ + name, + sliceLength(slice), + ]), + ); + const regexValidationSlots = countRegexValidationSlots(search.regexMeta); + const denyListSourceCounts = 
countDenyListSources(search.denyListData); + const unsupportedSearchSlots = [ + unsupportedSlot("regex", regexValidationSlots, "regex validators"), + unsupportedSlot("triggers", sliceLengths.triggers, "trigger extraction"), + unsupportedSlot("streetTypes", sliceLengths.streetTypes, "address seeds"), + unsupportedSlot( + "denyList", + denyListSourceCounts.curated, + "curated deny-list semantics", + ), + ].filter((slot) => slot.count > 0); + const supportedSearchSlots = + Math.max(0, sliceLengths.regex - regexValidationSlots) + + sliceLengths.customRegex + + denyListSourceCounts.customOnly + + sliceLengths.gazetteer + + sliceLengths.countries; + const totalSearchSlots = Object.values(sliceLengths).reduce( + (sum, length) => sum + length, + 0, + ); + const unsupportedPipelineStages = describeUnsupportedPipelineStages( + config, + search, + denyListSourceCounts, + ); + + return { + measuredInPipeline: false, + pipelineRuntime: "typescript", + fullPipelineNativeEligible: + unsupportedSearchSlots.length === 0 && + unsupportedPipelineStages.length === 0, + searchSlotCoverage: { + supported: supportedSearchSlots, + total: totalSearchSlots, + ratio: + totalSearchSlots === 0 + ? 
1 + : roundMs(supportedSearchSlots / totalSearchSlots), + }, + sliceLengths, + unsupportedSearchSlots, + unsupportedPipelineStages, + }; +} + +function describeUnsupportedPipelineStages( + config, + search, + denyListSourceCounts, +) { + const stages = []; + if (config.enableLegalForms) { + stages.push("legal-forms-v2"); + } + if (config.enableTriggerPhrases) { + stages.push("triggers"); + } + if (config.enableDenyList && denyListSourceCounts.curated > 0) { + stages.push("curated-deny-list"); + } + if (config.enableNameCorpus) { + stages.push("name-corpus"); + } + if (config.enableHotwordRules) { + stages.push("hotword-rules"); + } + if (config.enableZoneClassification) { + stages.push("zone-classification"); + } + if (config.enableConfidenceBoost) { + stages.push("confidence-boost"); + } + if (config.enableCoreference) { + stages.push("coreference"); + } + if (sliceLength(search.slices.streetTypes) > 0) { + stages.push("address-seeds"); + } + + stages.push("signatures", "false-positive-filters", "final-extensions"); + return stages; +} + +function countRegexValidationSlots(regexMeta) { + return regexMeta.reduce( + (count, meta) => count + (meta.requiresValidation === true ? 1 : 0), + 0, + ); +} + +function countDenyListSources(denyListData) { + if (!denyListData) { + return { customOnly: 0, curated: 0 }; + } + + let customOnly = 0; + let curated = 0; + for (const sources of denyListData.sources) { + const sourceList = Array.isArray(sources) ? sources : [sources]; + if ( + sourceList.length > 0 && + sourceList.every((source) => source === "custom-deny-list") + ) { + customOnly += 1; + } else { + curated += 1; + } + } + + return { customOnly, curated }; +} + +function unsupportedSlot(slice, count, reason) { + return { slice, count, reason }; +} + +function sliceLength(slice) { + return Math.max(0, Number(slice.end ?? 0) - Number(slice.start ?? 
0)); +} + +function summarizeFixtureTimings(coldRun, warmRuns) { + return { + cold: summarizeRunFixtures(coldRun.fixtures), + warm: + warmRuns.length === 0 + ? null + : summarizeRunFixtures(warmRuns.flatMap((run) => run.fixtures)), + byFixture: coldRun.fixtures.map((coldFixture, index) => { + const warmMs = warmRuns + .map((run) => run.fixtures.at(index)?.ms) + .filter((ms) => typeof ms === "number"); + return { + fixture: coldFixture.fixture, + coldMs: coldFixture.ms, + warmAvgMs: + warmMs.length === 0 + ? null + : roundMs(warmMs.reduce((sum, ms) => sum + ms, 0) / warmMs.length), + }; + }), + }; +} + +function summarizeRunFixtures(fixtures) { + const values = fixtures.map((fixture) => fixture.ms).sort((a, b) => a - b); + return { + minMs: percentile(values, 0), + p50Ms: percentile(values, 0.5), + p95Ms: percentile(values, 0.95), + maxMs: percentile(values, 1), + }; +} + +function percentile(values, fraction) { + if (values.length === 0) { + return 0; + } + const index = Math.min( + values.length - 1, + Math.max(0, Math.ceil(values.length * fraction) - 1), + ); + return roundMs(values[index]); +} + function firstDifferentIndex(left, right) { const len = Math.max(left.length, right.length); for (let index = 0; index < len; index += 1) { From 7adb235a9c119d9ca13b31fecb2d0a63ea4dcaff Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 19:42:39 +0200 Subject: [PATCH 023/130] feat: wire native static redaction --- Cargo.lock | 22 +- crates/anonymize-adapter-contract/src/lib.rs | 210 ++- crates/anonymize-core/Cargo.toml | 4 +- crates/anonymize-core/src/byte_offsets.rs | 62 + crates/anonymize-core/src/diagnostics.rs | 203 +++ crates/anonymize-core/src/lib.rs | 18 +- crates/anonymize-core/src/normalize.rs | 98 ++ crates/anonymize-core/src/prepared.rs | 319 +++- crates/anonymize-core/src/processors.rs | 1297 ++++++++++++++++- crates/anonymize-core/src/redact.rs | 11 +- .../anonymize-core/src/resolution/boundary.rs | 18 +- 
.../anonymize-core/src/resolution/common.rs | 4 +- crates/anonymize-core/src/resolution/mod.rs | 1 + .../anonymize-core/src/resolution/sanitize.rs | 61 +- crates/anonymize-core/src/search.rs | 476 +++--- crates/anonymize-core/src/types.rs | 60 +- crates/anonymize-core/src/utf16.rs | 66 - crates/anonymize-core/tests/normalize.rs | 4 +- crates/anonymize-core/tests/prepared.rs | 125 +- crates/anonymize-core/tests/processors.rs | 53 +- crates/anonymize-core/tests/redaction.rs | 18 +- crates/anonymize-core/tests/resolution.rs | 48 +- crates/anonymize-core/tests/search.rs | 8 +- crates/anonymize-napi/src/lib.rs | 135 +- crates/anonymize-py/src/lib.rs | 60 +- .../scripts/migration-fixture-perf.mjs | 443 +++++- .../anonymize/scripts/native-adapter-perf.mjs | 58 +- .../__test__/native-adapter-parity.test.ts | 205 ++- .../anonymize/src/build-unified-search.ts | 359 +++++ .../anonymize/src/data/deny-list-filters.json | 81 + packages/anonymize/src/detectors/deny-list.ts | 266 ++-- packages/anonymize/src/detectors/regex.ts | 34 +- 32 files changed, 4073 insertions(+), 754 deletions(-) create mode 100644 crates/anonymize-core/src/byte_offsets.rs create mode 100644 crates/anonymize-core/src/diagnostics.rs delete mode 100644 crates/anonymize-core/src/utf16.rs create mode 100644 packages/anonymize/src/data/deny-list-filters.json diff --git a/Cargo.lock b/Cargo.lock index a0b687aa..5f81042e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -437,7 +437,7 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "stella-aho-corasick-core" version = "1.0.4" -source = "git+https://github.com/stella/aho-corasick?rev=ad5dfa06c1be8bffda75e050030fa4e70b93c75f#ad5dfa06c1be8bffda75e050030fa4e70b93c75f" +source = "git+https://github.com/stella/aho-corasick?rev=b67e487aedc17725f57fb5b5d60678475b6b6667#b67e487aedc17725f57fb5b5d60678475b6b6667" dependencies = [ "aho-corasick", "unicode-case-mapping", @@ -456,9 +456,7 @@ dependencies = [ name = 
"stella-anonymize-core" version = "1.5.0" dependencies = [ - "stella-aho-corasick-core", - "stella-fuzzy-search-core", - "stella-regex-set-core", + "stella-text-search-core", ] [[package]] @@ -486,8 +484,8 @@ dependencies = [ [[package]] name = "stella-fuzzy-search-core" -version = "1.1.2" -source = "git+https://github.com/stella/fuzzy-search?rev=4ccb8ced60d8f2ff7f5d1870d2931556e8247632#4ccb8ced60d8f2ff7f5d1870d2931556e8247632" +version = "1.1.3" +source = "git+https://github.com/stella/fuzzy-search?rev=0743b9c6710a84bb7e6863fdcda9a9cc1dce4fa2#0743b9c6710a84bb7e6863fdcda9a9cc1dce4fa2" dependencies = [ "unicode-case-mapping", "unicode-normalization", @@ -497,7 +495,7 @@ dependencies = [ [[package]] name = "stella-regex-set-core" version = "1.0.5" -source = "git+https://github.com/stella/regex-set?rev=a50fdc018b40d23ecf732be85b00d495bd6d95cf#a50fdc018b40d23ecf732be85b00d495bd6d95cf" +source = "git+https://github.com/stella/regex-set?rev=8b80241a5a54cef8fdc6b6b34119981db0c6f597#8b80241a5a54cef8fdc6b6b34119981db0c6f597" dependencies = [ "fancy-regex", "regex", @@ -506,6 +504,16 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "stella-text-search-core" +version = "1.0.6" +source = "git+https://github.com/stella/text-search?rev=e427c5e8f5c13a0edc8503d24c4d4b34cbf46e8e#e427c5e8f5c13a0edc8503d24c4d4b34cbf46e8e" +dependencies = [ + "stella-aho-corasick-core", + "stella-fuzzy-search-core", + "stella-regex-set-core", +] + [[package]] name = "syn" version = "2.0.118" diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index bbb7e5db..484119ce 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -1,11 +1,13 @@ -use std::collections::BTreeMap; +use std::collections::{BTreeMap, BTreeSet}; use serde::{Deserialize, Serialize}; use stella_anonymize_core::{ - CountryMatchData, DenyListMatchData, DetectionSource, FuzzySearchOptions, + CountryMatchData, 
DenyListFilterData, DenyListMatchData, DetectionSource, + DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, FuzzySearchOptions, GazetteerMatchData, LiteralSearchOptions, OperatorConfig, OperatorType, PatternSlice, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, - RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, + RegexSearchOptions, SearchEngine, SearchOptions, SearchPattern, SourceDetail, + StaticRedactionDiagnosticResult, StaticRedactionDiagnostics, StaticRedactionResult, }; @@ -47,6 +49,10 @@ pub struct BindingSearchPattern { pub distance: Option, pub case_insensitive: Option, pub whole_words: Option, + pub lazy: Option, + pub prefilter_any: Option>, + pub prefilter_case_insensitive: Option, + pub prefilter_regex: Option, } #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] @@ -87,6 +93,7 @@ pub struct BindingRegexMatchMeta { pub score: f64, pub source_detail: Option, pub requires_validation: Option, + pub min_byte_length: Option, } #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] @@ -106,6 +113,21 @@ pub struct BindingDenyListMatchData { pub custom_labels: Vec>, pub originals: Vec, pub sources: Vec>, + pub filters: Option, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingDenyListFilterData { + pub stopwords: Vec, + pub allow_list: Vec, + pub person_stopwords: Vec, + pub address_stopwords: Vec, + pub street_types: Vec, + pub first_names: Vec, + pub generic_roles: Vec, + pub sentence_starters: Vec, + pub trailing_address_word_exclusions: Vec, + pub defined_term_cues: Vec, } #[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] @@ -179,6 +201,51 @@ pub struct BindingStaticRedactionResult { pub redaction: BindingRedactionResult, } +#[derive(Clone, Debug, PartialEq, Serialize)] +pub struct BindingDiagnosticEvent { + pub stage: String, + pub kind: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub count: Option, + 
#[serde(skip_serializing_if = "Option::is_none")] + pub engine: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub pattern: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub source: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub source_detail: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub label: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub start: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub end: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub text: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub score: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub span_valid: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub elapsed_us: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub input_bytes: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub reason: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize)] +pub struct BindingStaticRedactionDiagnostics { + pub events: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize)] +pub struct BindingStaticRedactionDiagnosticResult { + pub result: BindingStaticRedactionResult, + pub diagnostics: BindingStaticRedactionDiagnostics, +} + pub fn prepared_search_config_from_binding( config: BindingPreparedSearchConfig, ) -> Result { @@ -201,6 +268,7 @@ pub fn prepared_search_config_from_binding( custom_labels: data.custom_labels, originals: data.originals, sources: data.sources, + filters: data.filters.map(deny_list_filters_from_binding), }), gazetteer_data: config.gazetteer_data.map(|data| GazetteerMatchData { labels: data.labels, @@ -275,6 +343,78 @@ pub fn static_redaction_result_to_binding( } } +#[must_use] +pub fn static_redaction_diagnostic_result_to_binding( + result: StaticRedactionDiagnosticResult, +) -> BindingStaticRedactionDiagnosticResult { + BindingStaticRedactionDiagnosticResult { + result: 
static_redaction_result_to_binding(result.result), + diagnostics: static_redaction_diagnostics_to_binding(result.diagnostics), + } +} + +#[must_use] +pub fn static_redaction_diagnostics_to_binding( + diagnostics: StaticRedactionDiagnostics, +) -> BindingStaticRedactionDiagnostics { + BindingStaticRedactionDiagnostics { + events: diagnostics + .events + .into_iter() + .map(diagnostic_event_to_binding) + .collect(), + } +} + +fn diagnostic_event_to_binding( + event: DiagnosticEvent, +) -> BindingDiagnosticEvent { + BindingDiagnosticEvent { + stage: diagnostic_stage_name(event.stage), + kind: diagnostic_event_kind_name(event.kind), + count: event.count, + engine: event.engine.map(search_engine_name), + pattern: event.pattern, + source: event.source.map(detection_source_name), + source_detail: event.source_detail.map(source_detail_name), + label: event.label, + start: event.start, + end: event.end, + text: event.text, + score: event.score, + span_valid: event.span_valid, + elapsed_us: event.elapsed_us, + input_bytes: event.input_bytes, + reason: event.reason, + } +} + +fn deny_list_filters_from_binding( + filters: BindingDenyListFilterData, +) -> DenyListFilterData { + DenyListFilterData { + stopwords: lower_set(filters.stopwords), + allow_list: lower_set(filters.allow_list), + person_stopwords: lower_set(filters.person_stopwords), + address_stopwords: lower_set(filters.address_stopwords), + street_types: lower_set(filters.street_types), + first_names: lower_set(filters.first_names), + generic_roles: lower_set(filters.generic_roles), + sentence_starters: lower_set(filters.sentence_starters), + trailing_address_word_exclusions: lower_set( + filters.trailing_address_word_exclusions, + ), + defined_term_cues: lower_set(filters.defined_term_cues), + } +} + +fn lower_set(values: Vec) -> BTreeSet { + values + .into_iter() + .map(|value| value.to_lowercase()) + .collect() +} + fn search_patterns_from_binding( patterns: Vec, ) -> Result> { @@ -294,7 +434,22 @@ fn 
search_pattern_from_binding( case_insensitive: pattern.case_insensitive, whole_words: pattern.whole_words, }), - "regex" => Ok(SearchPattern::Regex(pattern.pattern)), + "regex" => { + if pattern.lazy.is_some() + || pattern.prefilter_any.is_some() + || pattern.prefilter_case_insensitive.is_some() + || pattern.prefilter_regex.is_some() + { + return Ok(SearchPattern::RegexWithOptions { + pattern: pattern.pattern, + lazy: pattern.lazy.unwrap_or(false), + prefilter_any: pattern.prefilter_any.unwrap_or_default(), + prefilter_case_insensitive: pattern.prefilter_case_insensitive, + prefilter_regex: pattern.prefilter_regex, + }); + } + Ok(SearchPattern::Regex(pattern.pattern)) + } "fuzzy" => Ok(SearchPattern::Fuzzy { pattern: pattern.pattern, distance: pattern @@ -370,6 +525,7 @@ fn regex_meta_from_binding( .map(|value| source_detail_from_binding(&value)) .transpose()?, requires_validation: entry.requires_validation.unwrap_or(false), + min_byte_length: entry.min_byte_length, }) }) .collect() @@ -419,6 +575,52 @@ fn source_detail_name(detail: SourceDetail) -> String { .to_owned() } +fn search_engine_name(engine: SearchEngine) -> String { + match engine { + SearchEngine::Literal => "literal", + SearchEngine::Regex => "regex", + SearchEngine::Fuzzy => "fuzzy", + SearchEngine::Text => "text-search", + } + .to_owned() +} + +fn diagnostic_stage_name(stage: DiagnosticStage) -> String { + match stage { + DiagnosticStage::PrepareTotal => "prepare.total", + DiagnosticStage::PrepareRegex => "prepare.regex", + DiagnosticStage::PrepareCustomRegex => "prepare.custom-regex", + DiagnosticStage::PrepareLiteral => "prepare.literal", + DiagnosticStage::Normalize => "normalize", + DiagnosticStage::FindMatches => "find-matches", + DiagnosticStage::FindRegex => "find.regex", + DiagnosticStage::FindCustomRegex => "find.custom-regex", + DiagnosticStage::FindLiteral => "find.literal", + DiagnosticStage::SearchRegex => "search.regex", + DiagnosticStage::SearchCustomRegex => "search.custom-regex", + 
DiagnosticStage::SearchLiteral => "search.literal", + DiagnosticStage::EntityRegex => "entity.regex", + DiagnosticStage::EntityCustomRegex => "entity.custom-regex", + DiagnosticStage::EntityDenyList => "entity.deny-list", + DiagnosticStage::EntityGazetteer => "entity.gazetteer", + DiagnosticStage::EntityCountry => "entity.country", + DiagnosticStage::Merge => "resolution.merge", + DiagnosticStage::Boundary => "resolution.boundary", + DiagnosticStage::Sanitize => "resolution.sanitize", + DiagnosticStage::Redaction => "redaction", + } + .to_owned() +} + +fn diagnostic_event_kind_name(kind: DiagnosticEventKind) -> String { + match kind { + DiagnosticEventKind::StageSummary => "stage-summary", + DiagnosticEventKind::SearchMatch => "search-match", + DiagnosticEventKind::Entity => "entity", + } + .to_owned() +} + fn operator_name(operator: OperatorType) -> String { match operator { OperatorType::Replace => "replace", diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index a115528b..55871a1e 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -10,9 +10,7 @@ keywords = ["anonymization", "pii", "redaction", "text"] categories = ["text-processing"] [dependencies] -stella-aho-corasick-core = { version = "1.0.4", git = "https://github.com/stella/aho-corasick", rev = "ad5dfa06c1be8bffda75e050030fa4e70b93c75f" } -stella-fuzzy-search-core = { version = "1.1.2", git = "https://github.com/stella/fuzzy-search", rev = "4ccb8ced60d8f2ff7f5d1870d2931556e8247632" } -stella-regex-set-core = { version = "1.0.5", git = "https://github.com/stella/regex-set", rev = "a50fdc018b40d23ecf732be85b00d495bd6d95cf" } +stella-text-search-core = { git = "https://github.com/stella/text-search", rev = "e427c5e8f5c13a0edc8503d24c4d4b34cbf46e8e" } [lints] workspace = true diff --git a/crates/anonymize-core/src/byte_offsets.rs b/crates/anonymize-core/src/byte_offsets.rs new file mode 100644 index 00000000..b4119355 --- /dev/null +++ 
b/crates/anonymize-core/src/byte_offsets.rs @@ -0,0 +1,62 @@ +use crate::types::{Error, Result}; + +pub(crate) struct ByteOffsets<'a> { + text: &'a str, +} + +impl<'a> ByteOffsets<'a> { + pub(crate) const fn new(text: &'a str) -> Self { + Self { text } + } + + pub(crate) fn len(&self) -> Result { + u32::try_from(self.text.len()) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }) + } + + pub(crate) fn validate_offset(&self, offset: u32) -> Result { + let index = usize::try_from(offset) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset })?; + if index > self.text.len() { + return Err(Error::ByteOffsetOutOfBounds { offset }); + } + if !self.text.is_char_boundary(index) { + return Err(Error::ByteOffsetInsideCodepoint { offset }); + } + Ok(index) + } + + pub(crate) fn floor_offset(&self, offset: u32) -> Result { + let mut index = usize::try_from(offset) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset })?; + if index > self.text.len() { + index = self.text.len(); + } + while index > 0 && !self.text.is_char_boundary(index) { + index = index.saturating_sub(1); + } + u32::try_from(index) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }) + } + + pub(crate) fn slice( + &self, + full_text: &str, + start: u32, + end: u32, + ) -> Result { + if start > end { + return Err(Error::InvalidSpan { start, end }); + } + + let start_byte = self.validate_offset(start)?; + let end_byte = self.validate_offset(end)?; + + Ok( + full_text + .get(start_byte..end_byte) + .ok_or(Error::InvalidSpan { start, end })? 
+ .to_owned(), + ) + } +} diff --git a/crates/anonymize-core/src/diagnostics.rs b/crates/anonymize-core/src/diagnostics.rs new file mode 100644 index 00000000..81578156 --- /dev/null +++ b/crates/anonymize-core/src/diagnostics.rs @@ -0,0 +1,203 @@ +use crate::byte_offsets::ByteOffsets; +use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; +use crate::types::{RedactionResult, SearchEngine, SearchMatch}; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum DiagnosticStage { + PrepareTotal, + PrepareRegex, + PrepareCustomRegex, + PrepareLiteral, + Normalize, + FindMatches, + FindRegex, + FindCustomRegex, + FindLiteral, + SearchRegex, + SearchCustomRegex, + SearchLiteral, + EntityRegex, + EntityCustomRegex, + EntityDenyList, + EntityGazetteer, + EntityCountry, + Merge, + Boundary, + Sanitize, + Redaction, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum DiagnosticEventKind { + StageSummary, + SearchMatch, + Entity, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct DiagnosticEvent { + pub stage: DiagnosticStage, + pub kind: DiagnosticEventKind, + pub count: Option, + pub engine: Option, + pub pattern: Option, + pub source: Option, + pub source_detail: Option, + pub label: Option, + pub start: Option, + pub end: Option, + pub text: Option, + pub score: Option, + pub span_valid: Option, + pub elapsed_us: Option, + pub input_bytes: Option, + pub reason: Option, +} + +#[derive(Clone, Debug, Default, PartialEq)] +pub struct StaticRedactionDiagnostics { + pub events: Vec, +} + +impl StaticRedactionDiagnostics { + pub(crate) fn record_search_matches( + &mut self, + stage: DiagnosticStage, + matches: &[SearchMatch], + full_text: &str, + elapsed_us: Option, + ) { + self.record_stage( + stage, + Some(matches.len()), + elapsed_us, + Some(full_text.len()), + ); + + let offsets = ByteOffsets::new(full_text); + for found in matches { + let span_valid = span_slices(&offsets, found.start(), found.end()); + let text = span_valid + .then(|| 
offsets.slice(full_text, found.start(), found.end()).ok()) + .flatten(); + self.events.push(DiagnosticEvent { + stage, + kind: DiagnosticEventKind::SearchMatch, + count: None, + engine: Some(found.engine()), + pattern: Some(found.pattern()), + source: None, + source_detail: None, + label: None, + start: Some(found.start()), + end: Some(found.end()), + text, + score: None, + span_valid: Some(span_valid), + elapsed_us: None, + input_bytes: None, + reason: None, + }); + } + } + + pub(crate) fn record_entities( + &mut self, + stage: DiagnosticStage, + entities: &[PipelineEntity], + full_text: &str, + elapsed_us: Option, + ) { + self.record_stage( + stage, + Some(entities.len()), + elapsed_us, + Some(full_text.len()), + ); + + let offsets = ByteOffsets::new(full_text); + for entity in entities { + self.events.push(DiagnosticEvent { + stage, + kind: DiagnosticEventKind::Entity, + count: None, + engine: None, + pattern: None, + source: Some(entity.source), + source_detail: entity.source_detail, + label: Some(entity.label.clone()), + start: Some(entity.start), + end: Some(entity.end), + text: Some(entity.text.clone()), + score: Some(entity.score), + span_valid: Some(span_slices(&offsets, entity.start, entity.end)), + elapsed_us: None, + input_bytes: None, + reason: None, + }); + } + } + + pub(crate) fn record_redaction( + &mut self, + result: &RedactionResult, + elapsed_us: Option, + input_bytes: usize, + ) { + self.events.push(DiagnosticEvent { + stage: DiagnosticStage::Redaction, + kind: DiagnosticEventKind::StageSummary, + count: Some(result.entity_count), + engine: None, + pattern: None, + source: None, + source_detail: None, + label: None, + start: None, + end: None, + text: None, + score: None, + span_valid: None, + elapsed_us, + input_bytes: Some(input_bytes), + reason: None, + }); + } + + pub(crate) fn record_stage( + &mut self, + stage: DiagnosticStage, + count: Option, + elapsed_us: Option, + input_bytes: Option, + ) { + self.events.push(DiagnosticEvent { + 
stage, + kind: DiagnosticEventKind::StageSummary, + count, + engine: None, + pattern: None, + source: None, + source_detail: None, + label: None, + start: None, + end: None, + text: None, + score: None, + span_valid: None, + elapsed_us, + input_bytes, + reason: None, + }); + } + + pub fn extend(&mut self, other: Self) { + self.events.extend(other.events); + } +} + +fn span_slices(offsets: &ByteOffsets<'_>, start: u32, end: u32) -> bool { + start <= end + && offsets.validate_offset(start).is_ok() + && offsets.validate_offset(end).is_ok() +} diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index f26da0f2..0a0e9db5 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -2,6 +2,8 @@ //! Core anonymization contracts shared by host-language bindings. +pub(crate) mod byte_offsets; +mod diagnostics; pub(crate) mod normalize; mod placeholders; mod prepared; @@ -10,18 +12,22 @@ mod redact; mod resolution; mod search; mod types; -pub(crate) mod utf16; +pub use diagnostics::{ + DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, + StaticRedactionDiagnostics, +}; pub use normalize::normalize_for_search; pub use placeholders::build_placeholder_map; pub use prepared::{ - PreparedSearch, PreparedSearchConfig, PreparedSearchMatches, - PreparedSearchSlices, StaticDetectionResult, StaticRedactionResult, + PreparedSearch, PreparedSearchBuildResult, PreparedSearchConfig, + PreparedSearchMatches, PreparedSearchSlices, StaticDetectionResult, + StaticRedactionDiagnosticResult, StaticRedactionResult, }; pub use processors::{ - CountryMatchData, DenyListMatchData, GazetteerMatchData, PatternSlice, - RegexMatchMeta, process_country_matches, process_deny_list_matches, - process_gazetteer_matches, process_regex_matches, + CountryMatchData, DenyListFilterData, DenyListMatchData, GazetteerMatchData, + PatternSlice, RegexMatchMeta, process_country_matches, + process_deny_list_matches, process_gazetteer_matches, 
process_regex_matches, }; pub use redact::{deanonymise, redact_text}; pub use resolution::{ diff --git a/crates/anonymize-core/src/normalize.rs b/crates/anonymize-core/src/normalize.rs index 99b92a8b..68321de3 100644 --- a/crates/anonymize-core/src/normalize.rs +++ b/crates/anonymize-core/src/normalize.rs @@ -2,6 +2,34 @@ const PHONE_NOISE: [char; 3] = ['(', ')', '-']; const ID_SEPARATORS: [char; 3] = ['-', '/', '.']; const IDENTIFIER_CUES: &str = include_str!("../data/identifier-cues.txt"); +use crate::types::{Error, Result}; + +pub(crate) struct NormalizedSearchText { + text: String, + byte_to_original: Option>, +} + +impl NormalizedSearchText { + pub(crate) fn as_str(&self) -> &str { + &self.text + } + + pub(crate) fn map_span(&self, start: u32, end: u32) -> Result<(u32, u32)> { + if start > end { + return Err(Error::InvalidSpan { start, end }); + } + + let Some(byte_to_original) = &self.byte_to_original else { + return Ok((start, end)); + }; + + Ok(( + map_normalized_offset(byte_to_original, start)?, + map_normalized_offset(byte_to_original, end)?, + )) + } +} + #[must_use] pub fn normalize_for_search(text: &str) -> String { let mut has_replacement = false; @@ -22,6 +50,76 @@ pub fn normalize_for_search(text: &str) -> String { output } +pub(crate) fn normalize_for_search_with_byte_map( + text: &str, +) -> Result { + let mut has_replacement = false; + for ch in text.chars() { + if replacement_char(ch) != ch { + has_replacement = true; + break; + } + } + if !has_replacement { + return Ok(NormalizedSearchText { + text: text.to_owned(), + byte_to_original: None, + }); + } + + let mut output = String::with_capacity(text.len()); + let mut byte_to_original = vec![0_u32]; + for (original_start, ch) in text.char_indices() { + set_boundary( + &mut byte_to_original, + output.len(), + checked_u32(original_start)?, + ); + output.push(replacement_char(ch)); + set_boundary( + &mut byte_to_original, + output.len(), + checked_u32(original_start.saturating_add(ch.len_utf8()))?, 
+ ); + } + + Ok(NormalizedSearchText { + text: output, + byte_to_original: Some(byte_to_original), + }) +} + +fn set_boundary( + byte_to_original: &mut Vec, + normalized_offset: usize, + original_offset: u32, +) { + if byte_to_original.len() <= normalized_offset { + byte_to_original.resize(normalized_offset.saturating_add(1), u32::MAX); + } + if let Some(slot) = byte_to_original.get_mut(normalized_offset) { + *slot = original_offset; + } +} + +fn map_normalized_offset(byte_to_original: &[u32], offset: u32) -> Result { + let index = usize::try_from(offset) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset })?; + let mapped = byte_to_original + .get(index) + .copied() + .ok_or(Error::ByteOffsetOutOfBounds { offset })?; + if mapped == u32::MAX { + return Err(Error::ByteOffsetInsideCodepoint { offset }); + } + Ok(mapped) +} + +fn checked_u32(offset: usize) -> Result { + u32::try_from(offset) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }) +} + // Normalization decides placeholder identity. 
pub(crate) fn label_key(label: &str) -> String { let uppercase = uppercase(label); diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 68534c19..f46b2f5e 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -1,13 +1,16 @@ -use crate::normalize::normalize_for_search; +use std::time::Instant; + +use crate::diagnostics::{DiagnosticStage, StaticRedactionDiagnostics}; +use crate::normalize::normalize_for_search_with_byte_map; use crate::processors::{ CountryMatchData, DenyListMatchData, GazetteerMatchData, PatternSlice, - RegexMatchMeta, ensure_custom_deny_list_sources, process_country_matches, + RegexMatchMeta, ensure_supported_deny_list_sources, process_country_matches, process_deny_list_matches, process_gazetteer_matches, process_regex_matches, }; use crate::redact::redact_text; use crate::resolution::{ PipelineEntity, enforce_boundary_consistency, merge_and_dedup, - sanitize_entities, + sanitize_entities_with_source, }; use crate::search::{SearchIndex, SearchOptions, SearchPattern}; use crate::types::{ @@ -79,20 +82,95 @@ pub struct StaticRedactionResult { pub redaction: RedactionResult, } +#[derive(Clone, Debug, PartialEq)] +pub struct StaticRedactionDiagnosticResult { + pub result: StaticRedactionResult, + pub diagnostics: StaticRedactionDiagnostics, +} + +pub struct PreparedSearchBuildResult { + pub prepared: PreparedSearch, + pub diagnostics: StaticRedactionDiagnostics, +} + impl PreparedSearch { pub fn new(config: PreparedSearchConfig) -> Result { + Self::new_inner(config, None) + } + + pub fn new_with_diagnostics( + config: PreparedSearchConfig, + ) -> Result { + let mut diagnostics = StaticRedactionDiagnostics::default(); + let prepared = Self::new_inner(config, Some(&mut diagnostics))?; + + Ok(PreparedSearchBuildResult { + prepared, + diagnostics, + }) + } + + fn new_inner( + config: PreparedSearchConfig, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) 
-> Result { + let total_start = Instant::now(); validate_supported_config(&config)?; + let regex_len = config.regex_patterns.len(); + let custom_regex_len = config.custom_regex_patterns.len(); + let literal_len = config.literal_patterns.len(); + + let regex_start = Instant::now(); + let regex = SearchIndex::new(config.regex_patterns, config.regex_options)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_stage( + DiagnosticStage::PrepareRegex, + Some(regex_len), + Some(elapsed_us(regex_start)), + None, + ); + } + + let custom_regex_start = Instant::now(); + let custom_regex = SearchIndex::new( + config.custom_regex_patterns, + config.custom_regex_options, + )?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_stage( + DiagnosticStage::PrepareCustomRegex, + Some(custom_regex_len), + Some(elapsed_us(custom_regex_start)), + None, + ); + } + + let literal_start = Instant::now(); + let literals = + SearchIndex::new(config.literal_patterns, config.literal_options)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_stage( + DiagnosticStage::PrepareLiteral, + Some(literal_len), + Some(elapsed_us(literal_start)), + None, + ); + diagnostics.record_stage( + DiagnosticStage::PrepareTotal, + Some( + regex_len + .saturating_add(custom_regex_len) + .saturating_add(literal_len), + ), + Some(elapsed_us(total_start)), + None, + ); + } Ok(Self { - regex: SearchIndex::new(config.regex_patterns, config.regex_options)?, - custom_regex: SearchIndex::new( - config.custom_regex_patterns, - config.custom_regex_options, - )?, - literals: SearchIndex::new( - config.literal_patterns, - config.literal_options, - )?, + regex, + custom_regex, + literals, slices: config.slices, regex_meta: config.regex_meta, custom_regex_meta: config.custom_regex_meta, @@ -103,12 +181,79 @@ impl PreparedSearch { } pub fn find_matches(&self, full_text: &str) -> Result { - let normalized = normalize_for_search(full_text); + 
self.find_matches_inner(full_text, None) + } + + fn find_matches_inner( + &self, + full_text: &str, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result { + let total_start = Instant::now(); + let normalize_start = Instant::now(); + let normalized = normalize_for_search_with_byte_map(full_text)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_stage( + DiagnosticStage::Normalize, + None, + Some(elapsed_us(normalize_start)), + Some(full_text.len()), + ); + } + + let regex_start = Instant::now(); + let regex = self.regex.find_iter(full_text)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_search_matches( + DiagnosticStage::SearchRegex, + ®ex, + full_text, + Some(elapsed_us(regex_start)), + ); + } + + let custom_regex_start = Instant::now(); + let custom_regex = self.custom_regex.find_iter(full_text)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_search_matches( + DiagnosticStage::SearchCustomRegex, + &custom_regex, + full_text, + Some(elapsed_us(custom_regex_start)), + ); + } + + let literal_start = Instant::now(); + let literal = self + .literals + .find_iter(normalized.as_str())? 
+ .into_iter() + .map(|found| remap_normalized_match(&normalized, found)) + .collect::>>()?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_search_matches( + DiagnosticStage::SearchLiteral, + &literal, + full_text, + Some(elapsed_us(literal_start)), + ); + diagnostics.record_stage( + DiagnosticStage::FindMatches, + Some( + regex + .len() + .saturating_add(custom_regex.len()) + .saturating_add(literal.len()), + ), + Some(elapsed_us(total_start)), + Some(full_text.len()), + ); + } Ok(PreparedSearchMatches { - regex: self.regex.find_iter(full_text)?, - custom_regex: self.custom_regex.find_iter(full_text)?, - literal: self.literals.find_iter(&normalized)?, + regex, + custom_regex, + literal, }) } @@ -116,19 +261,36 @@ impl PreparedSearch { &self, full_text: &str, ) -> Result { - let matches = self.find_matches(full_text)?; + self.detect_static_entities_inner(full_text, None) + } + + fn detect_static_entities_inner( + &self, + full_text: &str, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result { + let matches = + self.find_matches_inner(full_text, diagnostics.as_deref_mut())?; + + let regex_start = Instant::now(); let regex_entities = process_regex_matches( &matches.regex, self.slices.regex, full_text, &self.regex_meta, )?; + let regex_elapsed_us = elapsed_us(regex_start); + + let custom_regex_start = Instant::now(); let custom_regex_entities = process_regex_matches( &matches.custom_regex, self.slices.custom_regex, full_text, &self.custom_regex_meta, )?; + let custom_regex_elapsed_us = elapsed_us(custom_regex_start); + + let deny_list_start = Instant::now(); let deny_list_entities = if let Some(data) = &self.deny_list_data { process_deny_list_matches( &matches.literal, @@ -139,6 +301,9 @@ impl PreparedSearch { } else { Vec::new() }; + let deny_list_elapsed_us = elapsed_us(deny_list_start); + + let gazetteer_start = Instant::now(); let gazetteer_entities = if let Some(data) = &self.gazetteer_data { process_gazetteer_matches( 
&matches.literal, @@ -149,6 +314,9 @@ impl PreparedSearch { } else { Vec::new() }; + let gazetteer_elapsed_us = elapsed_us(gazetteer_start); + + let country_start = Instant::now(); let country_entities = if let Some(data) = &self.country_data { process_country_matches( &matches.literal, @@ -159,6 +327,40 @@ impl PreparedSearch { } else { Vec::new() }; + let country_elapsed_us = elapsed_us(country_start); + + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_entities( + DiagnosticStage::EntityRegex, + ®ex_entities, + full_text, + Some(regex_elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityCustomRegex, + &custom_regex_entities, + full_text, + Some(custom_regex_elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityDenyList, + &deny_list_entities, + full_text, + Some(deny_list_elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityGazetteer, + &gazetteer_entities, + full_text, + Some(gazetteer_elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityCountry, + &country_entities, + full_text, + Some(country_elapsed_us), + ); + } Ok(StaticDetectionResult { matches, @@ -175,16 +377,80 @@ impl PreparedSearch { full_text: &str, operators: &OperatorConfig, ) -> Result { - let detections = self.detect_static_entities(full_text)?; + self.redact_static_entities_inner(full_text, operators, None) + } + + pub fn redact_static_entities_with_diagnostics( + &self, + full_text: &str, + operators: &OperatorConfig, + ) -> Result { + let mut diagnostics = StaticRedactionDiagnostics::default(); + let result = self.redact_static_entities_inner( + full_text, + operators, + Some(&mut diagnostics), + )?; + + Ok(StaticRedactionDiagnosticResult { + result, + diagnostics, + }) + } + + fn redact_static_entities_inner( + &self, + full_text: &str, + operators: &OperatorConfig, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result { + let detections = self + 
.detect_static_entities_inner(full_text, diagnostics.as_deref_mut())?; let raw_entities = detections.all_entities(); + let merge_start = Instant::now(); let merged = merge_and_dedup(&raw_entities); + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_entities( + DiagnosticStage::Merge, + &merged, + full_text, + Some(elapsed_us(merge_start)), + ); + } + let boundary_start = Instant::now(); let consistent = enforce_boundary_consistency(&merged, full_text)?; - let resolved_entities = sanitize_entities(&consistent); + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_entities( + DiagnosticStage::Boundary, + &consistent, + full_text, + Some(elapsed_us(boundary_start)), + ); + } + let sanitize_start = Instant::now(); + let resolved_entities = + sanitize_entities_with_source(&consistent, full_text)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_entities( + DiagnosticStage::Sanitize, + &resolved_entities, + full_text, + Some(elapsed_us(sanitize_start)), + ); + } let redaction_entities = resolved_entities .iter() .map(to_redaction_entity) .collect::>(); + let redaction_start = Instant::now(); let redaction = redact_text(full_text, &redaction_entities, operators)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_redaction( + &redaction, + Some(elapsed_us(redaction_start)), + full_text.len(), + ); + } Ok(StaticRedactionResult { detections, @@ -194,6 +460,19 @@ impl PreparedSearch { } } +fn elapsed_us(start: Instant) -> u64 { + let micros = start.elapsed().as_micros(); + u64::try_from(micros).unwrap_or(u64::MAX) +} + +fn remap_normalized_match( + normalized: &crate::normalize::NormalizedSearchText, + found: SearchMatch, +) -> Result { + let (start, end) = normalized.map_span(found.start(), found.end())?; + Ok(found.with_span(start, end)) +} + fn validate_supported_config(config: &PreparedSearchConfig) -> Result<()> { reject_unsupported_slice(config.slices.legal_forms, "legal_forms")?; 
reject_unsupported_slice(config.slices.triggers, "triggers")?; @@ -241,7 +520,7 @@ fn validate_deny_list_config(config: &PreparedSearchConfig) -> Result<()> { config.slices.deny_list, data.sources.len(), )?; - ensure_custom_deny_list_sources(data) + ensure_supported_deny_list_sources(data) } fn validate_static_data_length( diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index c7170727..0b27ed59 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -1,14 +1,22 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::byte_offsets::ByteOffsets; use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; use crate::types::{Error, Result, SearchMatch}; -use crate::utf16::Utf16Offsets; -const MIN_PHONE_LENGTH: usize = 7; const GAZETTEER_EXACT_SCORE: f64 = 0.9; const GAZETTEER_FUZZY_SCORE: f64 = 0.85; const COUNTRY_SCORE: f64 = 0.95; const DENY_LIST_SCORE: f64 = 0.9; const MAX_GAZETTEER_PREFIX_OVERSHOOT: u32 = 7; pub(crate) const CUSTOM_DENY_LIST_SOURCE: &str = "custom-deny-list"; +const DENY_LIST_SOURCE: &str = "deny-list"; +const CITY_SOURCE: &str = "city"; +const FIRST_NAME_SOURCE: &str = "first-name"; +const SURNAME_SOURCE: &str = "surname"; +const TITLE_SOURCE: &str = "title"; +const PERSON_LABEL: &str = "person"; +const ADDRESS_LABEL: &str = "address"; #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] pub struct PatternSlice { @@ -46,6 +54,7 @@ pub struct RegexMatchMeta { pub score: f64, pub source_detail: Option, pub requires_validation: bool, + pub min_byte_length: Option, } impl RegexMatchMeta { @@ -56,6 +65,7 @@ impl RegexMatchMeta { score, source_detail: None, requires_validation: false, + min_byte_length: None, } } } @@ -77,6 +87,31 @@ pub struct DenyListMatchData { pub custom_labels: Vec>, pub originals: Vec, pub sources: Vec>, + pub filters: Option, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct DenyListFilterData { + pub 
stopwords: BTreeSet, + pub allow_list: BTreeSet, + pub person_stopwords: BTreeSet, + pub address_stopwords: BTreeSet, + pub street_types: BTreeSet, + pub first_names: BTreeSet, + pub generic_roles: BTreeSet, + pub sentence_starters: BTreeSet, + pub trailing_address_word_exclusions: BTreeSet, + pub defined_term_cues: BTreeSet, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct RawDenyListMatch { + start: u32, + end: u32, + labels: Vec, + custom_labels: Vec, + sources: Vec, + text: String, } pub fn process_regex_matches( @@ -85,7 +120,7 @@ pub fn process_regex_matches( full_text: &str, meta: &[RegexMatchMeta], ) -> Result> { - let offsets = Utf16Offsets::new(full_text); + let offsets = ByteOffsets::new(full_text); let mut results = Vec::new(); for found in matches { @@ -96,14 +131,13 @@ pub fn process_regex_matches( let Some(entry) = meta.get(local_index) else { continue; }; + let text = offsets.slice(full_text, found.start(), found.end())?; if entry.requires_validation { return Err(Error::UnsupportedRegexValidation { pattern }); } - - let text = offsets.slice(full_text, found.start(), found.end())?; - if entry.source_detail != Some(SourceDetail::CustomRegex) - && entry.label == "phone number" - && text.encode_utf16().count() < MIN_PHONE_LENGTH + if entry + .min_byte_length + .is_some_and(|min| byte_len(&text) < min) { continue; } @@ -129,51 +163,321 @@ pub fn process_deny_list_matches( full_text: &str, data: &DenyListMatchData, ) -> Result> { - let offsets = Utf16Offsets::new(full_text); + let offsets = ByteOffsets::new(full_text); + let matches_by_pattern = + collect_deny_list_matches(matches, slice, full_text, data, &offsets)?; + let mut results = Vec::new(); + let mut name_hits = Vec::new(); + + for pattern_matches in matches_by_pattern.values() { + for found in pattern_matches { + for label in &found.custom_labels { + let mut entity = PipelineEntity::detected( + found.start, + found.end, + label.clone(), + found.text.clone(), + DENY_LIST_SCORE, + 
DetectionSource::DenyList, + ); + entity.source_detail = Some(SourceDetail::CustomDenyList); + results.push(entity); + } + } + + for found in pattern_matches { + if found.labels.iter().any(|label| label == PERSON_LABEL) + && !filter_contains( + data + .filters + .as_ref() + .map(|filters| &filters.person_stopwords), + &found.text.to_lowercase(), + ) + { + name_hits.push(found.clone()); + } + + let suppress_address = should_suppress_address(full_text, data, found)?; + for label in found.labels.iter().filter(|label| *label != PERSON_LABEL) { + if label == ADDRESS_LABEL && suppress_address { + continue; + } + results.push(PipelineEntity::detected( + found.start, + found.end, + label.clone(), + found.text.clone(), + DENY_LIST_SCORE, + DetectionSource::DenyList, + )); + } + } + } + + append_person_name_hits( + &mut results, + full_text, + &offsets, + data, + &mut name_hits, + )?; + extend_city_districts( + &mut results, + full_text, + &offsets, + data.filters.as_ref(), + )?; + + Ok(results) +} + +fn collect_deny_list_matches( + matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + data: &DenyListMatchData, + offsets: &ByteOffsets<'_>, +) -> Result>> { + let mut matches_by_pattern = BTreeMap::>::new(); for found in matches { let Some(local_index) = slice.local_index(found.pattern()) else { continue; }; - ensure_custom_deny_list_source( - data.sources.get(local_index).map(Vec::as_slice), - )?; - - let Some(custom_labels) = data.custom_labels.get(local_index) else { + let Some(labels) = data.labels.get(local_index) else { continue; }; - if custom_labels.is_empty() { + let Some(sources) = data.sources.get(local_index) else { continue; - } + }; + validate_deny_list_sources(sources)?; + let match_text = offsets.slice(full_text, found.start(), found.end())?; + let keyword = match_text.to_lowercase(); let pattern = data.originals.get(local_index).map_or("", String::as_str); - if !custom_match_has_valid_edges( + let custom_pattern_labels = data + .custom_labels + 
.get(local_index) + .cloned() + .unwrap_or_default(); + let custom_edges_are_valid = custom_match_has_valid_edges( full_text, - &offsets, + offsets, found.start(), found.end(), pattern, + )?; + let custom_labels = if custom_edges_are_valid { + custom_pattern_labels.clone() + } else { + Vec::new() + }; + + if labels.is_empty() && custom_labels.is_empty() { + continue; + } + + let curated_labels = if has_curated_source(sources) { + let filters = data.filters.as_ref().ok_or(Error::MissingStaticData { + field: "deny_list.filters", + })?; + curated_labels_for_match(&CuratedDenyListMatch { + full_text, + offsets, + start: found.start(), + match_text: &match_text, + keyword: &keyword, + pattern, + labels, + custom_pattern_labels: &custom_pattern_labels, + custom_edges_are_valid, + filters, + })? + } else { + Vec::new() + }; + + if curated_labels.is_empty() && custom_labels.is_empty() { + continue; + } + + matches_by_pattern + .entry(local_index) + .or_default() + .push(RawDenyListMatch { + start: found.start(), + end: found.end(), + labels: curated_labels, + custom_labels, + sources: sources.clone(), + text: match_text, + }); + } + + Ok(matches_by_pattern) +} + +struct CuratedDenyListMatch<'a> { + full_text: &'a str, + offsets: &'a ByteOffsets<'a>, + start: u32, + match_text: &'a str, + keyword: &'a str, + pattern: &'a str, + labels: &'a [String], + custom_pattern_labels: &'a [String], + custom_edges_are_valid: bool, + filters: &'a DenyListFilterData, +} + +fn curated_labels_for_match( + args: &CuratedDenyListMatch<'_>, +) -> Result> { + let pattern_is_acronym = !args.pattern.is_empty() + && args.pattern.len() <= 5 + && all_upper(args.pattern); + let acronym_matches_acronym = + !pattern_is_acronym || all_upper(args.match_text); + let source_char = char_at(args.full_text, args.offsets, args.start)?; + let passes_filters = source_char.is_some_and(char::is_uppercase) + && !args.filters.stopwords.contains(args.keyword) + && !args.filters.allow_list.contains(args.keyword) + && 
acronym_matches_acronym + && !all_upper(args.match_text); + + if !passes_filters || !args.custom_edges_are_valid { + return Ok(Vec::new()); + } + + if is_dotted_acronym_suffix_collision( + args.full_text, + args.offsets, + args.start, + args.match_text, + )? { + return Ok(Vec::new()); + } + + Ok( + args + .labels + .iter() + .filter(|label| !args.custom_pattern_labels.contains(label)) + .cloned() + .collect(), + ) +} + +fn should_suppress_address( + full_text: &str, + data: &DenyListMatchData, + found: &RawDenyListMatch, +) -> Result { + if !is_single_word(found.text.as_str()) { + return Ok(false); + } + let Some(filters) = &data.filters else { + return Ok(false); + }; + let lower = found.text.to_lowercase(); + if !filters.address_stopwords.contains(&lower) { + return Ok(false); + } + + Ok(!has_adjacent_address_evidence( + full_text, + found.start, + found.end, + filters, + )?) +} + +fn append_person_name_hits( + results: &mut Vec, + full_text: &str, + offsets: &ByteOffsets<'_>, + data: &DenyListMatchData, + name_hits: &mut [RawDenyListMatch], +) -> Result<()> { + name_hits.sort_by_key(|hit| hit.start); + let mut consumed = BTreeSet::::new(); + + for index in 0..name_hits.len() { + if consumed.contains(&index) { + continue; + } + let Some(hit) = name_hits.get(index) else { + continue; + }; + + let mut chain = vec![hit.clone()]; + let mut cursor = index.saturating_add(1); + + while cursor < name_hits.len() && chain.len() < 5 { + let Some(next) = name_hits.get(cursor) else { + break; + }; + let Some(prev) = chain.last() else { + break; + }; + let gap = offsets.slice(full_text, prev.end, next.start)?; + if person_chain_breaks(prev.text.as_str(), gap.as_str()) { + break; + } + + chain.push(next.clone()); + cursor = cursor.saturating_add(1); + } + + for consumed_index in index..index.saturating_add(chain.len()) { + consumed.insert(consumed_index); + } + + if !chain.iter().any(has_person_name_source) { + continue; + } + + let Some(first) = chain.first() else { + 
continue; + }; + let Some(last) = chain.last() else { + continue; + }; + let Some(filters) = &data.filters else { + continue; + }; + if is_suppressible_defined_term_quote( + full_text, + offsets, + first.start, + filters, )? { continue; } - let text = offsets.slice(full_text, found.start(), found.end())?; - for label in custom_labels { - let mut entity = PipelineEntity::detected( - found.start(), - found.end(), - label.clone(), - text.clone(), - DENY_LIST_SCORE, - DetectionSource::DenyList, - ); - entity.source_detail = Some(SourceDetail::CustomDenyList); - results.push(entity); + let extended = + extend_person_name(full_text, offsets, first.start, last.end, filters)?; + let score = if chain.len() >= 2 { 0.9 } else { 0.5 }; + + if chain.len() == 1 + && !single_name_hit_has_context(full_text, offsets, last.end, filters)? + { + continue; } + + results.push(PipelineEntity::detected( + first.start, + extended.end, + PERSON_LABEL, + extended.text, + score, + DetectionSource::DenyList, + )); } - Ok(results) + Ok(()) } pub fn process_gazetteer_matches( @@ -182,7 +486,7 @@ pub fn process_gazetteer_matches( full_text: &str, data: &GazetteerMatchData, ) -> Result> { - let offsets = Utf16Offsets::new(full_text); + let offsets = ByteOffsets::new(full_text); let mut results = Vec::new(); let mut exact_spans = Vec::<(u32, u32)>::new(); @@ -261,7 +565,7 @@ pub fn process_country_matches( full_text: &str, data: &CountryMatchData, ) -> Result> { - let offsets = Utf16Offsets::new(full_text); + let offsets = ByteOffsets::new(full_text); let mut results = Vec::new(); for found in matches { @@ -288,42 +592,936 @@ pub fn process_country_matches( Ok(results) } -pub(crate) fn ensure_custom_deny_list_sources( +pub(crate) fn ensure_supported_deny_list_sources( data: &DenyListMatchData, ) -> Result<()> { + let mut needs_filters = false; for sources in &data.sources { - ensure_custom_deny_list_source(Some(sources))?; + validate_deny_list_sources(sources)?; + needs_filters |= 
has_curated_source(sources); + } + + if needs_filters && data.filters.is_none() { + return Err(Error::MissingStaticData { + field: "deny_list.filters", + }); } Ok(()) } -fn ensure_custom_deny_list_source(sources: Option<&[String]>) -> Result<()> { - let Some(sources) = sources else { - return Err(Error::UnsupportedDenyListSource { - source: String::from(""), - }); - }; +fn validate_deny_list_sources(sources: &[String]) -> Result<()> { if sources.is_empty() { return Err(Error::UnsupportedDenyListSource { source: String::from(""), }); } - if let Some(source) = sources + + for source in sources { + match source.as_str() { + DENY_LIST_SOURCE + | CITY_SOURCE + | CUSTOM_DENY_LIST_SOURCE + | FIRST_NAME_SOURCE + | SURNAME_SOURCE + | TITLE_SOURCE => {} + _ => { + return Err(Error::UnsupportedDenyListSource { + source: source.clone(), + }); + } + } + } + + Ok(()) +} + +fn has_curated_source(sources: &[String]) -> bool { + sources .iter() - .find(|source| source.as_str() != CUSTOM_DENY_LIST_SOURCE) + .any(|source| source.as_str() != CUSTOM_DENY_LIST_SOURCE) +} + +fn has_person_name_source(found: &RawDenyListMatch) -> bool { + found + .sources + .iter() + .any(|source| source == FIRST_NAME_SOURCE || source == SURNAME_SOURCE) +} + +fn filter_contains(set: Option<&BTreeSet>, value: &str) -> bool { + set.is_some_and(|set| set.contains(value)) +} + +fn char_at( + full_text: &str, + offsets: &ByteOffsets<'_>, + offset: u32, +) -> Result> { + let byte = offsets.validate_offset(offset)?; + Ok(full_text.get(byte..).and_then(|tail| tail.chars().next())) +} + +fn char_before_byte(full_text: &str, byte: usize) -> Option { + full_text + .get(..byte) + .and_then(|prefix| prefix.chars().next_back()) +} + +fn char_after_byte(full_text: &str, byte: usize) -> Option { + full_text + .get(byte..) 
+ .and_then(|suffix| suffix.chars().next()) +} + +fn byte_len(text: &str) -> u32 { + u32::try_from(text.len()).unwrap_or(u32::MAX) +} + +fn all_upper(text: &str) -> bool { + let mut saw_letter = false; + for ch in text.chars() { + if !ch.is_alphabetic() || !ch.is_uppercase() { + return false; + } + saw_letter = true; + } + saw_letter +} + +fn is_single_word(text: &str) -> bool { + let mut saw_letter = false; + for ch in text.chars() { + if !ch.is_alphabetic() { + return false; + } + saw_letter = true; + } + saw_letter +} + +fn is_dotted_acronym(text: &str) -> bool { + if text.chars().count() < 3 { + return false; + } + + let mut segments = 0_u8; + let mut chars = text.chars().peekable(); + while let Some(ch) = chars.next() { + if !ch.is_alphabetic() { + return false; + } + segments = segments.saturating_add(1); + if segments > 4 { + return false; + } + match chars.peek().copied() { + Some('.') => { + let _ = chars.next(); + if chars.peek().is_none() { + break; + } + } + None => break, + Some(_) => return false, + } + } + + segments > 0 +} + +fn is_dotted_acronym_suffix_collision( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, + match_text: &str, +) -> Result { + if !is_dotted_acronym(match_text) { + return Ok(false); + } + + let start_byte = offsets.validate_offset(start)?; + let prefix = full_text + .get(..start_byte) + .unwrap_or_default() + .chars() + .rev() + .take(2) + .collect::>(); + + Ok(matches!( + (prefix.first().copied(), prefix.get(1).copied()), + (Some('.'), Some(ch)) if ch.is_alphabetic() + )) +} + +fn has_adjacent_address_evidence( + full_text: &str, + start: u32, + end: u32, + filters: &DenyListFilterData, +) -> Result { + let offsets = ByteOffsets::new(full_text); + let full_len = offsets.len()?; + let window_start = offsets.floor_offset(start.saturating_sub(40))?; + let window_end = + offsets.floor_offset(end.saturating_add(40).min(full_len))?; + let window = offsets.slice(full_text, window_start, window_end)?; + + 
Ok(has_address_format(&window) || has_street_type(&window, filters)) +} + +fn has_address_format(text: &str) -> bool { + has_state_after_comma(text) + || has_us_zip(text) + || has_cz_sk_postal_code(text) + || has_pl_postal_code(text) +} + +fn has_state_after_comma(text: &str) -> bool { + let chars = text.chars().collect::>(); + for index in 0..chars.len() { + if chars.get(index) != Some(&',') { + continue; + } + let mut cursor = index.saturating_add(1); + while chars.get(cursor).is_some_and(|ch| ch.is_whitespace()) { + cursor = cursor.saturating_add(1); + } + let first = chars.get(cursor).copied(); + let second = chars.get(cursor.saturating_add(1)).copied(); + let after = chars.get(cursor.saturating_add(2)).copied(); + if first.is_some_and(char::is_uppercase) + && second.is_some_and(char::is_uppercase) + && !after.is_some_and(char::is_alphanumeric) + { + return true; + } + } + false +} + +fn has_us_zip(text: &str) -> bool { + let chars = text.chars().collect::>(); + for index in 0..chars.len() { + if !five_digits_at(&chars, index) { + continue; + } + let after_five = index.saturating_add(5); + let has_zip4 = chars.get(after_five) == Some(&'-') + && four_digits_at(&chars, after_five.saturating_add(1)); + let end = if has_zip4 { + after_five.saturating_add(5) + } else { + after_five + }; + if !chars + .get(index.wrapping_sub(1)) + .is_some_and(char::is_ascii_digit) + && !chars.get(end).is_some_and(char::is_ascii_digit) + { + return true; + } + } + false +} + +fn has_cz_sk_postal_code(text: &str) -> bool { + let chars = text.chars().collect::>(); + for index in 0..chars.len() { + if three_digits_at(&chars, index) + && chars.get(index.saturating_add(3)) == Some(&' ') + && two_digits_at(&chars, index.saturating_add(4)) + { + return true; + } + } + false +} + +fn has_pl_postal_code(text: &str) -> bool { + let chars = text.chars().collect::>(); + for index in 0..chars.len() { + if two_digits_at(&chars, index) + && chars.get(index.saturating_add(2)) == Some(&'-') + && 
three_digits_at(&chars, index.saturating_add(3)) + { + return true; + } + } + false +} + +fn digits_at(chars: &[char], start: usize, len: usize) -> bool { + start.checked_add(len).is_some_and(|end| end <= chars.len()) + && chars + .get(start..start.saturating_add(len)) + .is_some_and(|slice| slice.iter().all(char::is_ascii_digit)) +} + +fn two_digits_at(chars: &[char], start: usize) -> bool { + digits_at(chars, start, 2) +} + +fn three_digits_at(chars: &[char], start: usize) -> bool { + digits_at(chars, start, 3) +} + +fn four_digits_at(chars: &[char], start: usize) -> bool { + digits_at(chars, start, 4) +} + +fn five_digits_at(chars: &[char], start: usize) -> bool { + digits_at(chars, start, 5) +} + +fn has_street_type(window: &str, filters: &DenyListFilterData) -> bool { + let lower_window = window.to_lowercase(); + for street_type in &filters.street_types { + if street_type.is_empty() { + continue; + } + let lower_type = street_type.to_lowercase(); + if street_type_matches(lower_window.as_str(), lower_type.as_str()) { + return true; + } + } + false +} + +fn street_type_matches(window: &str, street_type: &str) -> bool { + for (byte, _) in window.match_indices(street_type) { + let before = char_before_byte(window, byte); + if before.is_some_and(char::is_alphanumeric) { + continue; + } + let end = byte.saturating_add(street_type.len()); + let Some(last) = street_type.chars().next_back() else { + continue; + }; + if last.is_alphanumeric() + && char_after_byte(window, end).is_some_and(char::is_alphanumeric) + { + continue; + } + return true; + } + false +} + +fn person_chain_breaks(previous_text: &str, gap: &str) -> bool { + byte_len(gap) > 4 + || gap.is_empty() + || gap.contains('\n') + || gap.contains('\t') + || gap + .chars() + .any(|ch| matches!(ch, '!' | '?' 
| ';' | ':' | ',')) + || (gap.contains('.') && !is_initial_continuation_gap(previous_text, gap)) +} + +fn is_initial_continuation_gap(text: &str, gap: &str) -> bool { + let mut chars = text.chars(); + let text_is_single_upper = + chars.next().is_some_and(char::is_uppercase) && chars.next().is_none(); + if text_is_single_upper && dot_space_gap(gap) { + return true; + } + + let mut remaining = gap; + let Some(after_space) = consume_horizontal_space(remaining, 1, 2) else { + return false; + }; + remaining = after_space; + let mut consumed_initial = false; + + loop { + let Some(ch) = remaining.chars().next() else { + return consumed_initial; + }; + if !ch.is_uppercase() { + return false; + } + let Some(after_initial) = remaining.strip_prefix(ch) else { + return false; + }; + let Some(after_dot) = after_initial.strip_prefix('.') else { + return false; + }; + let Some(after_initial_gap) = consume_horizontal_space(after_dot, 1, 2) + else { + return false; + }; + remaining = after_initial_gap; + consumed_initial = true; + } +} + +fn dot_space_gap(gap: &str) -> bool { + let Some(rest) = gap.strip_prefix('.') else { + return false; + }; + consume_horizontal_space(rest, 1, 2).is_some_and(str::is_empty) +} + +fn consume_horizontal_space( + text: &str, + min: usize, + max: usize, +) -> Option<&str> { + let mut consumed = 0_usize; + let mut byte = 0_usize; + for ch in text.chars() { + if ch == '\n' || !ch.is_whitespace() || consumed == max { + break; + } + consumed = consumed.saturating_add(1); + byte = byte.saturating_add(ch.len_utf8()); + } + (consumed >= min).then(|| text.get(byte..)).flatten() +} + +fn single_name_hit_has_context( + full_text: &str, + offsets: &ByteOffsets<'_>, + end: u32, + filters: &DenyListFilterData, +) -> Result { + let tail = slice_from(full_text, offsets, end)?; + let rest = tail.trim_start(); + let mut chars = rest.chars(); + let next_is_upper = chars.next().is_some_and(char::is_uppercase) + && chars.next().is_some_and(char::is_lowercase); + if 
!next_is_upper { + return Ok(false); + } + + let next_word = rest + .chars() + .take_while(|ch| ch.is_alphabetic()) + .collect::(); + Ok( + !filters + .sentence_starters + .contains(&next_word.to_lowercase()), + ) +} + +fn slice_from<'a>( + full_text: &'a str, + offsets: &ByteOffsets<'_>, + start: u32, +) -> Result<&'a str> { + let byte = offsets.validate_offset(start)?; + full_text + .get(byte..) + .ok_or(Error::ByteOffsetOutOfBounds { offset: start }) +} + +struct ExtendedName { + end: u32, + text: String, +} + +fn extend_person_name( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, + end: u32, + filters: &DenyListFilterData, +) -> Result { + let mut new_end = end; + + loop { + if char_at(full_text, offsets, new_end)? != Some(' ') { + break; + } + let word_start = new_end.saturating_add(1); + let Some(first) = char_at(full_text, offsets, word_start)? else { + break; + }; + if !first.is_uppercase() { + break; + } + + let word = read_until_whitespace(full_text, offsets, word_start)?; + let stripped = strip_trailing_name_punctuation(&word); + if stripped.chars().count() < 2 { + break; + } + let lower = stripped.to_lowercase(); + if filters.stopwords.contains(&lower) + || filters.person_stopwords.contains(&lower) + { + break; + } + + new_end = word_start.saturating_add(byte_len(stripped)); + } + + Ok(ExtendedName { + end: new_end, + text: offsets.slice(full_text, start, new_end)?, + }) +} + +fn read_until_whitespace( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, +) -> Result { + let tail = slice_from(full_text, offsets, start)?; + Ok(tail.chars().take_while(|ch| !ch.is_whitespace()).collect()) +} + +fn strip_trailing_name_punctuation(word: &str) -> &str { + word.trim_end_matches([',', ';', '.', '”', '"', '’', '\'', '“', '»']) +} + +struct DefinedTermQuote { + content: String, + after_closing_quote: String, +} + +fn is_suppressible_defined_term_quote( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, + filters: 
&DenyListFilterData, +) -> Result { + let Some(quote) = + find_defined_term_quote_content(full_text, offsets, start, filters)? + else { + return Ok(false); + }; + let words = quote + .content + .split(|ch: char| !ch.is_alphabetic()) + .filter(|word| !word.is_empty()) + .collect::>(); + + if words.len() >= 2 + && starts_with_known_first_name("e.content, filters) + && has_person_role_definition("e.after_closing_quote, filters) { - return Err(Error::UnsupportedDenyListSource { - source: source.clone(), + return Ok(false); + } + + Ok(words.len() >= 2) +} + +fn find_defined_term_quote_content( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, + filters: &DenyListFilterData, +) -> Result> { + let start_byte = offsets.validate_offset(start)?; + let Some(quote_start) = find_opening_quote(full_text, start_byte) else { + return Ok(None); + }; + let Some((quote_end, quote_char)) = + find_closing_quote(full_text, quote_start, start_byte) + else { + return Ok(None); + }; + let after_start = quote_end.saturating_add(quote_char.len_utf8()); + let after = full_text.get(after_start..).unwrap_or_default(); + let after_window = take_bytes(after, 120); + if strip_defined_term_cue(&after_window, filters).is_none() { + return Ok(None); + } + + let quote_width = full_text + .get(quote_start..) 
+ .and_then(|tail| tail.chars().next()) + .map_or(0, char::len_utf8); + let content_start = quote_start.saturating_add(quote_width); + + Ok(Some(DefinedTermQuote { + content: full_text + .get(content_start..quote_end) + .unwrap_or_default() + .to_owned(), + after_closing_quote: after_window, + })) +} + +fn find_opening_quote(full_text: &str, start_byte: usize) -> Option { + let prefix = full_text.get(..start_byte)?; + let mut distance = 0_u32; + for (byte, ch) in prefix.char_indices().rev() { + distance = distance.saturating_add(byte_len(ch.encode_utf8(&mut [0; 4]))); + if distance > 80 || ch == '\n' { + break; + } + if opening_quotes().contains(&ch) && is_quote_boundary(full_text, byte, ch) + { + return Some(byte); + } + if closing_quotes().contains(&ch) && is_quote_boundary(full_text, byte, ch) + { + break; + } + } + None +} + +fn find_closing_quote( + full_text: &str, + quote_start: usize, + start_byte: usize, +) -> Option<(usize, char)> { + let tail = full_text.get(start_byte..)?; + let mut distance = byte_len(full_text.get(quote_start..start_byte)?); + for (relative, ch) in tail.char_indices() { + if distance > 120 { + break; + } + let byte = start_byte.saturating_add(relative); + if closing_quotes().contains(&ch) && is_quote_boundary(full_text, byte, ch) + { + return Some((byte, ch)); + } + distance = distance.saturating_add(byte_len(ch.encode_utf8(&mut [0; 4]))); + } + None +} + +fn is_quote_boundary(full_text: &str, byte: usize, ch: char) -> bool { + if ch != '\'' && ch != '’' { + return true; + } + let after_byte = byte.saturating_add(ch.len_utf8()); + let before = char_before_byte(full_text, byte); + let after = char_after_byte(full_text, after_byte); + !(before.is_some_and(char::is_alphabetic) + && after.is_some_and(char::is_alphabetic)) +} + +fn opening_quotes() -> &'static BTreeSet { + static QUOTES: std::sync::LazyLock> = + std::sync::LazyLock::new(|| { + BTreeSet::from(['"', '\'', '“', '„', '‟', '‘', '‛', '«']) }); + "ES +} + +fn closing_quotes() -> 
&'static BTreeSet { + static QUOTES: std::sync::LazyLock> = + std::sync::LazyLock::new(|| { + BTreeSet::from(['"', '\'', '”', '’', '»', '“']) + }); + "ES +} + +fn take_bytes(text: &str, max: u32) -> String { + let mut taken = String::new(); + let mut len = 0_u32; + for ch in text.chars() { + let width = byte_len(ch.encode_utf8(&mut [0; 4])); + if len.saturating_add(width) > max { + break; + } + taken.push(ch); + len = len.saturating_add(width); + } + taken +} + +fn strip_defined_term_cue<'a>( + after: &'a str, + filters: &DenyListFilterData, +) -> Option<&'a str> { + let trimmed = + after.trim_start_matches(|ch: char| ch.is_whitespace() || ch == ','); + let lower = trimmed.to_lowercase(); + for cue in &filters.defined_term_cues { + if lower.starts_with(cue) && word_boundary_after(lower.as_str(), cue.len()) + { + return trimmed.get(cue.len()..); + } + } + None +} + +fn word_boundary_after(text: &str, byte: usize) -> bool { + text + .get(byte..) + .and_then(|tail| tail.chars().next()) + .is_none_or(|ch| !ch.is_alphabetic()) +} + +fn starts_with_known_first_name( + quote_content: &str, + filters: &DenyListFilterData, +) -> bool { + let first_word = quote_content + .trim() + .chars() + .take_while(|ch| ch.is_alphabetic()) + .collect::(); + !first_word.is_empty() + && filters.first_names.contains(&first_word.to_lowercase()) +} + +fn has_person_role_definition( + after_closing_quote: &str, + filters: &DenyListFilterData, +) -> bool { + let Some(after_cue) = strip_defined_term_cue(after_closing_quote, filters) + else { + return false; + }; + after_cue + .split(|ch: char| !ch.is_alphabetic()) + .filter(|word| !word.is_empty()) + .take(8) + .any(|word| filters.generic_roles.contains(&word.to_lowercase())) +} + +fn extend_city_districts( + entities: &mut [PipelineEntity], + full_text: &str, + offsets: &ByteOffsets<'_>, + filters: Option<&DenyListFilterData>, +) -> Result<()> { + for entity in entities { + if entity.label != ADDRESS_LABEL + || entity.source_detail == 
Some(SourceDetail::CustomDenyList) + { + continue; + } + + if let Some(suffix) = + match_district_suffix(slice_from(full_text, offsets, entity.end)?) + { + entity.end = entity.end.saturating_add(byte_len(suffix)); + entity.text = offsets.slice(full_text, entity.start, entity.end)?; + } + + if let Some(suffix) = + match_dash_district(slice_from(full_text, offsets, entity.end)?) + { + entity.end = entity.end.saturating_add(byte_len(suffix)); + entity.text = offsets.slice(full_text, entity.start, entity.end)?; + } + + let before = offsets.slice( + full_text, + offsets.floor_offset(entity.start.saturating_sub(10))?, + entity.start, + )?; + if let Some(prefix) = postal_prefix(&before) { + entity.start = entity.start.saturating_sub(byte_len(prefix)); + entity.text = offsets.slice(full_text, entity.start, entity.end)?; + } + + if let Some(filters) = filters + && let Some(suffix) = match_trailing_address_word( + slice_from(full_text, offsets, entity.end)?, + filters, + ) + { + entity.end = entity.end.saturating_add(byte_len(suffix)); + entity.text = offsets.slice(full_text, entity.start, entity.end)?; + } } Ok(()) } +fn match_district_suffix(after: &str) -> Option<&str> { + let rest = after.strip_prefix(' ')?; + let suffix = numeric_district(rest).or_else(|| roman_district(rest))?; + let end = ' '.len_utf8().saturating_add(suffix.len()); + let next = after.get(end..).and_then(|tail| tail.chars().next()); + next + .is_none_or(is_district_boundary) + .then(|| after.get(..end)) + .flatten() +} + +fn numeric_district(text: &str) -> Option<&str> { + let digits = text + .chars() + .take_while(char::is_ascii_digit) + .collect::(); + if digits.is_empty() || digits.len() > 2 { + return None; + } + text.get(..digits.len()) +} + +fn roman_district(text: &str) -> Option<&str> { + roman_districts() + .iter() + .find_map(|roman| text.starts_with(roman).then_some(*roman)) +} + +const fn roman_districts() -> &'static [&'static str] { + &[ + "XXX", "XXIX", "XXVIII", "XXVII", "XXVI", "XXV", 
"XXIV", "XXIII", "XXII", + "XXI", "XX", "XIX", "XVIII", "XVII", "XVI", "XV", "XIV", "XIII", "XII", + "XI", "X", "IX", "VIII", "VII", "VI", "IV", "III", "II", + ] +} + +const fn is_district_boundary(ch: char) -> bool { + ch.is_whitespace() || matches!(ch, ',' | ';' | '.' | ')' | '"') +} + +fn match_dash_district(after: &str) -> Option<&str> { + let (space_len, after_space) = consume_spaces_or_tabs(after, 1, 4)?; + let dash = after_space.chars().next()?; + if dash != '-' && dash != '–' { + return None; + } + let after_dash = after_space.get(dash.len_utf8()..)?; + let (post_dash_spaces, word_start) = + consume_spaces_or_tabs(after_dash, 0, usize::MAX) + .unwrap_or((0, after_dash)); + let mut chars = word_start.chars(); + let first = chars.next()?; + let second = chars.next()?; + if !first.is_uppercase() || !second.is_lowercase() { + return None; + } + let word_len = first + .len_utf8() + .saturating_add(second.len_utf8()) + .saturating_add( + chars + .take_while(|ch| ch.is_lowercase()) + .map(char::len_utf8) + .sum::(), + ); + let total = space_len + .saturating_add(dash.len_utf8()) + .saturating_add(post_dash_spaces) + .saturating_add(word_len); + after.get(..total) +} + +fn consume_spaces_or_tabs( + text: &str, + min: usize, + max: usize, +) -> Option<(usize, &str)> { + let mut consumed = 0_usize; + let mut byte = 0_usize; + for ch in text.chars() { + if (ch != ' ' && ch != '\t') || consumed == max { + break; + } + consumed = consumed.saturating_add(1); + byte = byte.saturating_add(ch.len_utf8()); + } + (consumed >= min) + .then(|| text.get(byte..).map(|rest| (byte, rest))) + .flatten() +} + +fn postal_prefix(before: &str) -> Option<&str> { + let trimmed_end = before.trim_end(); + let suffix_ws = before.len().saturating_sub(trimmed_end.len()); + let before_dash = + trimmed_end.trim_end_matches(|ch: char| ch.is_whitespace() || is_dash(ch)); + let dash_ws = trimmed_end.len().saturating_sub(before_dash.len()); + + if let Some(code) = trailing_postal_code(before_dash) { 
+ let start = before_dash.len().saturating_sub(code.len()); + let end = before + .len() + .saturating_sub(suffix_ws) + .saturating_add(dash_ws); + return before.get(start..end); + } + None +} + +fn trailing_postal_code(text: &str) -> Option<&str> { + let chars = text.chars().collect::>(); + if chars.len() >= 5 { + let start = chars.len().saturating_sub(5); + if five_digits_at(&chars, start) { + return text.get(byte_index_for_char(text, start)..); + } + } + if chars.len() >= 6 { + let start = chars.len().saturating_sub(6); + if three_digits_at(&chars, start) + && chars.get(start.saturating_add(3)) == Some(&' ') + && two_digits_at(&chars, start.saturating_add(4)) + { + return text.get(byte_index_for_char(text, start)..); + } + } + None +} + +fn byte_index_for_char(text: &str, char_index: usize) -> usize { + text + .char_indices() + .nth(char_index) + .map_or(text.len(), |(byte, _)| byte) +} + +const fn is_dash(ch: char) -> bool { + matches!(ch, '-' | '–' | '—') +} + +fn match_trailing_address_word<'a>( + after: &'a str, + filters: &DenyListFilterData, +) -> Option<&'a str> { + let (space_len, word_start) = consume_whitespace_no_newline(after, 1, 4)?; + let mut chars = word_start.chars(); + let first = chars.next()?; + let second = chars.next()?; + if !first.is_uppercase() || !second.is_lowercase() { + return None; + } + let rest_len = chars + .take_while(|ch| ch.is_lowercase()) + .map(char::len_utf8) + .sum::(); + let word_len = first + .len_utf8() + .saturating_add(second.len_utf8()) + .saturating_add(rest_len); + let word = word_start.get(..word_len)?; + if filters + .trailing_address_word_exclusions + .contains(&word.to_lowercase()) + { + return None; + } + after.get(..space_len.saturating_add(word_len)) +} + +fn consume_whitespace_no_newline( + text: &str, + min: usize, + max: usize, +) -> Option<(usize, &str)> { + let mut consumed = 0_usize; + let mut byte = 0_usize; + for ch in text.chars() { + if ch == '\n' || !ch.is_whitespace() || consumed == max { + break; 
+ } + consumed = consumed.saturating_add(1); + byte = byte.saturating_add(ch.len_utf8()); + } + (consumed >= min) + .then(|| text.get(byte..).map(|rest| (byte, rest))) + .flatten() +} + fn try_gazetteer_prefix_extension( full_text: &str, - offsets: &Utf16Offsets, + offsets: &ByteOffsets<'_>, found: &SearchMatch, ) -> Result)>> { let full_len = offsets.len()?; @@ -331,6 +1529,7 @@ fn try_gazetteer_prefix_extension( .end() .saturating_add(MAX_GAZETTEER_PREFIX_OVERSHOOT) .min(full_len); + let max_end = offsets.floor_offset(max_end)?; if max_end <= found.end().saturating_add(1) { return Ok(None); } @@ -357,7 +1556,7 @@ fn next_space_offset_after_initial(text: &str) -> u32 { let mut offset = 0_u32; for ch in text.chars() { - let width = u32::try_from(ch.len_utf16()).unwrap_or(u32::MAX); + let width = u32::try_from(ch.len_utf8()).unwrap_or(u32::MAX); if offset > 0 && ch == ' ' { return offset; } @@ -369,7 +1568,7 @@ fn next_space_offset_after_initial(text: &str) -> u32 { fn starts_as_proper_noun( full_text: &str, - offsets: &Utf16Offsets, + offsets: &ByteOffsets<'_>, start: u32, ) -> Result { let start_byte = offsets.validate_offset(start)?; @@ -391,7 +1590,7 @@ fn starts_as_proper_noun( fn custom_match_has_valid_edges( full_text: &str, - offsets: &Utf16Offsets, + offsets: &ByteOffsets<'_>, start: u32, end: u32, pattern: &str, diff --git a/crates/anonymize-core/src/redact.rs b/crates/anonymize-core/src/redact.rs index e51925b1..79ab16f6 100644 --- a/crates/anonymize-core/src/redact.rs +++ b/crates/anonymize-core/src/redact.rs @@ -1,10 +1,10 @@ +use crate::byte_offsets::ByteOffsets; use crate::normalize::placeholder_fallback; use crate::placeholders::build_placeholder_map; use crate::types::{ Entity, EntityKind, OperatorConfig, OperatorEntry, OperatorType, RedactionEntry, RedactionResult, Result, }; -use crate::utf16::Utf16Offsets; pub fn redact_text( full_text: &str, @@ -20,7 +20,7 @@ pub fn redact_text( }); } - let offsets = Utf16Offsets::new(full_text); + let offsets = 
ByteOffsets::new(full_text); validate_spans(entities, &offsets)?; let placeholder_map = build_placeholder_map(entities, full_text); @@ -99,7 +99,10 @@ pub fn deanonymise( result } -fn validate_spans(entities: &[Entity], offsets: &Utf16Offsets) -> Result<()> { +fn validate_spans( + entities: &[Entity], + offsets: &ByteOffsets<'_>, +) -> Result<()> { for entity in entities { // Empty spans would insert without redacting. if entity.start >= entity.end { @@ -124,7 +127,7 @@ struct RedactionSpan { fn redaction_spans( full_text: &str, entities: &[Entity], - offsets: &Utf16Offsets, + offsets: &ByteOffsets<'_>, ) -> Result> { let mut resolved = Vec::with_capacity(entities.len()); diff --git a/crates/anonymize-core/src/resolution/boundary.rs b/crates/anonymize-core/src/resolution/boundary.rs index 43e5950f..53f3b396 100644 --- a/crates/anonymize-core/src/resolution/boundary.rs +++ b/crates/anonymize-core/src/resolution/boundary.rs @@ -1,16 +1,16 @@ use std::collections::{BTreeMap, BTreeSet}; +use crate::byte_offsets::ByteOffsets; use crate::types::Result; -use crate::utf16::Utf16Offsets; -use super::common::{contains_span, entity_len, is_caller_owned, utf16_len}; +use super::common::{byte_len, contains_span, entity_len, is_caller_owned}; use super::{DetectionSource, PipelineEntity}; pub fn enforce_boundary_consistency( entities: &[PipelineEntity], full_text: &str, ) -> Result> { - let offsets = Utf16Offsets::new(full_text); + let offsets = ByteOffsets::new(full_text); let spans = char_spans(full_text); let boundaries = word_boundaries(&spans); let fixed = @@ -31,7 +31,7 @@ struct CharSpan { fn fix_partial_words( entities: &[PipelineEntity], full_text: &str, - offsets: &Utf16Offsets, + offsets: &ByteOffsets<'_>, spans: &[CharSpan], boundaries: &BTreeSet, ) -> Result> { @@ -83,7 +83,7 @@ fn fix_partial_words( fn resolve_cross_label_overlaps( entities: &[PipelineEntity], full_text: &str, - offsets: &Utf16Offsets, + offsets: &ByteOffsets<'_>, ) -> Result> { let mut sorted = 
entities.to_vec(); sorted.sort_by_key(|entity| entity.start); @@ -172,7 +172,7 @@ fn deduplicate_spans(entities: &[PipelineEntity]) -> Vec { fn merge_adjacent( entities: &[PipelineEntity], full_text: &str, - offsets: &Utf16Offsets, + offsets: &ByteOffsets<'_>, ) -> Result> { let mut sorted = entities.to_vec(); sorted.sort_by_key(|entity| entity.start); @@ -279,7 +279,7 @@ fn char_spans(text: &str) -> Vec { let mut offset = 0_u32; for ch in text.chars() { - let width = u32::try_from(ch.len_utf16()).unwrap_or(u32::MAX); + let width = u32::try_from(ch.len_utf8()).unwrap_or(u32::MAX); let end = offset.saturating_add(width); spans.push(CharSpan { start: offset, @@ -401,7 +401,7 @@ fn merge_into_previous( previous_index: usize, entity: &PipelineEntity, full_text: &str, - offsets: &Utf16Offsets, + offsets: &ByteOffsets<'_>, ) -> Result<()> { if let Some(previous) = entities.get_mut(previous_index) { previous.end = previous.end.max(entity.end); @@ -427,7 +427,7 @@ fn is_legal_form_organization(entity: &PipelineEntity) -> bool { fn is_mergeable_gap(gap: &str) -> bool { gap.is_empty() - || (utf16_len(gap) <= 3 + || (byte_len(gap) <= 3 && gap.chars().all(|ch| matches!(ch, ' ' | '\t' | ',' | '-'))) } diff --git a/crates/anonymize-core/src/resolution/common.rs b/crates/anonymize-core/src/resolution/common.rs index 0bc00b2a..7afe2b7e 100644 --- a/crates/anonymize-core/src/resolution/common.rs +++ b/crates/anonymize-core/src/resolution/common.rs @@ -18,6 +18,6 @@ pub(crate) const fn is_caller_owned(entity: &PipelineEntity) -> bool { ) } -pub(crate) fn utf16_len(text: &str) -> u32 { - u32::try_from(text.encode_utf16().count()).unwrap_or(u32::MAX) +pub(crate) fn byte_len(text: &str) -> u32 { + u32::try_from(text.len()).unwrap_or(u32::MAX) } diff --git a/crates/anonymize-core/src/resolution/mod.rs b/crates/anonymize-core/src/resolution/mod.rs index 379bae77..0c9463a8 100644 --- a/crates/anonymize-core/src/resolution/mod.rs +++ b/crates/anonymize-core/src/resolution/mod.rs @@ -7,4 
+7,5 @@ mod types; pub use boundary::enforce_boundary_consistency; pub use merge::merge_and_dedup; pub use sanitize::sanitize_entities; +pub(crate) use sanitize::sanitize_entities_with_source; pub use types::{DetectionSource, PipelineEntity, SourceDetail}; diff --git a/crates/anonymize-core/src/resolution/sanitize.rs b/crates/anonymize-core/src/resolution/sanitize.rs index 0c06b39c..276165b8 100644 --- a/crates/anonymize-core/src/resolution/sanitize.rs +++ b/crates/anonymize-core/src/resolution/sanitize.rs @@ -1,4 +1,7 @@ -use super::common::{is_caller_owned, utf16_len}; +use crate::byte_offsets::ByteOffsets; +use crate::types::Result; + +use super::common::{byte_len, is_caller_owned}; use super::{DetectionSource, PipelineEntity, SourceDetail}; const LEGAL_PERIOD_SUFFIXES: &str = @@ -16,7 +19,7 @@ pub fn sanitize_entities(entities: &[PipelineEntity]) -> Vec { continue; } - let Some(cleaned) = clean_entity_text(entity) else { + let Some(cleaned) = clean_entity_text(entity, &entity.text) else { continue; }; sanitized.push(cleaned); @@ -25,12 +28,37 @@ pub fn sanitize_entities(entities: &[PipelineEntity]) -> Vec { sanitized } -fn clean_entity_text(entity: &PipelineEntity) -> Option { +pub(crate) fn sanitize_entities_with_source( + entities: &[PipelineEntity], + full_text: &str, +) -> Result> { + let offsets = ByteOffsets::new(full_text); + let mut sanitized = Vec::new(); + + for entity in entities { + if is_caller_owned(entity) || has_curated_literal_boundary(entity) { + sanitized.push(entity.clone()); + continue; + } + + let raw_text = offsets.slice(full_text, entity.start, entity.end)?; + let Some(cleaned) = clean_entity_text(entity, &raw_text) else { + continue; + }; + sanitized.push(cleaned); + } + + Ok(sanitized) +} + +fn clean_entity_text( + entity: &PipelineEntity, + raw_text: &str, +) -> Option { let mut start_byte = 0; - let mut end_byte = entity.text.len(); + let mut end_byte = raw_text.len(); - while let Some((ch, len)) = 
first_char(entity.text.get(start_byte..end_byte)?) - { + while let Some((ch, len)) = first_char(raw_text.get(start_byte..end_byte)?) { if ch.is_whitespace() || is_leading_trim(ch, &entity.label) { start_byte = start_byte.saturating_add(len); continue; @@ -38,8 +66,7 @@ fn clean_entity_text(entity: &PipelineEntity) -> Option { break; } - while let Some((ch, len)) = last_char(entity.text.get(start_byte..end_byte)?) - { + while let Some((ch, len)) = last_char(raw_text.get(start_byte..end_byte)?) { if ch.is_whitespace() || is_trailing_trim(ch, &entity.label) { end_byte = end_byte.saturating_sub(len); continue; @@ -47,12 +74,11 @@ fn clean_entity_text(entity: &PipelineEntity) -> Option { break; } - if should_strip_period(entity, start_byte, end_byte) { + if should_strip_period(entity, raw_text, start_byte, end_byte) { end_byte = end_byte.saturating_sub('.'.len_utf8()); } - while let Some((ch, len)) = last_char(entity.text.get(start_byte..end_byte)?) - { + while let Some((ch, len)) = last_char(raw_text.get(start_byte..end_byte)?) 
{ if ch.is_whitespace() || is_trailing_trim(ch, &entity.label) { end_byte = end_byte.saturating_sub(len); continue; @@ -64,16 +90,16 @@ fn clean_entity_text(entity: &PipelineEntity) -> Option { return None; } - let cleaned_raw = entity.text.get(start_byte..end_byte)?; + let cleaned_raw = raw_text.get(start_byte..end_byte)?; if !cleaned_raw.chars().any(char::is_alphanumeric) { return None; } let display_text = collapse_display_whitespace(cleaned_raw); - let start = entity.start.saturating_add(utf16_len( - entity.text.get(..start_byte).unwrap_or_default(), - )); - let end = start.saturating_add(utf16_len(cleaned_raw)); + let start = entity + .start + .saturating_add(byte_len(raw_text.get(..start_byte).unwrap_or_default())); + let end = start.saturating_add(byte_len(cleaned_raw)); let mut cleaned = entity.clone(); cleaned.start = start; @@ -146,6 +172,7 @@ const fn is_literal_boundary_punct(ch: char) -> bool { fn should_strip_period( entity: &PipelineEntity, + raw_text: &str, start_byte: usize, end_byte: usize, ) -> bool { @@ -155,7 +182,7 @@ fn should_strip_period( ) { return false; } - let Some(text) = entity.text.get(start_byte..end_byte) else { + let Some(text) = raw_text.get(start_byte..end_byte) else { return false; }; if !text.ends_with('.') || known_period_suffix(text) { diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs index 4b165196..cbfcd97a 100644 --- a/crates/anonymize-core/src/search.rs +++ b/crates/anonymize-core/src/search.rs @@ -1,11 +1,7 @@ -use std::collections::BTreeMap; -use stella_aho_corasick_core as literal_core; -use stella_fuzzy_search_core as fuzzy_core; -use stella_regex_set_core as regex_core; +use stella_text_search_core as text_search; use crate::types::{Error, Result, SearchEngine, SearchMatch}; -// Preserves caller pattern indexes across primitive engines. 
#[derive(Clone, Debug, Eq, PartialEq)] pub enum SearchPattern { Literal(String), @@ -15,6 +11,13 @@ pub enum SearchPattern { whole_words: Option, }, Regex(String), + RegexWithOptions { + pattern: String, + lazy: bool, + prefilter_any: Vec, + prefilter_case_insensitive: Option, + prefilter_regex: Option, + }, Fuzzy { pattern: String, distance: Option, @@ -57,143 +60,153 @@ impl Default for FuzzySearchOptions { } pub struct SearchIndex { - literal: Vec, - regex: Option, - regex_pattern_indexes: Vec, - fuzzy: Option, - fuzzy_pattern_indexes: Vec, + slots: Vec, } -struct LiteralSlot { - engine: literal_core::AhoCorasick, +struct SearchSlot { + engine: SlotEngine, + search: text_search::TextSearch, pattern_indexes: Vec, } -type LiteralPatternGroup = (Vec, Vec); +#[derive(Clone, Copy)] +enum SlotEngine { + Literal, + Regex, + Fuzzy, +} impl SearchIndex { pub fn new( patterns: Vec, options: SearchOptions, ) -> Result { - let mut literal_groups = - BTreeMap::::new(); - let mut regex_patterns = Vec::::new(); - let mut regex_pattern_indexes = Vec::::new(); - let mut fuzzy_patterns = Vec::::new(); - let mut fuzzy_pattern_indexes = Vec::::new(); + let mut literals = Vec::new(); + let mut literal_indexes = Vec::new(); + let mut regex = Vec::new(); + let mut regex_indexes = Vec::new(); + let mut fuzzy = Vec::new(); + let mut fuzzy_indexes = Vec::new(); for (index, entry) in patterns.into_iter().enumerate() { let pattern_index = pattern_index(index)?; match entry { - SearchPattern::Literal(value) => { - push_literal_pattern( - &mut literal_groups, - options.literal, - value, - pattern_index, - ); + SearchPattern::Literal(pattern) => { + literals.push(text_search::PatternEntry::Literal( + text_search::LiteralPattern { + pattern, + name: None, + case_insensitive: None, + whole_words: None, + }, + )); + literal_indexes.push(pattern_index); } SearchPattern::LiteralWithOptions { pattern, case_insensitive, whole_words, } => { - push_literal_pattern( - &mut literal_groups, - 
LiteralSearchOptions { - case_insensitive: case_insensitive - .unwrap_or(options.literal.case_insensitive), - whole_words: whole_words.unwrap_or(options.literal.whole_words), + literals.push(text_search::PatternEntry::Literal( + text_search::LiteralPattern { + pattern, + name: None, + case_insensitive, + whole_words, }, - pattern, - pattern_index, - ); + )); + literal_indexes.push(pattern_index); } - SearchPattern::Regex(value) => { - regex_patterns.push(value); - regex_pattern_indexes.push(pattern_index); + SearchPattern::Regex(pattern) => { + regex.push(text_search::PatternEntry::Regex( + text_search::RegexPattern::new(pattern), + )); + regex_indexes.push(pattern_index); } - SearchPattern::Fuzzy { - pattern: fuzzy_pattern, - distance, + SearchPattern::RegexWithOptions { + pattern, + lazy, + prefilter_any, + prefilter_case_insensitive, + prefilter_regex, } => { - fuzzy_patterns.push(fuzzy_core::PatternEntry { - pattern: fuzzy_pattern, - distance, - }); - fuzzy_pattern_indexes.push(pattern_index); + let mut regex_pattern = text_search::RegexPattern::new(pattern); + regex_pattern.lazy = lazy; + regex_pattern.prefilter_any = prefilter_any; + regex_pattern.prefilter_case_insensitive = prefilter_case_insensitive; + regex_pattern.prefilter_regex = prefilter_regex; + regex.push(text_search::PatternEntry::Regex(regex_pattern)); + regex_indexes.push(pattern_index); + } + SearchPattern::Fuzzy { pattern, distance } => { + fuzzy.push(text_search::PatternEntry::Fuzzy( + text_search::FuzzyPattern::new( + pattern, + distance.map_or( + text_search::FuzzyDistance::Auto, + text_search::FuzzyDistance::Exact, + ), + ), + )); + fuzzy_indexes.push(pattern_index); } } } - let literal = build_literal_slots(literal_groups)?; - let regex = build_regex(regex_patterns, options.regex)?; - let fuzzy = build_fuzzy(fuzzy_patterns, options.fuzzy)?; - - Ok(Self { - literal, + let mut slots = Vec::new(); + push_slot( + &mut slots, + SlotEngine::Literal, + literals, + literal_indexes, + 
literal_options(options.literal), + )?; + push_slot( + &mut slots, + SlotEngine::Regex, regex, - regex_pattern_indexes, + regex_indexes, + regex_options(options.regex), + )?; + push_slot( + &mut slots, + SlotEngine::Fuzzy, fuzzy, - fuzzy_pattern_indexes, - }) + fuzzy_indexes, + fuzzy_options(options.fuzzy), + )?; + + Ok(Self { slots }) } pub fn find_iter(&self, haystack: &str) -> Result> { let mut matches = Vec::new(); - - for slot in &self.literal { - // Downstream merge priority chooses among overlaps. - extend_triple_matches( - &mut matches, - SearchEngine::Literal, - &slot.pattern_indexes, - &slot - .engine - .find_overlapping_iter_packed(haystack) - .map_err(|err| Error::Search { - engine: SearchEngine::Literal, - reason: err.to_string(), - })?, - |pattern, start, end| SearchMatch::Literal { - pattern, - start, - end, - }, - )?; - } - - if let Some(regex) = &self.regex { - extend_triple_matches( - &mut matches, - SearchEngine::Regex, - &self.regex_pattern_indexes, - ®ex - .find_iter_packed(haystack) - .map_err(|err| Error::Search { - engine: SearchEngine::Regex, - reason: err.to_string(), - })?, - |pattern, start, end| SearchMatch::Regex { - pattern, - start, - end, - }, - )?; - } - - if let Some(fuzzy) = &self.fuzzy { - extend_fuzzy_matches( - &mut matches, - &self.fuzzy_pattern_indexes, - &fuzzy - .find_iter_packed(haystack) - .map_err(|err| Error::Search { - engine: SearchEngine::Fuzzy, - reason: err.to_string(), - })?, - )?; + for slot in &self.slots { + for found in slot + .search + .find_iter(haystack) + .map_err(|error| search_error(&error))? 
+ { + let pattern = remap_pattern(slot, found.pattern)?; + matches.push(match slot.engine { + SlotEngine::Literal => SearchMatch::Literal { + pattern, + start: found.start, + end: found.end, + }, + SlotEngine::Regex => SearchMatch::Regex { + pattern, + start: found.start, + end: found.end, + }, + SlotEngine::Fuzzy => SearchMatch::Fuzzy { + pattern, + start: found.start, + end: found.end, + distance: found.distance.unwrap_or(0), + }, + }); + } } matches.sort_by(|left, right| { @@ -207,222 +220,103 @@ impl SearchIndex { } pub fn is_match(&self, haystack: &str) -> Result { - for slot in &self.literal { + for slot in &self.slots { if slot - .engine + .search .is_match(haystack) - .map_err(|err| Error::Search { - engine: SearchEngine::Literal, - reason: err.to_string(), - })? + .map_err(|error| search_error(&error))? { return Ok(true); } } - if let Some(regex) = &self.regex - && regex.is_match(haystack) - { - return Ok(true); - } - - if let Some(fuzzy) = &self.fuzzy - && fuzzy.is_match(haystack).map_err(|err| Error::Search { - engine: SearchEngine::Fuzzy, - reason: err.to_string(), - })? - { - return Ok(true); - } - Ok(false) } } -fn push_literal_pattern( - groups: &mut BTreeMap, - options: LiteralSearchOptions, - pattern: String, - pattern_index: u32, -) { - let (patterns, pattern_indexes) = groups.entry(options).or_default(); - patterns.push(pattern); - pattern_indexes.push(pattern_index); -} - -fn build_literal_slots( - groups: BTreeMap, -) -> Result> { - let mut slots = Vec::new(); - - for (options, (patterns, pattern_indexes)) in groups { - if let Some(engine) = build_literal(patterns, options)? 
{ - slots.push(LiteralSlot { - engine, - pattern_indexes, - }); - } +fn push_slot( + slots: &mut Vec, + engine: SlotEngine, + patterns: Vec, + pattern_indexes: Vec, + options: text_search::TextSearchOptions, +) -> Result<()> { + if patterns.is_empty() { + return Ok(()); } - Ok(slots) + let search = text_search::TextSearch::new(patterns, options) + .map_err(|error| search_error(&error))?; + slots.push(SearchSlot { + engine, + search, + pattern_indexes, + }); + Ok(()) } -fn build_literal( - patterns: Vec, +fn literal_options( options: LiteralSearchOptions, -) -> Result> { - if patterns.is_empty() { - return Ok(None); +) -> text_search::TextSearchOptions { + text_search::TextSearchOptions { + case_insensitive: options.case_insensitive, + whole_words: options.whole_words, + overlap_strategy: text_search::OverlapStrategy::All, + ..text_search::TextSearchOptions::default() } - - literal_core::AhoCorasick::new( - patterns, - literal_core::Options { - match_kind: literal_core::MatchKind::LeftmostFirst, - case_insensitive: options.case_insensitive, - dfa: false, - whole_words: options.whole_words, - }, - ) - .map(Some) - .map_err(|err| Error::Search { - engine: SearchEngine::Literal, - reason: err.to_string(), - }) } -fn build_regex( - patterns: Vec, +fn regex_options( options: RegexSearchOptions, -) -> Result> { - if patterns.is_empty() { - return Ok(None); +) -> text_search::TextSearchOptions { + text_search::TextSearchOptions { + whole_words: options.whole_words, + ..text_search::TextSearchOptions::default() } - - regex_core::RegexSet::new( - patterns, - regex_core::Options { - whole_words: options.whole_words, - unicode_boundaries: true, - }, - ) - .map(Some) - .map_err(|err| Error::Search { - engine: SearchEngine::Regex, - reason: err.to_string(), - }) } -fn build_fuzzy( - patterns: Vec, +fn fuzzy_options( options: FuzzySearchOptions, -) -> Result> { - if patterns.is_empty() { - return Ok(None); +) -> text_search::TextSearchOptions { + text_search::TextSearchOptions { 
+ case_insensitive: options.case_insensitive, + whole_words: options.whole_words, + normalize_diacritics: options.normalize_diacritics, + ..text_search::TextSearchOptions::default() } - - fuzzy_core::FuzzySearch::new( - patterns, - fuzzy_core::Options { - metric: fuzzy_core::Metric::Levenshtein, - normalize_diacritics: options.normalize_diacritics, - unicode_boundaries: true, - whole_words: options.whole_words, - case_insensitive: options.case_insensitive, - }, - ) - .map(Some) - .map_err(|err| Error::Search { - engine: SearchEngine::Fuzzy, - reason: err.to_string(), - }) } -fn extend_triple_matches( - matches: &mut Vec, - engine: SearchEngine, - pattern_indexes: &[u32], - packed: &[u32], - make_match: impl Fn(u32, u32, u32) -> SearchMatch, -) -> Result<()> { - let chunks = packed.chunks_exact(3); - if !chunks.remainder().is_empty() { - return Err(invalid_packed_search_result(engine, packed.len())); - } - - for chunk in chunks { - let [local_pattern, start, end] = chunk else { - return Err(invalid_packed_search_result(engine, packed.len())); - }; - let pattern = pattern_index_from_packed( - engine, - pattern_indexes, - *local_pattern, - packed.len(), - )?; - - matches.push(make_match(pattern, *start, *end)); - } - - Ok(()) +fn remap_pattern(slot: &SearchSlot, local_pattern: u32) -> Result { + let index = usize::try_from(local_pattern).map_err(|_| { + Error::PatternIndexNotAddressable { + pattern: local_pattern, + } + })?; + slot + .pattern_indexes + .get(index) + .copied() + .ok_or_else(|| Error::Search { + engine: slot.engine.into(), + reason: format!("Missing pattern map entry for {local_pattern}"), + }) } -fn extend_fuzzy_matches( - matches: &mut Vec, - pattern_indexes: &[u32], - packed: &[u32], -) -> Result<()> { - let chunks = packed.chunks_exact(4); - if !chunks.remainder().is_empty() { - return Err(invalid_packed_search_result( - SearchEngine::Fuzzy, - packed.len(), - )); +fn search_error(error: &text_search::Error) -> Error { + Error::Search { + engine: 
SearchEngine::Text, + reason: error.to_string(), } - - for chunk in chunks { - let [local_pattern, start, end, distance] = chunk else { - return Err(invalid_packed_search_result( - SearchEngine::Fuzzy, - packed.len(), - )); - }; - let pattern = pattern_index_from_packed( - SearchEngine::Fuzzy, - pattern_indexes, - *local_pattern, - packed.len(), - )?; - - matches.push(SearchMatch::Fuzzy { - pattern, - start: *start, - end: *end, - distance: *distance, - }); - } - - Ok(()) -} - -fn pattern_index_from_packed( - engine: SearchEngine, - pattern_indexes: &[u32], - local_pattern: u32, - len: usize, -) -> Result { - usize::try_from(local_pattern) - .ok() - .and_then(|index| pattern_indexes.get(index)) - .copied() - .ok_or_else(|| invalid_packed_search_result(engine, len)) } -const fn invalid_packed_search_result( - engine: SearchEngine, - len: usize, -) -> Error { - Error::InvalidPackedSearchResult { engine, len } +impl From for SearchEngine { + fn from(value: SlotEngine) -> Self { + match value { + SlotEngine::Literal => Self::Literal, + SlotEngine::Regex => Self::Regex, + SlotEngine::Fuzzy => Self::Fuzzy, + } + } } fn pattern_index(index: usize) -> Result { diff --git a/crates/anonymize-core/src/types.rs b/crates/anonymize-core/src/types.rs index 38201587..0de292fb 100644 --- a/crates/anonymize-core/src/types.rs +++ b/crates/anonymize-core/src/types.rs @@ -9,10 +9,10 @@ pub enum Error { start: u32, end: u32, }, - Utf16OffsetOutOfBounds { + ByteOffsetOutOfBounds { offset: u32, }, - Utf16OffsetInsideSurrogate { + ByteOffsetInsideCodepoint { offset: u32, }, Search { @@ -26,6 +26,9 @@ pub enum Error { PatternIndexOutOfRange { index: usize, }, + PatternIndexNotAddressable { + pattern: u32, + }, UnsupportedRegexValidation { pattern: u32, }, @@ -35,6 +38,9 @@ pub enum Error { UnsupportedDenyListSource { source: String, }, + MissingStaticData { + field: &'static str, + }, StaticDataLengthMismatch { field: &'static str, expected: usize, @@ -48,14 +54,11 @@ impl fmt::Display for 
Error { Self::InvalidSpan { start, end } => { write!(formatter, "Invalid entity span: {start}..{end}") } - Self::Utf16OffsetOutOfBounds { offset } => { - write!(formatter, "UTF-16 offset is out of bounds: {offset}") + Self::ByteOffsetOutOfBounds { offset } => { + write!(formatter, "Byte offset is out of bounds: {offset}") } - Self::Utf16OffsetInsideSurrogate { offset } => { - write!( - formatter, - "UTF-16 offset is not a scalar boundary: {offset}" - ) + Self::ByteOffsetInsideCodepoint { offset } => { + write!(formatter, "Byte offset is not a UTF-8 boundary: {offset}") } Self::Search { engine, reason } => { write!(formatter, "{engine} search failed: {reason}") @@ -69,6 +72,12 @@ impl fmt::Display for Error { Self::PatternIndexOutOfRange { index } => { write!(formatter, "Search pattern index exceeds u32 range: {index}") } + Self::PatternIndexNotAddressable { pattern } => { + write!( + formatter, + "Search pattern index is not addressable: {pattern}" + ) + } Self::UnsupportedRegexValidation { pattern } => { write!( formatter, @@ -87,6 +96,9 @@ impl fmt::Display for Error { "Deny-list source '{source}' is not supported by native core" ) } + Self::MissingStaticData { field } => { + write!(formatter, "Static data field '{field}' is required") + } Self::StaticDataLengthMismatch { field, expected, @@ -109,7 +121,7 @@ pub enum EntityKind { Coreference { source_text: String }, } -/// Source span with UTF-16 offsets. +/// Source span with UTF-8 byte offsets. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Entity { pub start: u32, @@ -281,6 +293,7 @@ pub enum SearchEngine { Literal, Regex, Fuzzy, + Text, } impl fmt::Display for SearchEngine { @@ -289,11 +302,12 @@ impl fmt::Display for SearchEngine { Self::Literal => formatter.write_str("literal"), Self::Regex => formatter.write_str("regex"), Self::Fuzzy => formatter.write_str("fuzzy"), + Self::Text => formatter.write_str("text-search"), } } } -/// Search match with the caller's pattern index. 
+/// Search match with the caller's pattern index and UTF-8 byte offsets. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum SearchMatch { Literal { @@ -350,4 +364,28 @@ impl SearchMatch { | Self::Fuzzy { end, .. } => *end, } } + + #[must_use] + pub(crate) const fn with_span(self, start: u32, end: u32) -> Self { + match self { + Self::Literal { pattern, .. } => Self::Literal { + pattern, + start, + end, + }, + Self::Regex { pattern, .. } => Self::Regex { + pattern, + start, + end, + }, + Self::Fuzzy { + pattern, distance, .. + } => Self::Fuzzy { + pattern, + start, + end, + distance, + }, + } + } } diff --git a/crates/anonymize-core/src/utf16.rs b/crates/anonymize-core/src/utf16.rs deleted file mode 100644 index 6afb583b..00000000 --- a/crates/anonymize-core/src/utf16.rs +++ /dev/null @@ -1,66 +0,0 @@ -use crate::types::{Error, Result}; - -pub(crate) struct Utf16Offsets { - offsets: Vec>, -} - -impl Utf16Offsets { - pub(crate) fn new(text: &str) -> Self { - let capacity = text.encode_utf16().count().saturating_add(1); - let mut offsets = Vec::with_capacity(capacity); - offsets.push(Some(0)); - - let mut byte_cursor: usize = 0; - for ch in text.chars() { - byte_cursor = byte_cursor.saturating_add(ch.len_utf8()); - if ch.len_utf16() == 2 { - offsets.push(None); - } - offsets.push(Some(byte_cursor)); - } - - Self { offsets } - } - - pub(crate) fn len(&self) -> Result { - let len = self - .offsets - .len() - .checked_sub(1) - .ok_or(Error::Utf16OffsetOutOfBounds { offset: 0 })?; - u32::try_from(len) - .map_err(|_| Error::Utf16OffsetOutOfBounds { offset: u32::MAX }) - } - - pub(crate) fn validate_offset(&self, offset: u32) -> Result { - let index = usize::try_from(offset) - .map_err(|_| Error::Utf16OffsetOutOfBounds { offset })?; - self - .offsets - .get(index) - .copied() - .ok_or(Error::Utf16OffsetOutOfBounds { offset })? 
- .ok_or(Error::Utf16OffsetInsideSurrogate { offset }) - } - - pub(crate) fn slice( - &self, - full_text: &str, - start: u32, - end: u32, - ) -> Result { - if start > end { - return Err(Error::InvalidSpan { start, end }); - } - - let start_byte = self.validate_offset(start)?; - let end_byte = self.validate_offset(end)?; - - Ok( - full_text - .get(start_byte..end_byte) - .ok_or(Error::InvalidSpan { start, end })? - .to_owned(), - ) - } -} diff --git a/crates/anonymize-core/tests/normalize.rs b/crates/anonymize-core/tests/normalize.rs index 7d24ee4c..13f5f98f 100644 --- a/crates/anonymize-core/tests/normalize.rs +++ b/crates/anonymize-core/tests/normalize.rs @@ -11,10 +11,10 @@ fn normalize_for_search_matches_ts_replacements() { } #[test] -fn normalize_for_search_preserves_utf16_width() { +fn normalize_for_search_does_not_preserve_byte_width() { let input = "a\u{00a0}\u{1f600}\u{2013}b"; let output = normalize_for_search(input); assert_eq!(output, "a \u{1f600}-b"); - assert_eq!(output.encode_utf16().count(), input.encode_utf16().count()); + assert_ne!(output.len(), input.len()); } diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 38ec9990..152f17d2 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -1,11 +1,11 @@ #![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] use stella_anonymize_core::{ - CountryMatchData, DenyListMatchData, DetectionSource, Error, - FuzzySearchOptions, GazetteerMatchData, LiteralSearchOptions, OperatorConfig, - PatternSlice, PreparedSearch, PreparedSearchConfig, PreparedSearchSlices, - RegexMatchMeta, RegexSearchOptions, SearchOptions, SearchPattern, - SourceDetail, + CountryMatchData, DenyListFilterData, DenyListMatchData, DetectionSource, + DiagnosticEventKind, DiagnosticStage, Error, FuzzySearchOptions, + GazetteerMatchData, LiteralSearchOptions, OperatorConfig, PatternSlice, + PreparedSearch, PreparedSearchConfig, 
PreparedSearchSlices, RegexMatchMeta, + RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, }; fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { @@ -111,6 +111,7 @@ fn prepared_search_emits_static_detector_entities() { score: 1.0, source_detail: Some(SourceDetail::CustomRegex), requires_validation: false, + min_byte_length: None, }], deny_list_data: None, gazetteer_data: Some(GazetteerMatchData { @@ -202,6 +203,75 @@ fn prepared_search_redacts_static_entities_end_to_end() { assert_eq!(result.resolved_entities.len(), 3); } +#[test] +fn prepared_search_reports_static_redaction_diagnostics() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\b[A-Z]{2}\d{4}\b", + ))], + custom_regex_patterns: vec![], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Acme"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + regex_options: SearchOptions { + regex: RegexSearchOptions { whole_words: false }, + ..SearchOptions::default() + }, + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], + custom_regex_meta: vec![], + deny_list_data: None, + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![false], + }), + country_data: None, + }) + .unwrap(); + + let result = prepared + .redact_static_entities_with_diagnostics( + "Acme s.r.o. 
filed AB1234.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!( + result.result.redaction.redacted_text, + "[ORGANIZATION_1] filed [REGISTRATION_NUMBER_1]." + ); + assert!(result.diagnostics.events.iter().any(|event| { + event.stage == DiagnosticStage::SearchRegex + && event.kind == DiagnosticEventKind::StageSummary + && event.count == Some(1) + })); + assert!(result.diagnostics.events.iter().any(|event| { + event.stage == DiagnosticStage::Sanitize + && event.kind == DiagnosticEventKind::Entity + && event.label.as_deref() == Some("organization") + && event.span_valid == Some(true) + })); + assert!(result.diagnostics.events.iter().any(|event| { + event.stage == DiagnosticStage::Redaction + && event.kind == DiagnosticEventKind::StageSummary + && event.count == Some(2) + })); +} + #[test] fn prepared_search_redacts_custom_deny_list_entities() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -232,6 +302,7 @@ fn prepared_search_redacts_custom_deny_list_entities() { custom_labels: vec![vec![String::from("matter")]], originals: vec![String::from("Secret Code")], sources: vec![vec![String::from("custom-deny-list")]], + filters: None, }), gazetteer_data: None, country_data: None, @@ -294,13 +365,50 @@ fn prepared_search_rejects_unsupported_static_slices() { } #[test] -fn prepared_search_rejects_curated_deny_list_sources() { +fn prepared_search_redacts_curated_deny_list_entities() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Prague"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]], + custom_labels: vec![vec![]], + originals: vec![String::from("Prague")], + sources: 
vec![vec![String::from("city")]], + filters: Some(DenyListFilterData::default()), + }), + ..empty_config(PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }) + }) + .unwrap(); + + let result = prepared + .redact_static_entities("Prague filed.", &OperatorConfig::default()) + .unwrap(); + + assert_eq!(result.redaction.redacted_text, "[ADDRESS_1] filed."); +} + +#[test] +fn prepared_search_rejects_curated_deny_list_without_filters() { let error = PreparedSearch::new(PreparedSearchConfig { deny_list_data: Some(DenyListMatchData { labels: vec![vec![String::from("address")]], custom_labels: vec![vec![]], originals: vec![String::from("Prague")], sources: vec![vec![String::from("city")]], + filters: None, }), ..empty_config(PreparedSearchSlices { deny_list: PatternSlice { start: 0, end: 1 }, @@ -312,8 +420,8 @@ fn prepared_search_rejects_curated_deny_list_sources() { assert_eq!( error, - Error::UnsupportedDenyListSource { - source: String::from("city") + Error::MissingStaticData { + field: "deny_list.filters" } ); } @@ -326,6 +434,7 @@ fn prepared_search_rejects_truncated_deny_list_data() { custom_labels: vec![], originals: vec![String::from("Secret Code")], sources: vec![vec![String::from("custom-deny-list")]], + filters: None, }), ..empty_config(PreparedSearchSlices { deny_list: PatternSlice { start: 0, end: 1 }, diff --git a/crates/anonymize-core/tests/processors.rs b/crates/anonymize-core/tests/processors.rs index 9cdee5e6..560a2e1c 100644 --- a/crates/anonymize-core/tests/processors.rs +++ b/crates/anonymize-core/tests/processors.rs @@ -1,14 +1,14 @@ #![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] use stella_anonymize_core::{ - CountryMatchData, DenyListMatchData, DetectionSource, Error, - GazetteerMatchData, PatternSlice, PipelineEntity, RegexMatchMeta, + CountryMatchData, DenyListFilterData, DenyListMatchData, DetectionSource, + Error, GazetteerMatchData, PatternSlice, 
PipelineEntity, RegexMatchMeta, SearchMatch, SourceDetail, process_country_matches, process_deny_list_matches, process_gazetteer_matches, process_regex_matches, }; #[test] -fn regex_processor_filters_slice_and_short_phone_matches() { +fn regex_processor_filters_slice_and_short_matches_by_meta() { let matches = vec![ SearchMatch::Regex { pattern: 0, @@ -28,7 +28,13 @@ fn regex_processor_filters_slice_and_short_phone_matches() { ]; let meta = vec![ RegexMatchMeta::new("person", 0.8), - RegexMatchMeta::new("phone number", 0.8), + RegexMatchMeta { + label: String::from("short gated"), + score: 0.8, + source_detail: None, + requires_validation: false, + min_byte_length: Some(7), + }, ]; let entities = process_regex_matches( @@ -64,6 +70,7 @@ fn regex_processor_rejects_unported_validators() { score: 0.9, source_detail: None, requires_validation: true, + min_byte_length: None, }]; let err = process_regex_matches( @@ -92,6 +99,7 @@ fn regex_processor_preserves_custom_regex_source_detail() { score: 0.7, source_detail: Some(SourceDetail::CustomRegex), requires_validation: false, + min_byte_length: None, }]; let entities = process_regex_matches( @@ -117,6 +125,7 @@ fn deny_list_processor_emits_custom_labels() { custom_labels: vec![vec![String::from("matter")]], originals: vec![String::from("Secret Code")], sources: vec![vec![String::from("custom-deny-list")]], + filters: None, }; let entities = process_deny_list_matches( @@ -155,6 +164,7 @@ fn deny_list_processor_rejects_embedded_custom_word_matches() { custom_labels: vec![vec![String::from("matter")]], originals: vec![String::from("Secret")], sources: vec![vec![String::from("custom-deny-list")]], + filters: None, }; let entities = process_deny_list_matches( @@ -170,7 +180,35 @@ fn deny_list_processor_rejects_embedded_custom_word_matches() { } #[test] -fn deny_list_processor_rejects_curated_sources() { +fn deny_list_processor_emits_curated_non_person_labels() { + let matches = vec![SearchMatch::Literal { + pattern: 0, + 
start: 0, + end: 6, + }]; + let data = DenyListMatchData { + labels: vec![vec![String::from("address")]], + custom_labels: vec![vec![]], + originals: vec![String::from("Prague")], + sources: vec![vec![String::from("city")]], + filters: Some(DenyListFilterData::default()), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + "Prague", + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].label, "address"); + assert_eq!(entities[0].source_detail, None); +} + +#[test] +fn deny_list_processor_rejects_curated_sources_without_filters() { let matches = vec![SearchMatch::Literal { pattern: 0, start: 0, @@ -181,6 +219,7 @@ fn deny_list_processor_rejects_curated_sources() { custom_labels: vec![vec![]], originals: vec![String::from("Prague")], sources: vec![vec![String::from("city")]], + filters: None, }; let error = process_deny_list_matches( @@ -193,8 +232,8 @@ fn deny_list_processor_rejects_curated_sources() { assert_eq!( error, - Error::UnsupportedDenyListSource { - source: String::from("city") + Error::MissingStaticData { + field: "deny_list.filters" } ); } diff --git a/crates/anonymize-core/tests/redaction.rs b/crates/anonymize-core/tests/redaction.rs index 8dea3889..8ec5688d 100644 --- a/crates/anonymize-core/tests/redaction.rs +++ b/crates/anonymize-core/tests/redaction.rs @@ -25,13 +25,13 @@ fn entity_with_display_text( let prefix = text .get(..byte_start) .unwrap_or_else(|| panic!("invalid fixture boundary: {byte_start}")); - let start = utf16_len(prefix); - let end = start.saturating_add(utf16_len(value)); + let start = byte_len(prefix); + let end = start.saturating_add(byte_len(value)); Entity::detected(start, end, label, display_text) } -fn utf16_len(text: &str) -> u32 { - u32::try_from(text.encode_utf16().count()).unwrap_or(u32::MAX) +fn byte_len(text: &str) -> u32 { + u32::try_from(text.len()).unwrap_or(u32::MAX) } #[test] @@ -237,10 +237,10 @@ fn 
redact_operator_is_not_reversible() { } #[test] -fn utf16_offsets_apply_non_ascii_spans() { +fn byte_offsets_apply_non_ascii_spans() { let text = "A 🦀 Bob"; - let start = 5; - let end = 8; + let start = byte_len("A 🦀 "); + let end = start.saturating_add(byte_len("Bob")); let entities = vec![Entity::detected(start, end, "person", "Bob")]; let result = @@ -282,14 +282,14 @@ fn detected_placeholder_identity_uses_sanitized_text() { } #[test] -fn invalid_utf16_boundary_is_rejected() { +fn invalid_byte_boundary_is_rejected() { let text = "A 🦀 Bob"; let entities = vec![Entity::detected(3, 5, "person", " Bob")]; let error = redact_text(text, &entities, &OperatorConfig::default()) .expect_err("offset inside a surrogate pair must fail"); - assert_eq!(error, Error::Utf16OffsetInsideSurrogate { offset: 3 }); + assert_eq!(error, Error::ByteOffsetInsideCodepoint { offset: 3 }); } #[test] diff --git a/crates/anonymize-core/tests/resolution.rs b/crates/anonymize-core/tests/resolution.rs index 996ed924..47a3f28e 100644 --- a/crates/anonymize-core/tests/resolution.rs +++ b/crates/anonymize-core/tests/resolution.rs @@ -27,11 +27,11 @@ fn text_entity( label: &str, source: DetectionSource, ) -> PipelineEntity { - PipelineEntity::detected(0, utf16_len(text), label, text, 0.9, source) + PipelineEntity::detected(0, byte_len(text), label, text, 0.9, source) } -fn utf16_len(text: &str) -> u32 { - u32::try_from(text.encode_utf16().count()).unwrap_or(u32::MAX) +fn byte_len(text: &str) -> u32 { + u32::try_from(text.len()).unwrap_or(u32::MAX) } #[test] @@ -153,11 +153,11 @@ fn same_span_country_loses_to_person() { } #[test] -fn sanitize_trims_punctuation_and_updates_utf16_offsets() { +fn sanitize_trims_punctuation_and_updates_byte_offsets() { let mut input = text_entity("\"Tesla Shares\"", "organization", DetectionSource::Ner); input.start = 10; - input.end = 10_u32.saturating_add(utf16_len(&input.text)); + input.end = 10_u32.saturating_add(byte_len(&input.text)); let result = 
sanitize_entities(&[input]); assert_eq!(result.len(), 1); @@ -218,10 +218,20 @@ fn sanitize_drops_empty_entities() { #[test] fn boundary_merges_adjacent_same_label_entities() { let full_text = "Kontaktujte Jan Novák prosím."; + let jan_start = byte_len("Kontaktujte "); + let jan_end = jan_start.saturating_add(byte_len("Jan")); + let surname_start = jan_end.saturating_add(byte_len(" ")); + let surname_end = surname_start.saturating_add(byte_len("Novák")); let result = enforce_boundary_consistency( &[ - entity(DetectionSource::Ner, 0.8, 12, 15, "person"), - entity(DetectionSource::Ner, 0.95, 16, 21, "person"), + entity(DetectionSource::Ner, 0.8, jan_start, jan_end, "person"), + entity( + DetectionSource::Ner, + 0.95, + surname_start, + surname_end, + "person", + ), ], full_text, ) @@ -230,18 +240,20 @@ fn boundary_merges_adjacent_same_label_entities() { assert_eq!(result.len(), 1); let person = result.first().expect("person"); assert_eq!(person.text, "Jan Novák"); - assert_eq!(person.start, 12); - assert_eq!(person.end, 21); + assert_eq!(person.start, jan_start); + assert_eq!(person.end, surname_end); assert_eq!(person.score, 0.95); } #[test] fn boundary_expands_partial_words() { let full_text = "Kontaktujte Novák prosím."; + let start = byte_len("Kontaktujte "); + let partial_end = start.saturating_add(byte_len("Nová")); let result = enforce_boundary_consistency( &[PipelineEntity::detected( - 12, - 16, + start, + partial_end, "person", "Nová", 0.9, @@ -254,14 +266,14 @@ fn boundary_expands_partial_words() { assert_eq!(result.len(), 1); let person = result.first().expect("person"); assert_eq!(person.text, "Novák"); - assert_eq!(person.end, 17); + assert_eq!(person.end, start.saturating_add(byte_len("Novák"))); } #[test] fn boundary_expands_inside_apostrophe_names() { let full_text = "Kontaktujte O'Connor prosím."; - let start = utf16_len("Kontaktujte O'"); - let end = start.saturating_add(utf16_len("Connor")); + let start = byte_len("Kontaktujte O'"); + let end = 
start.saturating_add(byte_len("Connor")); let result = enforce_boundary_consistency( &[PipelineEntity::detected( start, @@ -277,15 +289,15 @@ fn boundary_expands_inside_apostrophe_names() { assert_eq!(result.len(), 1); let person = result.first().expect("person"); - assert_eq!(person.start, utf16_len("Kontaktujte ")); + assert_eq!(person.start, byte_len("Kontaktujte ")); assert_eq!(person.text, "O'Connor"); } #[test] fn boundary_expands_across_combining_marks() { let full_text = "Podepsal Cafe\u{0301}."; - let start = utf16_len("Podepsal "); - let end = start.saturating_add(utf16_len("Cafe")); + let start = byte_len("Podepsal "); + let end = start.saturating_add(byte_len("Cafe")); let result = enforce_boundary_consistency( &[PipelineEntity::detected( start, @@ -359,7 +371,7 @@ fn boundary_removes_nested_same_label_entities() { &[ PipelineEntity::detected( 0, - 16, + byte_len("Ing. Pavel Novák"), "person", "Ing. Pavel Novák", 0.9, diff --git a/crates/anonymize-core/tests/search.rs b/crates/anonymize-core/tests/search.rs index a8e37f18..ddfacc00 100644 --- a/crates/anonymize-core/tests/search.rs +++ b/crates/anonymize-core/tests/search.rs @@ -59,7 +59,7 @@ fn search_index_routes_literal_regex_and_fuzzy_patterns() { } #[test] -fn search_index_preserves_utf16_offsets_from_primitive_engines() { +fn search_index_preserves_byte_offsets_from_primitive_engines() { const SUPPLEMENTARY_SCALAR: &str = "\u{1F9EA}"; let index = SearchIndex::new( @@ -80,12 +80,12 @@ fn search_index_preserves_utf16_offsets_from_primitive_engines() { SearchMatch::Regex { pattern: 1, start: 2, - end: 4, + end: 6, }, SearchMatch::Literal { pattern: 0, - start: 5, - end: 8, + start: 7, + end: 10, }, ] ); diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index 4936edeb..51736589 100644 --- a/crates/anonymize-napi/src/lib.rs +++ b/crates/anonymize-napi/src/lib.rs @@ -3,15 +3,17 @@ use std::collections::BTreeMap; use napi::bindgen_prelude::*; use napi_derive::napi; use 
stella_anonymize_adapter_contract::{ - BindingCountryMatchData, BindingDenyListMatchData, BindingGazetteerMatchData, - BindingOperatorConfig, BindingOperatorEntry, BindingPatternSlice, - BindingPreparedSearchConfig, BindingPreparedSearchSlices, - BindingRedactionResult, BindingRegexMatchMeta, BindingSearchOptions, - BindingSearchPattern, BindingStaticRedactionResult, ContractError, - operator_config_from_binding, prepared_search_config_from_binding, - static_redaction_result_to_binding, + BindingCountryMatchData, BindingDenyListFilterData, BindingDenyListMatchData, + BindingGazetteerMatchData, BindingOperatorConfig, BindingOperatorEntry, + BindingPatternSlice, BindingPreparedSearchConfig, + BindingPreparedSearchSlices, BindingRedactionResult, BindingRegexMatchMeta, + BindingSearchOptions, BindingSearchPattern, BindingStaticRedactionResult, + ContractError, operator_config_from_binding, + prepared_search_config_from_binding, + static_redaction_diagnostic_result_to_binding, + static_redaction_diagnostics_to_binding, static_redaction_result_to_binding, }; -use stella_anonymize_core::PreparedSearch; +use stella_anonymize_core::{PreparedSearch, StaticRedactionDiagnostics}; #[napi(object)] pub struct JsSearchPattern { @@ -20,6 +22,10 @@ pub struct JsSearchPattern { pub distance: Option, pub case_insensitive: Option, pub whole_words: Option, + pub lazy: Option, + pub prefilter_any: Option>, + pub prefilter_case_insensitive: Option, + pub prefilter_regex: Option, } #[napi(object)] @@ -56,6 +62,7 @@ pub struct JsRegexMatchMeta { pub score: f64, pub source_detail: Option, pub requires_validation: Option, + pub min_byte_length: Option, } #[napi(object)] @@ -75,6 +82,21 @@ pub struct JsDenyListMatchData { pub custom_labels: Vec>, pub originals: Vec, pub sources: Vec>, + pub filters: Option, +} + +#[napi(object)] +pub struct JsDenyListFilterData { + pub stopwords: Vec, + pub allow_list: Vec, + pub person_stopwords: Vec, + pub address_stopwords: Vec, + pub street_types: Vec, + 
pub first_names: Vec, + pub generic_roles: Vec, + pub sentence_starters: Vec, + pub trailing_address_word_exclusions: Vec, + pub defined_term_cues: Vec, } #[napi(object)] @@ -175,9 +197,46 @@ pub fn redact_static_entities_json( serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) } +#[napi] +#[allow(clippy::needless_pass_by_value)] +pub fn redact_static_entities_diagnostics_json( + config_json: String, + full_text: String, + operators_json: Option, +) -> Result { + let config = + serde_json::from_str::(&config_json) + .map_err(|error| to_napi_serde_error(&error))?; + let operators = operators_json + .as_deref() + .map(serde_json::from_str::) + .transpose() + .map_err(|error| to_napi_serde_error(&error))?; + let prepared = PreparedSearch::new_with_diagnostics( + prepared_search_config_from_binding(config) + .map_err(|error| to_napi_contract_error(&error))?, + ) + .map_err(|error| to_napi_core_error(&error))?; + let mut diagnostics = prepared.diagnostics; + let mut result = prepared + .prepared + .redact_static_entities_with_diagnostics( + &full_text, + &operator_config_from_binding(operators) + .map_err(|error| to_napi_contract_error(&error))?, + ) + .map_err(|error| to_napi_core_error(&error))?; + diagnostics.extend(result.diagnostics); + result.diagnostics = diagnostics; + let result = static_redaction_diagnostic_result_to_binding(result); + + serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) +} + #[napi] pub struct NativePreparedSearch { inner: PreparedSearch, + prepare_diagnostics: StaticRedactionDiagnostics, } #[napi] @@ -186,9 +245,21 @@ impl NativePreparedSearch { pub fn new(config: JsPreparedSearchConfig) -> Result { let config = prepared_search_config_from_binding(to_binding_config(config)) .map_err(|error| to_napi_contract_error(&error))?; - PreparedSearch::new(config) - .map(|inner| Self { inner }) - .map_err(|error| to_napi_core_error(&error)) + let result = PreparedSearch::new_with_diagnostics(config) + 
.map_err(|error| to_napi_core_error(&error))?; + Ok(Self { + inner: result.prepared, + prepare_diagnostics: result.diagnostics, + }) + } + + #[napi] + pub fn prepare_diagnostics_json(&self) -> Result { + let diagnostics = + static_redaction_diagnostics_to_binding(self.prepare_diagnostics.clone()); + + serde_json::to_string(&diagnostics) + .map_err(|error| to_napi_serde_error(&error)) } #[napi] @@ -208,6 +279,25 @@ impl NativePreparedSearch { .map(to_js_static_redaction_result) .map_err(|error| to_napi_core_error(&error))? } + + #[napi] + #[allow(clippy::needless_pass_by_value)] + pub fn redact_static_entities_diagnostics_json( + &self, + full_text: String, + operators: Option, + ) -> Result { + let operators = + operator_config_from_binding(operators.map(to_binding_operator_config)) + .map_err(|error| to_napi_contract_error(&error))?; + let result = self + .inner + .redact_static_entities_with_diagnostics(&full_text, &operators) + .map(static_redaction_diagnostic_result_to_binding) + .map_err(|error| to_napi_core_error(&error))?; + + serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) + } } fn to_binding_config( @@ -232,6 +322,7 @@ fn to_binding_config( custom_labels: data.custom_labels, originals: data.originals, sources: data.sources, + filters: data.filters.map(to_binding_deny_list_filters), } }), gazetteer_data: config.gazetteer_data.map(|data| { @@ -246,6 +337,23 @@ fn to_binding_config( } } +fn to_binding_deny_list_filters( + filters: JsDenyListFilterData, +) -> BindingDenyListFilterData { + BindingDenyListFilterData { + stopwords: filters.stopwords, + allow_list: filters.allow_list, + person_stopwords: filters.person_stopwords, + address_stopwords: filters.address_stopwords, + street_types: filters.street_types, + first_names: filters.first_names, + generic_roles: filters.generic_roles, + sentence_starters: filters.sentence_starters, + trailing_address_word_exclusions: filters.trailing_address_word_exclusions, + defined_term_cues: 
filters.defined_term_cues, + } +} + fn to_binding_patterns( patterns: Vec, ) -> Vec { @@ -257,6 +365,10 @@ fn to_binding_patterns( distance: pattern.distance, case_insensitive: pattern.case_insensitive, whole_words: pattern.whole_words, + lazy: pattern.lazy, + prefilter_any: pattern.prefilter_any, + prefilter_case_insensitive: pattern.prefilter_case_insensitive, + prefilter_regex: pattern.prefilter_regex, }) .collect() } @@ -304,6 +416,7 @@ fn to_binding_regex_meta( score: entry.score, source_detail: entry.source_detail, requires_validation: entry.requires_validation, + min_byte_length: entry.min_byte_length, }) .collect() } diff --git a/crates/anonymize-py/src/lib.rs b/crates/anonymize-py/src/lib.rs index abd6a00d..92c6e374 100644 --- a/crates/anonymize-py/src/lib.rs +++ b/crates/anonymize-py/src/lib.rs @@ -4,9 +4,13 @@ use stella_anonymize_adapter_contract::{ BindingOperatorConfig, BindingOperatorEntry, BindingPipelineEntity, BindingPreparedSearchConfig, BindingRedactionEntry, BindingRedactionResult, BindingStaticRedactionResult, ContractError, operator_config_from_binding, - prepared_search_config_from_binding, static_redaction_result_to_binding, + prepared_search_config_from_binding, + static_redaction_diagnostic_result_to_binding, + static_redaction_diagnostics_to_binding, static_redaction_result_to_binding, +}; +use stella_anonymize_core::{ + PreparedSearch as CorePreparedSearch, StaticRedactionDiagnostics, }; -use stella_anonymize_core::PreparedSearch as CorePreparedSearch; #[pyclass(name = "RedactionEntry", get_all, skip_from_py_object)] #[derive(Clone)] @@ -53,6 +57,7 @@ pub struct PyStaticRedactionResult { #[pyclass(name = "PreparedSearch")] pub struct PyPreparedSearch { inner: CorePreparedSearch, + prepare_diagnostics: StaticRedactionDiagnostics, } #[pymethods] @@ -60,12 +65,23 @@ impl PyPreparedSearch { #[new] fn new(config_json: &str) -> PyResult { let config = parse_prepared_search_config(config_json)?; - let inner = CorePreparedSearch::new( + let 
result = CorePreparedSearch::new_with_diagnostics( prepared_search_config_from_binding(config) .map_err(|error| to_py_contract_error(&error))?, ) .map_err(|error| to_py_core_error(&error))?; - Ok(Self { inner }) + Ok(Self { + inner: result.prepared, + prepare_diagnostics: result.diagnostics, + }) + } + + fn prepare_diagnostics_json(&self) -> PyResult { + let diagnostics = + static_redaction_diagnostics_to_binding(self.prepare_diagnostics.clone()); + + serde_json::to_string(&diagnostics) + .map_err(|error| to_py_serde_error(&error)) } fn redact_static_entities( @@ -95,6 +111,28 @@ impl PyPreparedSearch { serde_json::to_string(&to_binding_static_redaction_result(result)) .map_err(|error| to_py_serde_error(&error)) } + + fn redact_static_entities_diagnostics_json( + &self, + full_text: &str, + operators_json: Option<&str>, + ) -> PyResult { + let operators = parse_operator_config(operators_json)?; + let mut result = self + .inner + .redact_static_entities_with_diagnostics( + full_text, + &operator_config_from_binding(operators) + .map_err(|error| to_py_contract_error(&error))?, + ) + .map_err(|error| to_py_core_error(&error))?; + let mut diagnostics = self.prepare_diagnostics.clone(); + diagnostics.extend(result.diagnostics); + result.diagnostics = diagnostics; + let result = static_redaction_diagnostic_result_to_binding(result); + + serde_json::to_string(&result).map_err(|error| to_py_serde_error(&error)) + } } #[pyfunction] @@ -107,6 +145,16 @@ fn redact_static_entities_json( prepared.redact_static_entities_json(full_text, operators_json) } +#[pyfunction] +fn redact_static_entities_diagnostics_json( + config_json: &str, + full_text: &str, + operators_json: Option<&str>, +) -> PyResult { + let prepared = PyPreparedSearch::new(config_json)?; + prepared.redact_static_entities_diagnostics_json(full_text, operators_json) +} + #[pyfunction] fn normalize_for_search(text: &str) -> String { stella_anonymize_core::normalize_for_search(text) @@ -267,6 +315,10 @@ fn 
stella_anonymize_core_py(module: &Bound<'_, PyModule>) -> PyResult<()> { module.add_class::()?; module .add_function(wrap_pyfunction!(redact_static_entities_json, module)?)?; + module.add_function(wrap_pyfunction!( + redact_static_entities_diagnostics_json, + module + )?)?; module.add_function(wrap_pyfunction!(normalize_for_search, module)?)?; Ok(()) } diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index fa1c2323..b4add88e 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -1,5 +1,7 @@ import { spawnSync } from "node:child_process"; +import { createRequire } from "node:module"; import { + copyFileSync, existsSync, mkdirSync, mkdtempSync, @@ -28,6 +30,11 @@ const COMPARE_BASELINE = process.env.ANONYMIZE_MIGRATION_COMPARE_BASELINE !== "0"; const REQUIRE_NATIVE_PIPELINE = process.env.ANONYMIZE_MIGRATION_REQUIRE_NATIVE_PIPELINE === "1"; +const CANDIDATE_RUNTIME = + process.env.ANONYMIZE_MIGRATION_CANDIDATE_RUNTIME ?? "typescript"; +const FAIL_ON_MISMATCH = + process.env.ANONYMIZE_MIGRATION_FAIL_ON_MISMATCH ?? + (CANDIDATE_RUNTIME === "typescript" ? 
"1" : "0"); const WARM_ITERATIONS = positiveIntegerEnv( "ANONYMIZE_MIGRATION_WARM_ITERATIONS", 2, @@ -66,6 +73,7 @@ async function runCoordinator() { sourceRoot: ROOT_DIR, fixtures, tempRoot, + runtime: CANDIDATE_RUNTIME, }); printVariantSummary(candidate); @@ -81,7 +89,7 @@ async function runCoordinator() { if (baseline !== null) { const comparison = compareSnapshots(baseline, candidate); console.log(JSON.stringify(comparison)); - if (!comparison.equal) { + if (!comparison.equal && FAIL_ON_MISMATCH !== "0") { throw new Error( `Fixture parity failed for ${comparison.mismatches.length} fixture(s)`, ); @@ -92,7 +100,18 @@ async function runCoordinator() { } } -function runVariant({ name, sourceRoot, fixtures, tempRoot }) { +function runVariant({ + name, + sourceRoot, + fixtures, + tempRoot, + runtime = "typescript", +}) { + validateRuntime(runtime); + if (runtime === "native-static") { + ensureNativeAdapterBuilt(); + } + const resultPath = join( tempRoot, `${name.replaceAll(/[^a-zA-Z0-9_.-]/g, "_")}.json`, @@ -104,6 +123,7 @@ function runVariant({ name, sourceRoot, fixtures, tempRoot }) { ANONYMIZE_MIGRATION_WORKER: "1", ANONYMIZE_MIGRATION_SOURCE_ROOT: sourceRoot, ANONYMIZE_MIGRATION_VARIANT: name, + ANONYMIZE_MIGRATION_RUNTIME: runtime, ANONYMIZE_MIGRATION_FIXTURES_DIR: FIXTURES_DIR, ANONYMIZE_MIGRATION_FIXTURES: JSON.stringify(fixtures), ANONYMIZE_MIGRATION_RESULT_PATH: resultPath, @@ -131,8 +151,10 @@ function runVariant({ name, sourceRoot, fixtures, tempRoot }) { async function runWorker() { const sourceRoot = requiredEnv("ANONYMIZE_MIGRATION_SOURCE_ROOT"); const variant = requiredEnv("ANONYMIZE_MIGRATION_VARIANT"); + const runtime = requiredEnv("ANONYMIZE_MIGRATION_RUNTIME"); const resultPath = requiredEnv("ANONYMIZE_MIGRATION_RESULT_PATH"); const fixtures = JSON.parse(requiredEnv("ANONYMIZE_MIGRATION_FIXTURES")); + validateRuntime(runtime); const importStart = Bun.nanoseconds(); const [indexModule, configModule, dictionaryModule] = await Promise.all([ @@ 
-163,24 +185,35 @@ async function runWorker() { const prepareStart = Bun.nanoseconds(); const search = await indexModule.preparePipelineSearch({ config, context }); const prepareMs = elapsedMs(prepareStart); - const nativeRewrite = describeNativeRewrite(config, search); + const nativeRewrite = describeNativeRewrite(config, search, runtime); - const coldRun = await runFixtureSweep({ - indexModule, - config, - context, - fixtures, - }); + const runtimeRunner = + runtime === "native-static" + ? createNativeStaticRunner(search.nativeStaticConfig) + : null; + const nativePrepareMs = runtimeRunner?.prepareMs ?? 0; + + const coldRun = + runtimeRunner === null + ? await runTypeScriptFixtureSweep({ + indexModule, + config, + context, + fixtures, + }) + : runNativeStaticFixtureSweep({ runner: runtimeRunner, fixtures }); const warmRuns = []; for (let index = 0; index < WARM_ITERATIONS; index += 1) { warmRuns.push( - await runFixtureSweep({ - indexModule, - config, - context, - fixtures, - }), + runtimeRunner === null + ? await runTypeScriptFixtureSweep({ + indexModule, + config, + context, + fixtures, + }) + : runNativeStaticFixtureSweep({ runner: runtimeRunner, fixtures }), ); } @@ -188,6 +221,10 @@ async function runWorker() { const warmAvgMs = WARM_ITERATIONS === 0 ? 0 : roundMs(warmRunMs / WARM_ITERATIONS); const fixtureTimings = summarizeFixtureTimings(coldRun, warmRuns); + const nativeDiagnostics = + runtimeRunner === null + ? 
null + : collectNativeDiagnostics({ runner: runtimeRunner, fixtures }); const snapshots = Object.fromEntries( coldRun.fixtures.map((fixture) => [fixture.fixture, fixture.snapshot]), ); @@ -197,7 +234,7 @@ async function runWorker() { `${JSON.stringify({ event: "fixture-migration-variant", variant, - pipelineRuntime: "typescript", + pipelineRuntime: runtime, nativeRewrite, fixtureCount: fixtures.length, warmIterations: WARM_ITERATIONS, @@ -205,13 +242,19 @@ async function runWorker() { importMs, dictionaryMs, prepareMs, + nativePrepareMs, coldRunMs: coldRun.ms, - coldPipelineMs: roundMs(dictionaryMs + prepareMs + coldRun.ms), - coldTotalMs: roundMs(importMs + dictionaryMs + prepareMs + coldRun.ms), + coldPipelineMs: roundMs( + dictionaryMs + prepareMs + nativePrepareMs + coldRun.ms, + ), + coldTotalMs: roundMs( + importMs + dictionaryMs + prepareMs + nativePrepareMs + coldRun.ms, + ), warmRunMsByIteration: warmRuns.map((run) => run.ms), warmRunMs, warmAvgMs, }, + nativeDiagnostics, fixtureTimings, fixtures: coldRun.fixtures.map( ({ fixture, ms, entityCount, redactedTextLength }) => ({ @@ -226,7 +269,12 @@ async function runWorker() { ); } -async function runFixtureSweep({ indexModule, config, context, fixtures }) { +async function runTypeScriptFixtureSweep({ + indexModule, + config, + context, + fixtures, +}) { const sweepStart = Bun.nanoseconds(); const results = []; @@ -256,6 +304,140 @@ async function runFixtureSweep({ indexModule, config, context, fixtures }) { }; } +function runNativeStaticFixtureSweep({ runner, fixtures }) { + const sweepStart = Bun.nanoseconds(); + const results = []; + + for (const fixturePath of fixtures) { + const fullText = readFileSync(fixturePath, "utf8").replaceAll("\r\n", "\n"); + const start = Bun.nanoseconds(); + const result = runner.prepared.redactStaticEntities(fullText, undefined); + const ms = elapsedMs(start); + const snapshot = toNativeSnapshot(result); + results.push({ + fixture: relative(FIXTURES_DIR, fixturePath), + ms, + 
entityCount: snapshot.entityCount, + redactedTextLength: snapshot.redactedText.length, + snapshot, + }); + } + + return { + ms: elapsedMs(sweepStart), + fixtures: results, + }; +} + +function collectNativeDiagnostics({ runner, fixtures }) { + const fixtureDiagnostics = []; + + for (const fixturePath of fixtures) { + const fullText = readFileSync(fixturePath, "utf8").replaceAll("\r\n", "\n"); + const report = JSON.parse( + runner.prepared.redactStaticEntitiesDiagnosticsJson(fullText, undefined), + ); + fixtureDiagnostics.push({ + fixture: relative(FIXTURES_DIR, fixturePath), + stages: diagnosticStageSummaries(report.diagnostics.events), + }); + } + + return { + prepare: { + stages: diagnosticStageSummaries(runner.prepareDiagnostics.events), + topStages: topDiagnosticStages( + diagnosticStageSummaries(runner.prepareDiagnostics.events), + ), + }, + run: summarizeFixtureDiagnostics(fixtureDiagnostics), + }; +} + +function summarizeFixtureDiagnostics(fixtureDiagnostics) { + const stageBuckets = new Map(); + const byFixture = []; + + for (const fixture of fixtureDiagnostics) { + let fixtureElapsedMs = 0; + for (const stage of fixture.stages) { + fixtureElapsedMs += stage.elapsedMs ?? 0; + const bucket = stageBuckets.get(stage.stage) ?? { + stage: stage.stage, + elapsedMs: [], + count: 0, + }; + if (typeof stage.elapsedMs === "number") { + bucket.elapsedMs.push(stage.elapsedMs); + } + bucket.count += stage.count ?? 0; + stageBuckets.set(stage.stage, bucket); + } + byFixture.push({ + fixture: fixture.fixture, + elapsedMs: roundMs(fixtureElapsedMs), + topStages: topDiagnosticStages(fixture.stages).slice(0, 5), + }); + } + + const stages = [...stageBuckets.values()] + .map((bucket) => ({ + stage: bucket.stage, + calls: bucket.elapsedMs.length, + totalMs: roundMs(bucket.elapsedMs.reduce((sum, ms) => sum + ms, 0)), + avgMs: + bucket.elapsedMs.length === 0 + ? 
0 + : roundMs( + bucket.elapsedMs.reduce((sum, ms) => sum + ms, 0) / + bucket.elapsedMs.length, + ), + p50Ms: percentile( + bucket.elapsedMs.toSorted((a, b) => a - b), + 0.5, + ), + p95Ms: percentile( + bucket.elapsedMs.toSorted((a, b) => a - b), + 0.95, + ), + maxMs: percentile( + bucket.elapsedMs.toSorted((a, b) => a - b), + 1, + ), + count: bucket.count, + })) + .sort((left, right) => right.totalMs - left.totalMs); + + return { + stages, + topStages: stages.slice(0, 10), + topFixtures: byFixture + .toSorted((left, right) => right.elapsedMs - left.elapsedMs) + .slice(0, 10), + byFixture, + }; +} + +function diagnosticStageSummaries(events) { + return events + .filter((event) => event.kind === "stage-summary") + .map((event) => ({ + stage: event.stage, + count: event.count ?? 0, + elapsedMs: + typeof event.elapsed_us === "number" + ? roundMs(event.elapsed_us / 1_000) + : null, + inputBytes: event.input_bytes ?? null, + })); +} + +function topDiagnosticStages(stages) { + return stages + .filter((stage) => typeof stage.elapsedMs === "number") + .toSorted((left, right) => right.elapsedMs - left.elapsedMs); +} + function toSnapshot(indexModule, fullText, entities, context) { const sorted = entities.toSorted( (left, right) => @@ -285,6 +467,33 @@ function toSnapshot(indexModule, fullText, entities, context) { }; } +function toNativeSnapshot(result) { + const entities = result.resolvedEntities.toSorted( + (left, right) => + left.start - right.start || + left.end - right.end || + left.label.localeCompare(right.label) || + left.text.localeCompare(right.text), + ); + const counts = {}; + for (const entity of entities) { + counts[entity.label] = (counts[entity.label] ?? 
0) + 1; + } + + return { + entityCount: entities.length, + counts, + entities: entities.map(({ start, end, label, text, source }) => ({ + start, + end, + label, + text, + source, + })), + redactedText: result.redaction.redactedText, + }; +} + function compareSnapshots(baseline, candidate) { const mismatches = []; const fixtureNames = new Set([ @@ -384,13 +593,14 @@ function printVariantSummary(result) { fixtureCount: result.fixtureCount, warmIterations: result.warmIterations, timings: result.timings, + nativeDiagnostics: result.nativeDiagnostics, fixtureTimings: result.fixtureTimings, fixtures: result.fixtures, }), ); } -function describeNativeRewrite(config, search) { +function describeNativeRewrite(config, search, runtime) { const sliceLengths = Object.fromEntries( Object.entries(search.slices).map(([name, slice]) => [ name, @@ -399,22 +609,22 @@ function describeNativeRewrite(config, search) { ); const regexValidationSlots = countRegexValidationSlots(search.regexMeta); const denyListSourceCounts = countDenyListSources(search.denyListData); + const nativeStaticConfig = search.nativeStaticConfig; const unsupportedSearchSlots = [ unsupportedSlot("regex", regexValidationSlots, "regex validators"), unsupportedSlot("triggers", sliceLengths.triggers, "trigger extraction"), unsupportedSlot("streetTypes", sliceLengths.streetTypes, "address seeds"), - unsupportedSlot( - "denyList", - denyListSourceCounts.curated, - "curated deny-list semantics", - ), ].filter((slot) => slot.count > 0); - const supportedSearchSlots = - Math.max(0, sliceLengths.regex - regexValidationSlots) + - sliceLengths.customRegex + - denyListSourceCounts.customOnly + - sliceLengths.gazetteer + - sliceLengths.countries; + const supportedSearchSlots = nativeStaticConfig + ? 
nativeStaticConfig.regex_patterns.length + + nativeStaticConfig.custom_regex_patterns.length + + nativeStaticConfig.literal_patterns.length + : Math.max(0, sliceLengths.regex - regexValidationSlots) + + sliceLengths.customRegex + + denyListSourceCounts.customOnly + + denyListSourceCounts.curated + + sliceLengths.gazetteer + + sliceLengths.countries; const totalSearchSlots = Object.values(sliceLengths).reduce( (sum, length) => sum + length, 0, @@ -426,8 +636,8 @@ function describeNativeRewrite(config, search) { ); return { - measuredInPipeline: false, - pipelineRuntime: "typescript", + measuredInPipeline: runtime === "native-static", + pipelineRuntime: runtime, fullPipelineNativeEligible: unsupportedSearchSlots.length === 0 && unsupportedPipelineStages.length === 0, @@ -457,11 +667,10 @@ function describeUnsupportedPipelineStages( if (config.enableTriggerPhrases) { stages.push("triggers"); } - if (config.enableDenyList && denyListSourceCounts.curated > 0) { - stages.push("curated-deny-list"); - } if (config.enableNameCorpus) { - stages.push("name-corpus"); + stages.push( + config.enableDenyList ? 
"name-corpus-supplemental" : "name-corpus", + ); } if (config.enableHotwordRules) { stages.push("hotword-rules"); @@ -635,6 +844,157 @@ function materializeGitRef(ref, tempRoot) { return outputDir; } +function createNativeStaticRunner(nativeStaticConfig) { + if (!nativeStaticConfig) { + throw new Error("Native static runtime requires nativeStaticConfig"); + } + + const native = loadNativeAdapter(); + const prepareStart = Bun.nanoseconds(); + const prepared = new native.NativePreparedSearch( + toNapiConfig(nativeStaticConfig), + ); + const prepareMs = elapsedMs(prepareStart); + const prepareDiagnostics = JSON.parse(prepared.prepareDiagnosticsJson()); + return { + prepared, + prepareDiagnostics, + prepareMs, + }; +} + +function loadNativeAdapter() { + const tempDir = mkdtempSync(join(tmpdir(), "stella-anonymize-fixture-napi-")); + const napiPath = join(tempDir, "stella_anonymize_napi.node"); + copyFileSync(nativeLibraryPath("stella_anonymize_napi"), napiPath); + const loaded = createRequire(import.meta.url)(napiPath); + const NativePreparedSearch = Reflect.get( + Object(loaded), + "NativePreparedSearch", + ); + if (typeof NativePreparedSearch !== "function") { + throw new TypeError("Native anonymize adapter exports are incomplete"); + } + return { NativePreparedSearch }; +} + +function toNapiConfig(config) { + return { + regexPatterns: config.regex_patterns.map(toNapiPattern), + customRegexPatterns: config.custom_regex_patterns.map(toNapiPattern), + literalPatterns: config.literal_patterns.map(toNapiPattern), + regexOptions: toNapiOptions(config.regex_options), + customRegexOptions: toNapiOptions(config.custom_regex_options), + literalOptions: toNapiOptions(config.literal_options), + slices: { + regex: config.slices.regex, + customRegex: config.slices.custom_regex, + legalForms: config.slices.legal_forms, + triggers: config.slices.triggers, + denyList: config.slices.deny_list, + streetTypes: config.slices.street_types, + gazetteer: config.slices.gazetteer, + 
countries: config.slices.countries, + }, + regexMeta: config.regex_meta.map(toNapiRegexMeta), + customRegexMeta: config.custom_regex_meta.map(toNapiRegexMeta), + denyListData: + config.deny_list_data === undefined + ? undefined + : { + labels: config.deny_list_data.labels, + customLabels: config.deny_list_data.custom_labels, + originals: config.deny_list_data.originals, + sources: config.deny_list_data.sources, + filters: + config.deny_list_data.filters === undefined + ? undefined + : toNapiDenyListFilters(config.deny_list_data.filters), + }, + gazetteerData: + config.gazetteer_data === undefined + ? undefined + : { + labels: config.gazetteer_data.labels, + isFuzzy: config.gazetteer_data.is_fuzzy, + }, + countryData: config.country_data, + }; +} + +function toNapiPattern(pattern) { + return { + kind: pattern.kind, + pattern: pattern.pattern, + distance: pattern.distance, + caseInsensitive: pattern.case_insensitive, + wholeWords: pattern.whole_words, + lazy: pattern.lazy, + prefilterAny: pattern.prefilter_any, + prefilterCaseInsensitive: pattern.prefilter_case_insensitive, + prefilterRegex: pattern.prefilter_regex, + }; +} + +function toNapiOptions(options) { + if (options === undefined) { + return undefined; + } + return { + literalCaseInsensitive: options.literal_case_insensitive, + literalWholeWords: options.literal_whole_words, + regexWholeWords: options.regex_whole_words, + fuzzyCaseInsensitive: options.fuzzy_case_insensitive, + fuzzyWholeWords: options.fuzzy_whole_words, + fuzzyNormalizeDiacritics: options.fuzzy_normalize_diacritics, + }; +} + +function toNapiRegexMeta(meta) { + return { + label: meta.label, + score: meta.score, + sourceDetail: meta.source_detail, + requiresValidation: meta.requires_validation, + minByteLength: meta.min_byte_length, + }; +} + +function toNapiDenyListFilters(filters) { + return { + stopwords: filters.stopwords, + allowList: filters.allow_list, + personStopwords: filters.person_stopwords, + addressStopwords: 
filters.address_stopwords, + streetTypes: filters.street_types, + firstNames: filters.first_names, + genericRoles: filters.generic_roles, + sentenceStarters: filters.sentence_starters, + trailingAddressWordExclusions: filters.trailing_address_word_exclusions, + definedTermCues: filters.defined_term_cues, + }; +} + +function nativeLibraryPath(name) { + if (process.platform === "darwin") { + return join(ROOT_DIR, "target", "release", `lib${name}.dylib`); + } + if (process.platform === "linux") { + return join(ROOT_DIR, "target", "release", `lib${name}.so`); + } + return join(ROOT_DIR, "target", "release", `${name}.dll`); +} + +function ensureNativeAdapterBuilt() { + runCommand("cargo", [ + "build", + "-p", + "stella-anonymize-napi", + "--release", + "--locked", + ]); +} + function runCommand(command, args) { const result = spawnSync(command, args, { cwd: ROOT_DIR, @@ -646,6 +1006,15 @@ function runCommand(command, args) { } } +function validateRuntime(runtime) { + if (runtime === "typescript" || runtime === "native-static") { + return; + } + throw new Error( + `ANONYMIZE_MIGRATION_CANDIDATE_RUNTIME must be typescript or native-static, got ${runtime}`, + ); +} + function importSource(sourceRoot, relativePath, variant) { const path = join(sourceRoot, relativePath); if (!existsSync(path)) { diff --git a/packages/anonymize/scripts/native-adapter-perf.mjs b/packages/anonymize/scripts/native-adapter-perf.mjs index 0fe6e30e..ff941104 100644 --- a/packages/anonymize/scripts/native-adapter-perf.mjs +++ b/packages/anonymize/scripts/native-adapter-perf.mjs @@ -17,6 +17,12 @@ const configJson = JSON.stringify({ case_insensitive: true, whole_words: true, }, + { + kind: "literal-with-options", + pattern: "Prague", + case_insensitive: true, + whole_words: true, + }, { kind: "literal-with-options", pattern: "Acme", @@ -43,19 +49,31 @@ const configJson = JSON.stringify({ slices: { regex: { start: 0, end: 1 }, custom_regex: { start: 0, end: 1 }, - deny_list: { start: 0, end: 1 }, - 
gazetteer: { start: 1, end: 3 }, - countries: { start: 3, end: 4 }, + deny_list: { start: 0, end: 2 }, + gazetteer: { start: 2, end: 4 }, + countries: { start: 4, end: 5 }, }, regex_meta: [{ label: "registration number", score: 0.9 }], custom_regex_meta: [ { label: "matter id", score: 1, source_detail: "custom-regex" }, ], deny_list_data: { - labels: [["matter"]], - custom_labels: [["matter"]], - originals: ["Secret Code"], - sources: [["custom-deny-list"]], + labels: [["matter"], ["address"]], + custom_labels: [["matter"], []], + originals: ["Secret Code", "Prague"], + sources: [["custom-deny-list"], ["city"]], + filters: { + stopwords: [], + allow_list: [], + person_stopwords: [], + address_stopwords: [], + street_types: [], + first_names: [], + generic_roles: [], + sentence_starters: [], + trailing_address_word_exclusions: [], + defined_term_cues: [], + }, }, gazetteer_data: { labels: ["organization", "address"], @@ -186,7 +204,7 @@ function buildCases() { fixtureCases.push({ text: `Reference ${registration} for Acme s.r.o. near ` + - `${place}, Turkey, matter ${matter}, code Secret Code.`, + `${place}, Turkey, Prague, matter ${matter}, code Secret Code.`, operatorsJson: operators[index % operators.length], }); } @@ -222,6 +240,10 @@ function toNapiConfig(config) { customLabels: config.deny_list_data.custom_labels, originals: config.deny_list_data.originals, sources: config.deny_list_data.sources, + filters: + config.deny_list_data.filters === undefined + ? 
undefined + : toNapiDenyListFilters(config.deny_list_data.filters), }, gazetteerData: config.gazetteer_data === undefined @@ -241,6 +263,10 @@ function toNapiPattern(pattern) { distance: pattern.distance, caseInsensitive: pattern.case_insensitive, wholeWords: pattern.whole_words, + lazy: pattern.lazy, + prefilterAny: pattern.prefilter_any, + prefilterCaseInsensitive: pattern.prefilter_case_insensitive, + prefilterRegex: pattern.prefilter_regex, }; } @@ -264,6 +290,22 @@ function toNapiRegexMeta(meta) { score: meta.score, sourceDetail: meta.source_detail, requiresValidation: meta.requires_validation, + minByteLength: meta.min_byte_length, + }; +} + +function toNapiDenyListFilters(filters) { + return { + stopwords: filters.stopwords, + allowList: filters.allow_list, + personStopwords: filters.person_stopwords, + addressStopwords: filters.address_stopwords, + streetTypes: filters.street_types, + firstNames: filters.first_names, + genericRoles: filters.generic_roles, + sentenceStarters: filters.sentence_starters, + trailingAddressWordExclusions: filters.trailing_address_word_exclusions, + definedTermCues: filters.defined_term_cues, }; } diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index fc7e038c..e19b73c0 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -15,6 +15,11 @@ type NativeAdapter = { fullText: string, operatorsJson?: string, ) => string; + redactStaticEntitiesDiagnosticsJson: ( + configJson: string, + fullText: string, + operatorsJson?: string, + ) => string; }; type RedactionEntry = { @@ -43,6 +48,30 @@ type StaticRedactionResult = { }; }; +type StaticRedactionDiagnosticResult = { + result: StaticRedactionResult; + diagnostics: { + events: Array<{ + stage: string; + kind: string; + count?: number; + engine?: string; + pattern?: number; + source?: string; + source_detail?: 
string; + label?: string; + start?: number; + end?: number; + text?: string; + score?: number; + span_valid?: boolean; + elapsed_us?: number; + input_bytes?: number; + reason?: string; + }>; + }; +}; + type GeneratedNativeCase = { text: string; operators: Record | null; @@ -61,6 +90,12 @@ const CONFIG_JSON = JSON.stringify({ case_insensitive: true, whole_words: true, }, + { + kind: "literal-with-options", + pattern: "Prague", + case_insensitive: true, + whole_words: true, + }, { kind: "literal-with-options", pattern: "Acme", @@ -87,19 +122,31 @@ const CONFIG_JSON = JSON.stringify({ slices: { regex: { start: 0, end: 1 }, custom_regex: { start: 0, end: 1 }, - deny_list: { start: 0, end: 1 }, - gazetteer: { start: 1, end: 3 }, - countries: { start: 3, end: 4 }, + deny_list: { start: 0, end: 2 }, + gazetteer: { start: 2, end: 4 }, + countries: { start: 4, end: 5 }, }, regex_meta: [{ label: "registration number", score: 0.9 }], custom_regex_meta: [ { label: "matter id", score: 1, source_detail: "custom-regex" }, ], deny_list_data: { - labels: [["matter"]], - custom_labels: [["matter"]], - originals: ["Secret Code"], - sources: [["custom-deny-list"]], + labels: [["matter"], ["address"]], + custom_labels: [["matter"], []], + originals: ["Secret Code", "Prague"], + sources: [["custom-deny-list"], ["city"]], + filters: { + stopwords: [], + allow_list: [], + person_stopwords: [], + address_stopwords: [], + street_types: [], + first_names: [], + generic_roles: [], + sentence_starters: [], + trailing_address_word_exclusions: [], + defined_term_cues: [], + }, }, gazetteer_data: { labels: ["organization", "address"], @@ -212,7 +259,7 @@ const generatedCaseArb: fc.Arbitrary = fc ({ left, middle, right, registration, matter, fuzzyPlace, operators }) => { const text = `${left}Reference ${registration} for Acme s.r.o. 
near ` + - `${fuzzyPlace}, Turkey, matter ${matter}, code Secret Code.` + + `${fuzzyPlace}, Turkey, Prague, matter ${matter}, code Secret Code.` + `${middle}${right}`; return { text, @@ -222,6 +269,7 @@ const generatedCaseArb: fc.Arbitrary = fc "Acme s.r.o.", fuzzyPlace, "Turkey", + "Prague", matter, "Secret Code", ], @@ -259,7 +307,7 @@ describe("native adapter parity", () => { for (const [index, item] of cases.entries()) { const result = tsResults.at(index); expect(result).toBeDefined(); - expect(result?.redaction.entity_count).toBe(6); + expect(result?.redaction.entity_count).toBe(7); for (const value of item.sensitiveValues) { expect(result?.redaction.redacted_text).not.toContain(value); } @@ -269,6 +317,49 @@ describe("native adapter parity", () => { { numRuns: 5, seed: 20_260_624 }, ); }); + + test("diagnostics JSON is identical through TS and Python adapters", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const operators = { country: "redact" }; + + const tsResult = runTsDiagnosticsAdapter(adapters.native, text, operators); + const pyResult = callPythonDiagnostics( + adapters.pythonModulePath, + text, + operators, + ); + + expect(stripDiagnosticTimings(pyResult)).toEqual( + stripDiagnosticTimings(tsResult), + ); + expect( + tsResult.diagnostics.events.some( + (event) => + event.kind === "stage-summary" && + typeof event.elapsed_us === "number", + ), + ).toBe(true); + expect( + tsResult.diagnostics.events.some( + (event) => + event.stage === "search.literal" && + event.kind === "stage-summary" && + typeof event.count === "number" && + event.count > 0, + ), + ).toBe(true); + expect( + tsResult.diagnostics.events.some( + (event) => + event.stage === "resolution.sanitize" && + event.kind === "entity" && + event.span_valid === true, + ), + ).toBe(true); + }); }); const getAdapters = () => { @@ -314,13 +405,22 @@ const loadNativeAdapter = 
(nativePath: string): NativeAdapter => { Object(loaded), "redactStaticEntitiesJson", ); + const redactStaticEntitiesDiagnosticsJson = Reflect.get( + Object(loaded), + "redactStaticEntitiesDiagnosticsJson", + ); if ( typeof normalizeForSearch !== "function" || - typeof redactStaticEntitiesJson !== "function" + typeof redactStaticEntitiesJson !== "function" || + typeof redactStaticEntitiesDiagnosticsJson !== "function" ) { throw new TypeError("Native anonymize adapter exports are incomplete"); } - return { normalizeForSearch, redactStaticEntitiesJson }; + return { + normalizeForSearch, + redactStaticEntitiesJson, + redactStaticEntitiesDiagnosticsJson, + }; }; const runTsAdapter = ( @@ -334,6 +434,21 @@ const runTsAdapter = ( ); }; +const runTsDiagnosticsAdapter = ( + adapter: NativeAdapter, + text: string, + operators: Record | null, +): StaticRedactionDiagnosticResult => { + const operatorsJson = operatorConfigJson(operators); + return JSON.parse( + adapter.redactStaticEntitiesDiagnosticsJson( + CONFIG_JSON, + text, + operatorsJson, + ), + ); +}; + const runPythonAdapters = ( pythonModulePath: string, cases: GeneratedNativeCase[], @@ -398,6 +513,74 @@ print(module.normalize_for_search(payload["text"])) } }; +const callPythonDiagnostics = ( + pythonModulePath: string, + text: string, + operators: Record | null, +): StaticRedactionDiagnosticResult => { + const payloadDir = mkdtempSync( + join(tmpdir(), "stella-anonymize-diagnostics-"), + ); + const payloadPath = join(payloadDir, "payload.json"); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + text, + operators_json: operatorConfigJson(operators), + }), + ); + try { + const output = runCommand( + "python3", + [ + "-c", + ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +spec = importlib.util.spec_from_file_location( + 
"stella_anonymize_core_py", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +print( + module.redact_static_entities_diagnostics_json( + payload["config_json"], + payload["text"], + payload.get("operators_json"), + ) +) +`, + ], + { + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }, + ); + return JSON.parse(output); + } finally { + rmSync(payloadDir, { recursive: true, force: true }); + } +}; + +const stripDiagnosticTimings = ( + result: StaticRedactionDiagnosticResult, +): StaticRedactionDiagnosticResult => ({ + result: result.result, + diagnostics: { + events: result.diagnostics.events.map( + ({ elapsed_us: _elapsedUs, ...event }) => event, + ), + }, +}); + const operatorConfigJson = ( operators: Record | null, ): string | undefined => { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 82d5a31c..59efce96 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -57,6 +57,86 @@ type PatternSlice = { end: number; }; +type NativeSearchPatternKind = + | "literal" + | "literal-with-options" + | "regex" + | "fuzzy"; + +export type NativeSearchPattern = { + kind: NativeSearchPatternKind; + pattern: string; + distance?: number; + case_insensitive?: boolean; + whole_words?: boolean; + lazy?: boolean; + prefilter_any?: string[]; + prefilter_case_insensitive?: boolean; + prefilter_regex?: string; +}; + +export type NativeSearchOptions = { + literal_case_insensitive?: boolean; + literal_whole_words?: boolean; + regex_whole_words?: boolean; + fuzzy_case_insensitive?: boolean; + fuzzy_whole_words?: boolean; + fuzzy_normalize_diacritics?: boolean; +}; + +export type NativeRegexMatchMeta = { + label: string; + score: number; + source_detail?: string; + requires_validation?: boolean; + min_byte_length?: number; +}; + 
+export type NativeDenyListFilterData = { + stopwords: string[]; + allow_list: string[]; + person_stopwords: string[]; + address_stopwords: string[]; + street_types: string[]; + first_names: string[]; + generic_roles: string[]; + sentence_starters: string[]; + trailing_address_word_exclusions: string[]; + defined_term_cues: string[]; +}; + +export type NativeDenyListMatchData = { + labels: string[][]; + custom_labels: string[][]; + originals: string[]; + sources: string[][]; + filters?: NativeDenyListFilterData; +}; + +export type NativePreparedSearchConfig = { + regex_patterns: NativeSearchPattern[]; + custom_regex_patterns: NativeSearchPattern[]; + literal_patterns: NativeSearchPattern[]; + regex_options: NativeSearchOptions; + custom_regex_options: NativeSearchOptions; + literal_options: NativeSearchOptions; + slices: { + regex: PatternSlice; + custom_regex: PatternSlice; + legal_forms?: PatternSlice; + triggers?: PatternSlice; + deny_list: PatternSlice; + street_types?: PatternSlice; + gazetteer: PatternSlice; + countries: PatternSlice; + }; + regex_meta: NativeRegexMatchMeta[]; + custom_regex_meta: NativeRegexMatchMeta[]; + deny_list_data?: NativeDenyListMatchData; + gazetteer_data?: GazetteerData; + country_data?: CountryData; +}; + const createAllowedLabelSet = ( labels: readonly string[], ): ReadonlySet | null => (labels.length > 0 ? new Set(labels) : null); @@ -99,6 +179,7 @@ export type UnifiedSearchInstance = { denyListData: DenyListData | null; gazetteerData: GazetteerData | null; countryData: CountryData | null; + nativeStaticConfig: NativePreparedSearchConfig; }; export const buildUnifiedSearch = async ( @@ -354,6 +435,19 @@ export const buildUnifiedSearch = async ( }) : new (getTextSearch())([]); + const nativeStaticConfig = buildNativeStaticConfig({ + regexPatterns: allRegex, + regexMeta, + customRegexes, + customRegexMeta, + denyListData, + gazetteerPatterns: gazResult?.patterns ?? [], + gazetteerData: gazResult?.data ?? 
null, + countryPatterns: countryResult?.patterns ?? [], + countryData: countryResult?.data ?? null, + customDenyListNeedsWholeWords, + }); + return { tsRegex, tsCustomRegex, @@ -374,5 +468,270 @@ export const buildUnifiedSearch = async ( denyListData, gazetteerData: gazResult?.data ?? null, countryData: countryResult?.data ?? null, + nativeStaticConfig, + }; +}; + +type BuildNativeStaticConfigArgs = { + regexPatterns: readonly PatternEntry[]; + regexMeta: readonly RegexMeta[]; + customRegexes: readonly { pattern: string }[]; + customRegexMeta: readonly RegexMeta[]; + denyListData: DenyListData | null; + gazetteerPatterns: readonly PatternEntry[]; + gazetteerData: GazetteerData | null; + countryPatterns: readonly PatternEntry[]; + countryData: CountryData | null; + customDenyListNeedsWholeWords: (pattern: string) => boolean; +}; + +const buildNativeStaticConfig = ({ + regexPatterns, + regexMeta, + customRegexes, + customRegexMeta, + denyListData, + gazetteerPatterns, + gazetteerData, + countryPatterns, + countryData, + customDenyListNeedsWholeWords, +}: BuildNativeStaticConfigArgs): NativePreparedSearchConfig => { + const nativeRegexPatterns: NativeSearchPattern[] = []; + const nativeRegexMeta: NativeRegexMatchMeta[] = []; + for (const [index, pattern] of regexPatterns.entries()) { + const meta = regexMeta[index]; + if (!meta || meta.validator) { + continue; + } + nativeRegexPatterns.push(toNativeRegexPattern(pattern)); + nativeRegexMeta.push(toNativeRegexMeta(meta)); + } + + const nativeCustomRegexPatterns = customRegexes.map((entry) => ({ + kind: "regex" as const, + pattern: entry.pattern, + })); + const nativeCustomRegexMeta = customRegexMeta.map(toNativeRegexMeta); + + const denyPatterns = + denyListData?.originals.map((pattern, index) => + toNativeDenyListPattern( + pattern, + stringArrayValue(denyListData.sources[index]).includes( + "custom-deny-list", + ) + ? customDenyListNeedsWholeWords(pattern) + : true, + ), + ) ?? 
[]; + const gazetteerNativePatterns = gazetteerPatterns.map(toNativeLiteralPattern); + const countryNativePatterns = countryPatterns.map(toNativeLiteralPattern); + + let literalOffset = 0; + const denyListSlice = { + start: literalOffset, + end: literalOffset + denyPatterns.length, }; + literalOffset = denyListSlice.end; + const gazetteerSlice = { + start: literalOffset, + end: literalOffset + gazetteerNativePatterns.length, + }; + literalOffset = gazetteerSlice.end; + const countriesSlice = { + start: literalOffset, + end: literalOffset + countryNativePatterns.length, + }; + + const nativeConfig: NativePreparedSearchConfig = { + regex_patterns: nativeRegexPatterns, + custom_regex_patterns: nativeCustomRegexPatterns, + literal_patterns: [ + ...denyPatterns, + ...gazetteerNativePatterns, + ...countryNativePatterns, + ], + regex_options: { regex_whole_words: false }, + custom_regex_options: { regex_whole_words: false }, + literal_options: { + literal_case_insensitive: true, + literal_whole_words: false, + fuzzy_case_insensitive: true, + fuzzy_whole_words: true, + fuzzy_normalize_diacritics: true, + }, + slices: { + regex: { start: 0, end: nativeRegexPatterns.length }, + custom_regex: { start: 0, end: nativeCustomRegexPatterns.length }, + legal_forms: { start: 0, end: 0 }, + triggers: { start: 0, end: 0 }, + deny_list: denyListSlice, + street_types: { start: 0, end: 0 }, + gazetteer: gazetteerSlice, + countries: countriesSlice, + }, + regex_meta: nativeRegexMeta, + custom_regex_meta: nativeCustomRegexMeta, + }; + if (denyListData) { + nativeConfig.deny_list_data = toNativeDenyListData(denyListData); + } + if (gazetteerData) { + nativeConfig.gazetteer_data = gazetteerData; + } + if (countryData) { + nativeConfig.country_data = countryData; + } + return nativeConfig; +}; + +const toNativeDenyListPattern = ( + pattern: string, + wholeWords: boolean, +): NativeSearchPattern => ({ + kind: "literal-with-options", + pattern, + case_insensitive: true, + whole_words: 
wholeWords, +}); + +const toNativeRegexPattern = (entry: PatternEntry): NativeSearchPattern => { + const pattern: NativeSearchPattern = { + kind: "regex", + pattern: patternEntryText(entry), + }; + if ( + typeof entry === "string" || + entry instanceof RegExp || + entry.pattern instanceof RegExp + ) { + return pattern; + } + + const regexEntry = entry as { + lazy?: boolean; + prefilterAny?: readonly string[]; + prefilterCaseInsensitive?: boolean; + prefilterRegex?: RegExp; + }; + if (regexEntry.lazy !== undefined) { + pattern.lazy = regexEntry.lazy; + } + if (regexEntry.prefilterAny !== undefined) { + pattern.prefilter_any = [...regexEntry.prefilterAny]; + } + if (regexEntry.prefilterCaseInsensitive !== undefined) { + pattern.prefilter_case_insensitive = regexEntry.prefilterCaseInsensitive; + } + if (regexEntry.prefilterRegex !== undefined) { + pattern.prefilter_regex = toNativeRegexSource(regexEntry.prefilterRegex); + } + return pattern; +}; + +const toNativeRegexSource = (regex: RegExp): string => + regex.ignoreCase ? 
`(?i:${regex.source})` : regex.source; + +const toNativeLiteralPattern = (entry: PatternEntry): NativeSearchPattern => { + if (typeof entry === "string") { + return { kind: "literal", pattern: entry }; + } + if (entry instanceof RegExp) { + throw new Error("Native static config does not accept RegExp objects"); + } + if (entry.pattern instanceof RegExp) { + throw new Error("Native static config does not accept RegExp entries"); + } + if ("distance" in entry) { + const pattern: NativeSearchPattern = { + kind: "fuzzy", + pattern: entry.pattern, + }; + if (entry.distance !== "auto") { + pattern.distance = entry.distance; + } + return pattern; + } + if (entry.literal === true) { + const pattern: NativeSearchPattern = { + kind: "literal-with-options", + pattern: entry.pattern, + }; + if (entry.caseInsensitive !== undefined) { + pattern.case_insensitive = entry.caseInsensitive; + } + if (entry.wholeWords !== undefined) { + pattern.whole_words = entry.wholeWords; + } + return pattern; + } + return { kind: "regex", pattern: entry.pattern }; +}; + +const patternEntryText = (entry: PatternEntry): string => { + if (typeof entry === "string") { + return entry; + } + if (entry instanceof RegExp) { + return entry.source; + } + if (entry.pattern instanceof RegExp) { + return entry.pattern.source; + } + return entry.pattern; +}; + +const toNativeRegexMeta = (meta: RegexMeta): NativeRegexMatchMeta => { + const result: NativeRegexMatchMeta = { + label: meta.label, + score: meta.score, + }; + if (meta.sourceDetail) { + result.source_detail = meta.sourceDetail; + } + if (meta.validator) { + result.requires_validation = true; + } + if (meta.minByteLength !== undefined) { + result.min_byte_length = meta.minByteLength; + } + return result; +}; + +const toNativeDenyListData = (data: DenyListData): NativeDenyListMatchData => ({ + labels: data.labels.map(stringArrayValue), + custom_labels: data.originals.map((_, index) => + stringArrayValue(data.customLabels[index]), + ), + originals: 
data.originals, + sources: data.sources.map(stringArrayValue), + filters: toNativeDenyListFilters(data.filters), +}); + +const toNativeDenyListFilters = ( + filters: DenyListData["filters"], +): NativeDenyListFilterData => ({ + stopwords: filters.stopwords, + allow_list: filters.allowList, + person_stopwords: filters.personStopwords, + address_stopwords: filters.addressStopwords, + street_types: filters.streetTypes, + first_names: filters.firstNames, + generic_roles: filters.genericRoles, + sentence_starters: filters.sentenceStarters, + trailing_address_word_exclusions: filters.trailingAddressWordExclusions, + defined_term_cues: filters.definedTermCues, +}); + +const stringArrayValue = ( + value: string | readonly string[] | undefined, +): string[] => { + if (value === undefined) { + return []; + } + if (typeof value === "string") { + return [value]; + } + return [...value]; }; diff --git a/packages/anonymize/src/data/deny-list-filters.json b/packages/anonymize/src/data/deny-list-filters.json new file mode 100644 index 00000000..fa30d32f --- /dev/null +++ b/packages/anonymize/src/data/deny-list-filters.json @@ -0,0 +1,81 @@ +{ + "cs": { + "trailingAddressWordExclusions": [ + "nájemce", + "pronajímatel", + "kupující", + "prodávající", + "objednatel", + "zhotovitel", + "dodavatel", + "odběratel", + "věřitel", + "dlužník", + "zadavatel", + "uchazeč", + "příjemce", + "plátce", + "správa", + "sekretariát", + "kancelář", + "odbor", + "oddělení", + "úřad", + "inspekce", + "agentura", + "článek", + "smlouva", + "dodatek", + "příloha", + "předmět", + "podmínky", + "ustanovení" + ] + }, + "en": { + "definedTermCues": [ + "mean", + "means", + "shall mean", + "shall means", + "shall have the meaning", + "shall have the meanings", + "refer to", + "refers to", + "has the meaning", + "has the meanings", + "is defined" + ], + "sentenceStarters": [ + "the", + "this", + "these", + "those", + "an", + "any", + "all", + "each", + "every", + "no", + "now", + "whereas", + "whereby", + 
"wherein", + "whereof", + "notwithstanding", + "subject", + "in", + "on", + "at", + "by", + "for", + "if", + "upon", + "unless", + "until", + "provided", + "pursuant", + "such" + ] + } +} diff --git a/packages/anonymize/src/detectors/deny-list.ts b/packages/anonymize/src/detectors/deny-list.ts index b1ea6075..3b396e85 100644 --- a/packages/anonymize/src/detectors/deny-list.ts +++ b/packages/anonymize/src/detectors/deny-list.ts @@ -18,9 +18,11 @@ import type { import type { PipelineContext } from "../context"; import { defaultContext } from "../context"; import { loadGenericRoles } from "../filters/false-positives"; +import { buildStreetTypePatterns } from "./address-seeds"; import { normalizeForSearch } from "../util/normalize"; import { ALL_UPPER_RE, UPPER_START_RE } from "../util/text"; import { DASH } from "../util/char-groups"; +import denyListFiltersByLanguage from "../data/deny-list-filters.json"; export type DenyListConfig = Pick< PipelineConfig, @@ -420,46 +422,73 @@ const hasAdjacentAddressEvidence = ( return streetRe !== null && streetRe.test(window); }; -/** - * Capitalised words that almost never start a person name. When a - * single-token surname candidate is immediately followed by one of - * these, the "next-word is uppercase" promotion heuristic would - * otherwise turn section headings ("Purchase Price↵The Purchaser - * undertakes…") into spurious person hits. Kept narrow on purpose; - * the surrounding pipeline still chains real names via the deny-list - * cascade when both halves are surnames. 
- */ -const SENTENCE_STARTER_WORDS: ReadonlySet = new Set([ - "The", - "This", - "These", - "Those", - "An", - "Any", - "All", - "Each", - "Every", - "No", - "Now", - "Whereas", - "Whereby", - "Wherein", - "Whereof", - "Notwithstanding", - "Subject", - "In", - "On", - "At", - "By", - "For", - "If", - "Upon", - "Unless", - "Until", - "Provided", - "Pursuant", - "Such", -]); +type DenyListLanguageFilters = { + sentenceStarters?: readonly string[]; + trailingAddressWordExclusions?: readonly string[]; + definedTermCues?: readonly string[]; +}; + +export type DenyListFilterData = { + stopwords: string[]; + allowList: string[]; + personStopwords: string[]; + addressStopwords: string[]; + streetTypes: string[]; + firstNames: string[]; + genericRoles: string[]; + sentenceStarters: string[]; + trailingAddressWordExclusions: string[]; + definedTermCues: string[]; +}; + +const DENY_LIST_FILTER_GROUPS: readonly DenyListLanguageFilters[] = + Object.values(denyListFiltersByLanguage); + +const lowerSortedUnique = (values: Iterable): string[] => + [...new Set([...values].map((value) => value.toLowerCase()))].toSorted(); + +const escapeRegExp = (value: string): string => + value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + +const collectLanguageFilterValues = ( + selector: (filters: DenyListLanguageFilters) => readonly string[] | undefined, +): string[] => + lowerSortedUnique( + DENY_LIST_FILTER_GROUPS.flatMap((filters) => selector(filters) ?? 
[]), + ); + +const DENY_LIST_STATIC_FILTERS = { + definedTermCues: collectLanguageFilterValues( + (filters) => filters.definedTermCues, + ), + sentenceStarters: collectLanguageFilterValues( + (filters) => filters.sentenceStarters, + ), + trailingAddressWordExclusions: collectLanguageFilterValues( + (filters) => filters.trailingAddressWordExclusions, + ), +}; + +const SENTENCE_STARTER_WORDS: ReadonlySet = new Set( + DENY_LIST_STATIC_FILTERS.sentenceStarters, +); +const TRAILING_ADDRESS_WORD_EXCLUSIONS: ReadonlySet = new Set( + DENY_LIST_STATIC_FILTERS.trailingAddressWordExclusions, +); + +const buildDefinedTermCueRe = (): RegExp => { + const cues = DENY_LIST_STATIC_FILTERS.definedTermCues.toSorted( + (left, right) => right.length - left.length, + ); + if (cues.length === 0) { + return /$(?!)/; + } + const pattern = cues + .map((cue) => escapeRegExp(cue).replace(/\s+/g, "\\s+")) + .join("|"); + return new RegExp(`^[\\s,]*(?:${pattern})\\b`, "iu"); +}; +const DEFINED_TERM_CUE_RE = buildDefinedTermCueRe(); const PERSON_CHAIN_BREAK_RE = /[!?;:]|,/u; const WORD_CHAR_RE = /[\p{L}\p{N}]/u; @@ -601,6 +630,7 @@ export type DenyListData = { originals: string[]; /** Maps pattern index → source types (plural). 
*/ sources: PatternSources[]; + filters: DenyListFilterData; }; const getCityEntries = ( @@ -668,6 +698,7 @@ export const buildDenyList = async ( ]); const commonWords = await loadCommonWords(); const monthNames = await loadMonthNames(); + const filters = await buildDenyListFilterData(ctx); const dictionaries = config.dictionaries; const hasDenyList = dictionaries?.denyList && dictionaries?.denyListMeta; @@ -683,7 +714,7 @@ export const buildDenyList = async ( // No dictionary data available — skip deny-list building if (!hasDenyList && !hasCities && !hasCustomDenyList) { // Still build name corpus entries if available - return buildNameCorpusOnly(config, ctx); + return buildNameCorpusOnly(config, ctx, filters); } const excluded = config.denyListExcludeCategories; @@ -835,6 +866,7 @@ export const buildDenyList = async ( customLabels: customLabelList, originals: patternList, sources: sourceList, + filters, }; }; @@ -847,6 +879,7 @@ export const buildDenyList = async ( const buildNameCorpusOnly = ( config: DenyListConfig, ctx: PipelineContext, + filters: DenyListFilterData, ): DenyListData | null => { if (!config.enableNameCorpus) { return null; @@ -882,6 +915,7 @@ const buildNameCorpusOnly = ( customLabels: customLabelList, originals: patternList, sources: sourceList, + filters, }; }; @@ -979,6 +1013,26 @@ type RawMatch = { patternIdx: number; }; +const buildStreetTypeFilterValues = async (): Promise => + lowerSortedUnique(await buildStreetTypePatterns()); + +const buildDenyListFilterData = async ( + ctx: PipelineContext, +): Promise => ({ + stopwords: [...getStopwords(ctx)], + allowList: [...getAllowList(ctx)], + personStopwords: [...getPersonStopwords(ctx)], + addressStopwords: [...getAddressStopwords(ctx)], + streetTypes: await buildStreetTypeFilterValues(), + firstNames: [...getNameCorpusFirstNames(ctx)], + genericRoles: [...(ctx.genericRoles ?? 
EMPTY_GENERIC_ROLES)], + sentenceStarters: [...DENY_LIST_STATIC_FILTERS.sentenceStarters], + trailingAddressWordExclusions: [ + ...DENY_LIST_STATIC_FILTERS.trailingAddressWordExclusions, + ], + definedTermCues: [...DENY_LIST_STATIC_FILTERS.definedTermCues], +}); + const customMatchHasValidEdges = ( fullText: string, start: number, @@ -1262,14 +1316,9 @@ export const processDenyListMatches = ( continue; } - // Skip the trailing-capitalised-word extension when the - // chain sits inside a defined-term quote - // (`"Bond Hedge Transactions"`, `"Blue Sky Laws"`). - // Legal prose uses curly or straight quotes to introduce - // capitalised noun phrases that are not personal names; - // chaining beyond the name corpus inside that bracketed - // context produces unstable spans like - // `"Bond Hedge Transactions"`-as-person. + // Skip extension inside quoted defined-term contexts: + // legal prose often uses quoted capitalised noun phrases + // that are not personal names. const insideDefinedTermQuote = isSuppressibleDefinedTermQuote( fullText, first.start, @@ -1285,28 +1334,22 @@ export const processDenyListMatches = ( // Score: chained names get 0.9, single names 0.5 const score = chain.length >= 2 ? 0.9 : 0.5; - // Single-word deny-list matches are too noisy: - // "Rate", "Server", "Code" etc. are surnames but - // also common English words. Only accept single- - // word matches when the next word is also uppercase - // (likely a full name: "Alena Zemanová"). Skip - // sentence-starter articles ("The Purchaser…") - // which otherwise turn section headings like - // "Purchase Price↵The Purchaser…" into person hits. + // Single-word deny-list matches are noisy. Only accept + // them when the next token has the shape of a name word, + // while excluding language-data sentence starters. 
if (chain.length === 1) { const afterEnd = last.end; const rest = fullText.slice(afterEnd).trimStart(); - // Require Cap + lowercase: filters out acronyms like - // "EU", "USA" so "Rady EU" doesn't read as a name. + // Require Cap + lowercase so acronym-shaped tokens + // do not promote a single-token hit. const nextIsUpper = rest.length > 1 && /^\p{Lu}\p{Ll}/u.test(rest); if (!nextIsUpper) { continue; } - // Reject sentence-starter articles ("The Purchaser…") - // so section headings followed by a sentence don't - // get promoted to person hits. + // Reject sentence starters so headings followed by + // prose do not get promoted to person hits. const nextWord = /^\p{L}+/u.exec(rest)?.[0] ?? ""; - if (SENTENCE_STARTER_WORDS.has(nextWord)) { + if (SENTENCE_STARTER_WORDS.has(nextWord.toLowerCase())) { continue; } } @@ -1358,44 +1401,6 @@ const POSTAL_PREFIX_RE = new RegExp( `(?:\\d{5}|\\d{3}\\s\\d{2})\\s*${DASH}?\\s*$`, ); -// Words that must NOT be absorbed into an address span -// when they follow a postal-code + city pattern. Party -// roles, organizational nouns, and common legal terms. 
-const TRAILING_WORD_EXCLUSIONS: ReadonlySet = new Set([ - // CZ/SK party roles - "nájemce", - "pronajímatel", - "kupující", - "prodávající", - "objednatel", - "zhotovitel", - "dodavatel", - "odběratel", - "věřitel", - "dlužník", - "zadavatel", - "uchazeč", - "příjemce", - "plátce", - // Organizational nouns - "správa", - "sekretariát", - "kancelář", - "odbor", - "oddělení", - "úřad", - "inspekce", - "agentura", - // Legal clause starters - "článek", - "smlouva", - "dodatek", - "příloha", - "předmět", - "podmínky", - "ustanovení", -]); - const extendCityDistricts = (entities: Entity[], fullText: string): void => { for (const entity of entities) { if (entity.label !== "address") { @@ -1446,7 +1451,7 @@ const extendCityDistricts = (entities: Entity[], fullText: string): void => { const trailingWordM = /^[\s]{1,4}(\p{Lu}\p{Ll}+)/u.exec(afterExt); if (trailingWordM && !trailingWordM[0].includes("\n")) { const candidate = (trailingWordM[1] ?? "").toLowerCase(); - if (!TRAILING_WORD_EXCLUSIONS.has(candidate)) { + if (!TRAILING_ADDRESS_WORD_EXCLUSIONS.has(candidate)) { entity.end += trailingWordM[0].length; entity.text = fullText.slice(entity.start, entity.end); } @@ -1456,30 +1461,20 @@ const extendCityDistricts = (entities: Entity[], fullText: string): void => { /** * Extend a person name match to include subsequent - * capitalized words. "Pavel" + " Heřmánek" → "Pavel - * Heřmánek". Stops at lowercase words, punctuation, - * or end of text. Also extends backward if preceded - * by a capitalized word (for "Miroslav Braňka" when - * only "Braňka" matched). + * capitalized words. Stops at lowercase words, + * punctuation, or end of text. */ /** * Defined-term marker: an opening typographic or straight * quote enclosing the chain start, AND a closing quote - * within a short window followed by a - * definitional cue (`means`, `shall mean`, `shall have - * the meaning(s)`, `refers to`). 
Legal documents reserve - * this construction for defined terms; the contents are - * not personal names even when individual tokens collide - * with the name corpus. - * - * Plain quotations like `"John Unknown" said ...` do NOT - * count: there is no definitional cue, so the trailing - * surname extension is still allowed to absorb `Unknown`. + * within a short window followed by a language-data + * definitional cue. Legal documents reserve this + * construction for defined terms; the contents are not + * personal names even when individual tokens collide with + * the name corpus. */ const OPENING_QUOTES = new Set(['"', "'", "“", "„", "‟", "‘", "‛", "«"]); const CLOSING_QUOTES = new Set(['"', "'", "”", "’", "»", "“"]); -const DEFINED_TERM_CUE_RE = - /^[\s,]*(?:means?|shall\s+means?|shall\s+have\s+the\s+meanings?|refers?\s+to|has\s+the\s+meanings?|is\s+defined)\b/iu; const DEFINED_TERM_LOOKAHEAD = 120; const DEFINED_TERM_LOOKBEHIND = 80; const EMPTY_GENERIC_ROLES: ReadonlySet = new Set(); @@ -1591,12 +1586,9 @@ const isSuppressibleDefinedTermQuote = ( const words = definedTermQuote.content.match(WORD_RE) ?? []; - // A quoted defined term can itself be a real person: - // `"John Smith" shall mean the employee...`. Preserve those - // when the definition itself points at a legal/business role - // from dictionary data. Legal terms such as `"Bond Hedge"` - // stay suppressible even if their first token collides with - // a given-name corpus entry. + // A quoted defined term can itself be a real person. + // Preserve those when the definition points at a role from + // dictionary data. if ( words.length >= 2 && startsWithKnownFirstName(definedTermQuote.content, ctx) && @@ -1638,31 +1630,17 @@ const extendPersonName = ( wordEnd++; } - // Skip trailing punctuation (commas, periods, - // typographic closing quotes). 
Curly quotes survive - // normalisation because they often appear inside - // defined-term clauses (`"Blue Sky Laws"`); strip - // them so the allow-list / stopword check sees the - // bare word. + // Skip trailing punctuation and typographic closing + // quotes so stopword checks see the bare word. const word = text.slice(wordStart, wordEnd); const stripped = word.replace(/[,;.”"’'“»]+$/, ""); if (stripped.length < 2) { break; } - // Don't extend into stopwords or person stopwords. - // The global allow list is intentionally NOT consulted - // here: real surnames such as `Law`, `Tesla`, or - // `Vote` are common English words and live on the - // allow list to suppress single-token noise, but they - // are legitimate name extensions when preceded by a - // first name in plain prose (`John Law`, `Elon - // Tesla`). Defined-term contexts (`"Blue Sky Laws"`, - // `"Bond Hedge Transactions"`) are filtered earlier by - // `isInsideDefinedTermQuote`, so by the time - // `extendPersonName` runs we are in ordinary prose and - // the allow-list block would only swallow real - // surnames. + // Do not consult the global allow list here: common + // words can be legitimate name extensions once a first + // name has established person context. const lower = stripped.toLowerCase(); if (getStopwords(ctx).has(lower) || getPersonStopwords(ctx).has(lower)) { break; diff --git a/packages/anonymize/src/detectors/regex.ts b/packages/anonymize/src/detectors/regex.ts index 71a6bcab..d3123242 100644 --- a/packages/anonymize/src/detectors/regex.ts +++ b/packages/anonymize/src/detectors/regex.ts @@ -76,6 +76,26 @@ const escapeRegexPhrase = (s: string): string => /** Escape for use inside a regex character class. 
*/ const escapeCharClass = (s: string): string => s.replace(/[\]\\^-]/g, "\\$&"); +const utf8ByteLength = (text: string): number => { + let length = 0; + for (const char of text) { + const codePoint = char.codePointAt(0); + if (codePoint === undefined) { + continue; + } + if (codePoint <= 0x7f) { + length += 1; + } else if (codePoint <= 0x7ff) { + length += 2; + } else if (codePoint <= 0xffff) { + length += 3; + } else { + length += 4; + } + } + return length; +}; + const toSortedAlternation = (values: readonly string[]): string => [ ...new Set( @@ -141,6 +161,7 @@ export type RegexMeta = { label: string; score: number; sourceDetail?: Entity["sourceDetail"]; + minByteLength?: number; /** Post-match stdnum validator for confirmation. */ validator?: Validator; /** Extract the identifier portion when context is part of the regex span. */ @@ -151,6 +172,7 @@ type RegexDef = { pattern: string; label: string; score: number; + minByteLength?: number; validator?: Validator; validatorInput?: (text: string) => string; }; @@ -468,6 +490,7 @@ const INTL_PHONE: RegexDef = { `(?:[^\\S\\n]|[.\\-])?\\d{0,4}\\b`, label: "phone number", score: 1, + minByteLength: MIN_PHONE_LENGTH, }; // Czech phone numbers: mobiles start with 6/7, @@ -483,6 +506,7 @@ const CZ_PHONE: RegexDef = { `(?![^\\S\\n]*(?:Kč|,-|korun|EUR|USD|€|\\$))\\b`, label: "phone number", score: 0.85, + minByteLength: MIN_PHONE_LENGTH, }; /** @@ -498,6 +522,7 @@ const TEL_PREFIX_PHONE: RegexDef = { `(?:[^\\S\\n]|[.\\-])?\\d{3}\\b`, label: "phone number", score: 0.95, + minByteLength: MIN_PHONE_LENGTH, }; /** @@ -518,6 +543,7 @@ const US_PAREN_PHONE: RegexDef = { `\\(\\d{3}\\)(?:[^\\S\\n]|[.\\-])?\\d{3}` + `(?:[^\\S\\n]|[.\\-])\\d{4}\\b`, label: "phone number", score: 0.9, + minByteLength: MIN_PHONE_LENGTH, }; const CREDIT_CARD: RegexDef = { @@ -597,6 +623,7 @@ const HU_LANDLINE: RegexDef = { `(?:[^\\S\\n]|[.\\-])?\\d{4}\\b`, label: "phone number", score: 0.9, + minByteLength: MIN_PHONE_LENGTH, }; // Czech license plates 
(SPZ/RZ). @@ -1139,6 +1166,9 @@ export const REGEX_META: readonly RegexMeta[] = ALL_REGEX_DEFS.map( if (d.validator) { meta.validator = d.validator; } + if (d.minByteLength) { + meta.minByteLength = d.minByteLength; + } if (d.validatorInput) { meta.validatorInput = d.validatorInput; } @@ -1777,8 +1807,8 @@ export const processRegexMatches = ( } if ( meta.sourceDetail !== "custom-regex" && - meta.label === "phone number" && - match.text.length < MIN_PHONE_LENGTH + meta.minByteLength !== undefined && + utf8ByteLength(match.text) < meta.minByteLength ) { continue; } From 539164ee267cf6043f31847f5c867dee3419d37f Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 19:49:31 +0200 Subject: [PATCH 024/130] perf: skip ts search for native prep --- .../scripts/migration-fixture-perf.mjs | 31 ++- .../anonymize/src/build-unified-search.ts | 204 +++++++++++++----- 2 files changed, 181 insertions(+), 54 deletions(-) diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index b4add88e..cb1ee94b 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -183,7 +183,15 @@ async function runWorker() { const context = indexModule.createPipelineContext(); const prepareStart = Bun.nanoseconds(); - const search = await indexModule.preparePipelineSearch({ config, context }); + const search = + runtime === "native-static" + ? 
await prepareNativeStaticSearch({ + sourceRoot, + variant, + config, + context, + }) + : await indexModule.preparePipelineSearch({ config, context }); const prepareMs = elapsedMs(prepareStart); const nativeRewrite = describeNativeRewrite(config, search, runtime); @@ -269,6 +277,27 @@ async function runWorker() { ); } +async function prepareNativeStaticSearch({ + sourceRoot, + variant, + config, + context, +}) { + const module = await importSource( + sourceRoot, + "packages/anonymize/src/build-unified-search.ts", + variant, + ); + const buildNativeStaticSearchBundle = Reflect.get( + Object(module), + "buildNativeStaticSearchBundle", + ); + if (typeof buildNativeStaticSearchBundle !== "function") { + throw new TypeError("Native static search bundle builder is unavailable"); + } + return buildNativeStaticSearchBundle(config, [], context); +} + async function runTypeScriptFixtureSweep({ indexModule, config, diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 59efce96..ea927fa1 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -23,6 +23,7 @@ import { getTextSearch } from "./search-engine"; import { isLegalFormsEnabled, + type CustomRegexPattern, type GazetteerEntry, type PipelineConfig, } from "./types"; @@ -182,11 +183,49 @@ export type UnifiedSearchInstance = { nativeStaticConfig: NativePreparedSearchConfig; }; -export const buildUnifiedSearch = async ( +type GazetteerPatternResult = { + patterns: PatternEntry[]; + data: GazetteerData; +}; + +type CountryPatternResult = { + patterns: PatternEntry[]; + data: CountryData; +}; + +type UnifiedSearchSources = { + allRegex: PatternEntry[]; + regexMeta: RegexMeta[]; + customRegexes: CustomRegexPattern[]; + customRegexMeta: RegexMeta[]; + legalForms: readonly string[]; + triggers: { + patterns: string[]; + rules: TriggerRule[]; + }; + denyListData: DenyListData | null; + streetTypes: string[]; + 
gazResult: GazetteerPatternResult | null; + countryResult: CountryPatternResult | null; + slices: UnifiedSearchInstance["slices"]; + literalAllPatterns: PatternEntry[] | string[]; + canUseGlobalWholeWordLiterals: boolean; + customDenyListNeedsWholeWords: (pattern: string) => boolean; +}; + +export type NativeStaticSearchBundle = { + nativeStaticConfig: NativePreparedSearchConfig; + slices: UnifiedSearchInstance["slices"]; + regexMeta: readonly RegexMeta[]; + customRegexMeta: readonly RegexMeta[]; + denyListData: DenyListData | null; +}; + +const buildUnifiedSearchSources = async ( config: PipelineConfig, gazetteerEntries: GazetteerEntry[] = [], ctx: PipelineContext = defaultContext, -): Promise => { +): Promise => { const legalFormsEnabled = isLegalFormsEnabled(config); const searchLabels = config.enableHotwordRules === true @@ -301,28 +340,6 @@ export const buildUnifiedSearch = async ( end: offset + triggers.patterns.length, }; - // Trigger patterns need caseInsensitive on AC - // (only ~120 objects, not 200K). Regex/legal-form - // patterns are bare strings (auto-classified). - const triggerEntries = triggers.patterns.map((p) => ({ - pattern: p, - literal: true as const, - caseInsensitive: true, - })); - - const regexAllPatterns = [...allRegex, ...legalForms, ...triggerEntries]; - - // TextSearch uses static complexity routing for - // regex patterns: common regexes share bounded - // chunks, while high-risk patterns are isolated. - const tsRegex = new (getTextSearch())(regexAllPatterns); - const tsCustomRegex = new (getTextSearch())( - customRegexes.map((entry) => entry.pattern), - { - overlapStrategy: "all", - }, - ); - // ── Instance 2: deny-list + street-types + gaz ── // Deny-list and street-type patterns are plain // strings (allLiteral). Gazetteer adds exact @@ -424,10 +441,100 @@ export const buildUnifiedSearch = async ( ...(countryResult?.patterns ?? 
[]), ]; + return { + allRegex, + regexMeta, + customRegexes, + customRegexMeta, + legalForms, + triggers, + denyListData, + streetTypes, + gazResult, + countryResult, + slices: { + regex: regexSlice, + customRegex: customRegexSlice, + legalForms: legalFormsSlice, + triggers: triggersSlice, + denyList: denyListSlice, + streetTypes: streetTypesSlice, + gazetteer: gazetteerSlice, + countries: countriesSlice, + }, + literalAllPatterns, + canUseGlobalWholeWordLiterals, + customDenyListNeedsWholeWords, + }; +}; + +export const buildNativeStaticSearchBundle = async ( + config: PipelineConfig, + gazetteerEntries: GazetteerEntry[] = [], + ctx: PipelineContext = defaultContext, +): Promise => { + const sources = await buildUnifiedSearchSources( + config, + gazetteerEntries, + ctx, + ); + return { + nativeStaticConfig: buildNativeStaticConfig({ + regexPatterns: sources.allRegex, + regexMeta: sources.regexMeta, + customRegexes: sources.customRegexes, + customRegexMeta: sources.customRegexMeta, + denyListData: sources.denyListData, + gazetteerPatterns: sources.gazResult?.patterns ?? [], + gazetteerData: sources.gazResult?.data ?? null, + countryPatterns: sources.countryResult?.patterns ?? [], + countryData: sources.countryResult?.data ?? 
null, + customDenyListNeedsWholeWords: sources.customDenyListNeedsWholeWords, + }), + slices: sources.slices, + regexMeta: sources.regexMeta, + customRegexMeta: sources.customRegexMeta, + denyListData: sources.denyListData, + }; +}; + +export const buildUnifiedSearch = async ( + config: PipelineConfig, + gazetteerEntries: GazetteerEntry[] = [], + ctx: PipelineContext = defaultContext, +): Promise => { + const sources = await buildUnifiedSearchSources( + config, + gazetteerEntries, + ctx, + ); + const triggerEntries = sources.triggers.patterns.map((p) => ({ + pattern: p, + literal: true as const, + caseInsensitive: true, + })); + + const regexAllPatterns = [ + ...sources.allRegex, + ...sources.legalForms, + ...triggerEntries, + ]; + + // TextSearch uses static complexity routing for + // regex patterns: common regexes share bounded + // chunks, while high-risk patterns are isolated. + const tsRegex = new (getTextSearch())(regexAllPatterns); + const tsCustomRegex = new (getTextSearch())( + sources.customRegexes.map((entry) => entry.pattern), + { + overlapStrategy: "all", + }, + ); + const tsLiterals = - literalAllPatterns.length > 0 - ? new (getTextSearch())(literalAllPatterns, { - ...(canUseGlobalWholeWordLiterals + sources.literalAllPatterns.length > 0 + ? new (getTextSearch())(sources.literalAllPatterns, { + ...(sources.canUseGlobalWholeWordLiterals ? { allLiteral: true, wholeWords: true } : {}), caseInsensitive: true, @@ -436,38 +543,29 @@ export const buildUnifiedSearch = async ( : new (getTextSearch())([]); const nativeStaticConfig = buildNativeStaticConfig({ - regexPatterns: allRegex, - regexMeta, - customRegexes, - customRegexMeta, - denyListData, - gazetteerPatterns: gazResult?.patterns ?? [], - gazetteerData: gazResult?.data ?? null, - countryPatterns: countryResult?.patterns ?? [], - countryData: countryResult?.data ?? 
null, - customDenyListNeedsWholeWords, + regexPatterns: sources.allRegex, + regexMeta: sources.regexMeta, + customRegexes: sources.customRegexes, + customRegexMeta: sources.customRegexMeta, + denyListData: sources.denyListData, + gazetteerPatterns: sources.gazResult?.patterns ?? [], + gazetteerData: sources.gazResult?.data ?? null, + countryPatterns: sources.countryResult?.patterns ?? [], + countryData: sources.countryResult?.data ?? null, + customDenyListNeedsWholeWords: sources.customDenyListNeedsWholeWords, }); return { tsRegex, tsCustomRegex, tsLiterals, - slices: { - regex: regexSlice, - customRegex: customRegexSlice, - legalForms: legalFormsSlice, - triggers: triggersSlice, - denyList: denyListSlice, - streetTypes: streetTypesSlice, - gazetteer: gazetteerSlice, - countries: countriesSlice, - }, - regexMeta, - customRegexMeta, - triggerRules: triggers.rules, - denyListData, - gazetteerData: gazResult?.data ?? null, - countryData: countryResult?.data ?? null, + slices: sources.slices, + regexMeta: sources.regexMeta, + customRegexMeta: sources.customRegexMeta, + triggerRules: sources.triggers.rules, + denyListData: sources.denyListData, + gazetteerData: sources.gazResult?.data ?? null, + countryData: sources.countryResult?.data ?? 
null, nativeStaticConfig, }; }; From f219599d1712dd31d3b9eb61862aca7c3096d6f6 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Wed, 24 Jun 2026 19:59:50 +0200 Subject: [PATCH 025/130] fix: clean migration benchmark helper --- packages/anonymize/scripts/migration-fixture-perf.mjs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index cb1ee94b..5601bd2d 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -661,7 +661,6 @@ function describeNativeRewrite(config, search, runtime) { const unsupportedPipelineStages = describeUnsupportedPipelineStages( config, search, - denyListSourceCounts, ); return { @@ -684,11 +683,7 @@ function describeNativeRewrite(config, search, runtime) { }; } -function describeUnsupportedPipelineStages( - config, - search, - denyListSourceCounts, -) { +function describeUnsupportedPipelineStages(config, search) { const stages = []; if (config.enableLegalForms) { stages.push("legal-forms-v2"); From f7f20365cbe2bdf3f32908e537f8e6448461ba11 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 09:42:09 +0200 Subject: [PATCH 026/130] feat: wire native static redaction package --- Cargo.lock | 203 ++- crates/anonymize-adapter-contract/Cargo.toml | 3 + crates/anonymize-adapter-contract/src/lib.rs | 1188 +++++++++++- crates/anonymize-core/Cargo.toml | 4 +- crates/anonymize-core/src/address_seeds.rs | 1202 ++++++++++++ crates/anonymize-core/src/anchored.rs | 158 ++ crates/anonymize-core/src/artifact_bytes.rs | 122 ++ crates/anonymize-core/src/dates.rs | 459 +++++ crates/anonymize-core/src/diagnostics.rs | 15 + crates/anonymize-core/src/false_positives.rs | 876 +++++++++ crates/anonymize-core/src/legal_forms.rs | 1612 +++++++++++++++++ crates/anonymize-core/src/lib.rs | 34 +- crates/anonymize-core/src/money.rs | 687 +++++++ 
crates/anonymize-core/src/prepared.rs | 1110 ++++++++++-- crates/anonymize-core/src/processors.rs | 267 ++- crates/anonymize-core/src/resolution/merge.rs | 87 + .../anonymize-core/src/resolution/sanitize.rs | 29 +- crates/anonymize-core/src/search.rs | 358 +++- crates/anonymize-core/src/signatures.rs | 641 +++++++ crates/anonymize-core/src/triggers.rs | 1192 ++++++++++++ crates/anonymize-core/src/types.rs | 10 + crates/anonymize-core/src/validators.rs | 647 +++++++ crates/anonymize-core/tests/prepared.rs | 1214 ++++++++++++- crates/anonymize-core/tests/processors.rs | 211 ++- crates/anonymize-core/tests/resolution.rs | 114 ++ crates/anonymize-core/tests/search.rs | 97 +- crates/anonymize-napi/Cargo.toml | 1 + crates/anonymize-napi/src/lib.rs | 494 +++-- crates/anonymize-py/src/lib.rs | 120 +- .../scripts/migration-fixture-perf.mjs | 1295 +++++++++++-- .../anonymize/scripts/native-adapter-perf.mjs | 101 +- .../anonymize/src/__test__/countries.test.ts | 15 + ...kit-legal-services-framework.snapshot.json | 6 +- .../cs/sanofi-bonus-agreement.snapshot.json | 8 +- ...chaeftsfuehrer-dienstvertrag.snapshot.json | 4 +- .../software-license-agreement.snapshot.json | 9 +- .../src/__test__/load-dictionaries.ts | 143 +- .../__test__/native-adapter-parity.test.ts | 292 +++ .../src/__test__/pipeline-config.test.ts | 115 ++ .../src/__test__/us-bank-routing.test.ts | 10 + .../anonymize/src/build-unified-search.ts | 596 +++++- packages/anonymize/src/context.ts | 4 + .../src/data/address-boundaries.json | 4 + .../data/address-jurisdiction-prefixes.json | 4 + .../src/data/address-stop-keywords.json | 10 + .../src/data/ambiguous-country-surfaces.json | 4 + .../anonymize/src/data/clause-noun-heads.json | 6 +- .../src/data/defined-term-heads.json | 4 + .../anonymize/src/data/deny-list-filters.json | 33 - .../anonymize/src/data/language-scopes.json | 73 + .../src/data/legal-form-rule-words.json | 27 + .../src/data/legal-role-heads.cs.json | 6 + .../src/data/organization-unit-heads.json | 
13 + .../anonymize/src/data/person-stopwords.json | 2 + .../anonymize/src/data/signing-clauses.json | 34 +- packages/anonymize/src/data/triggers.de.json | 2 +- packages/anonymize/src/data/triggers.en.json | 11 + .../anonymize/src/detectors/address-seeds.ts | 70 + packages/anonymize/src/detectors/countries.ts | 3 +- packages/anonymize/src/detectors/deny-list.ts | 259 ++- .../anonymize/src/detectors/legal-forms.ts | 10 +- packages/anonymize/src/detectors/regex.ts | 254 ++- packages/anonymize/src/detectors/triggers.ts | 13 +- .../anonymize/src/filters/false-positives.ts | 17 +- packages/anonymize/src/language-scope.ts | 86 + packages/anonymize/src/pipeline.ts | 13 +- packages/anonymize/src/types.ts | 18 +- packages/data/config/address-boundaries.json | 4 + .../config/address-jurisdiction-prefixes.json | 4 + .../data/config/address-stop-keywords.json | 10 + .../config/ambiguous-country-surfaces.json | 4 + packages/data/config/clause-noun-heads.json | 6 +- packages/data/config/defined-term-heads.json | 4 + packages/data/config/deny-list-filters.json | 48 + packages/data/config/language-scopes.json | 73 + .../data/config/legal-form-rule-words.json | 27 + packages/data/config/legal-role-heads.cs.json | 6 + .../data/config/organization-unit-heads.json | 13 + packages/data/config/person-stopwords.json | 2 + packages/data/config/signing-clauses.json | 34 +- packages/data/config/triggers.de.json | 2 +- packages/data/config/triggers.en.json | 11 + packages/data/dictionaries/index.ts | 147 +- 83 files changed, 16104 insertions(+), 1020 deletions(-) create mode 100644 crates/anonymize-core/src/address_seeds.rs create mode 100644 crates/anonymize-core/src/anchored.rs create mode 100644 crates/anonymize-core/src/artifact_bytes.rs create mode 100644 crates/anonymize-core/src/dates.rs create mode 100644 crates/anonymize-core/src/false_positives.rs create mode 100644 crates/anonymize-core/src/legal_forms.rs create mode 100644 crates/anonymize-core/src/money.rs create mode 100644 
crates/anonymize-core/src/signatures.rs create mode 100644 crates/anonymize-core/src/triggers.rs create mode 100644 crates/anonymize-core/src/validators.rs create mode 100644 packages/anonymize/src/data/address-jurisdiction-prefixes.json create mode 100644 packages/anonymize/src/data/ambiguous-country-surfaces.json create mode 100644 packages/anonymize/src/data/defined-term-heads.json create mode 100644 packages/anonymize/src/data/language-scopes.json create mode 100644 packages/anonymize/src/data/legal-form-rule-words.json create mode 100644 packages/anonymize/src/data/organization-unit-heads.json create mode 100644 packages/anonymize/src/language-scope.ts create mode 100644 packages/data/config/address-jurisdiction-prefixes.json create mode 100644 packages/data/config/ambiguous-country-surfaces.json create mode 100644 packages/data/config/defined-term-heads.json create mode 100644 packages/data/config/deny-list-filters.json create mode 100644 packages/data/config/language-scopes.json create mode 100644 packages/data/config/legal-form-rule-words.json create mode 100644 packages/data/config/organization-unit-heads.json diff --git a/Cargo.lock b/Cargo.lock index 5f81042e..47506a03 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,38 @@ dependencies = [ "memchr", ] +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f02882884d3e1bc524fb12c79f107f6ad0e1cfd498c536ffb494301740995dfe" + +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + [[package]] name = "bit-set" version = "0.8.0" @@ -32,12 +64,44 @@ version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" +[[package]] +name = "blake3" +version = "1.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures", +] + +[[package]] +name = "cc" +version = "1.2.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e228eec9be7c17ccb640b59b36a5cd805ea2a564a4c5e162c2f659fea30d3b96" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + [[package]] name = "cfg-if" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + [[package]] name = "convert_case" version = "0.11.0" @@ -47,12 +111,38 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "ctor" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01334b89b69ff726750c5ce5073fc8bd860e99aa9a8fc5ca11b04730e3aee97a" +[[package]] +name = "daachorse" +version = "3.0.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "99251f238b74cd219a86fe6ea9328308ebb223fcbb5b8eb5aa400b847a41dded" + +[[package]] +name = "fancy-regex" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + [[package]] name = "fancy-regex" version = "0.18.0" @@ -64,6 +154,12 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + [[package]] name = "futures" version = "0.3.32" @@ -152,6 +248,18 @@ dependencies = [ "slab", ] +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + [[package]] name = "heck" version = "0.5.0" @@ -164,6 +272,16 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom", + "libc", +] + [[package]] name = "libc" version = "0.2.186" @@ -263,6 +381,12 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + [[package]] 
name = "portable-atomic" version = "1.13.1" @@ -344,6 +468,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "regex" version = "1.12.4" @@ -428,6 +558,12 @@ dependencies = [ "zmij", ] +[[package]] +name = "shlex" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" + [[package]] name = "slab" version = "0.4.12" @@ -437,9 +573,8 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "stella-aho-corasick-core" version = "1.0.4" -source = "git+https://github.com/stella/aho-corasick?rev=b67e487aedc17725f57fb5b5d60678475b6b6667#b67e487aedc17725f57fb5b5d60678475b6b6667" dependencies = [ - "aho-corasick", + "daachorse", "unicode-case-mapping", ] @@ -447,15 +582,20 @@ dependencies = [ name = "stella-anonymize-adapter-contract" version = "1.5.0" dependencies = [ + "bincode", + "blake3", "serde", "serde_json", "stella-anonymize-core", + "zstd", ] [[package]] name = "stella-anonymize-core" version = "1.5.0" dependencies = [ + "fancy-regex 0.16.2", + "regex", "stella-text-search-core", ] @@ -463,6 +603,7 @@ dependencies = [ name = "stella-anonymize-napi" version = "1.5.0" dependencies = [ + "blake3", "napi", "napi-build", "napi-derive", @@ -497,7 +638,7 @@ name = "stella-regex-set-core" version = "1.0.5" source = "git+https://github.com/stella/regex-set?rev=8b80241a5a54cef8fdc6b6b34119981db0c6f597#8b80241a5a54cef8fdc6b6b34119981db0c6f597" dependencies = [ - "fancy-regex", + "fancy-regex 0.18.0", "regex", "regex-automata", "regex-syntax", @@ -507,7 +648,6 @@ dependencies = [ [[package]] name = "stella-text-search-core" version = "1.0.6" -source = 
"git+https://github.com/stella/text-search?rev=e427c5e8f5c13a0edc8503d24c4d4b34cbf46e8e#e427c5e8f5c13a0edc8503d24c4d4b34cbf46e8e" dependencies = [ "stella-aho-corasick-core", "stella-fuzzy-search-core", @@ -573,14 +713,69 @@ version = "1.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + +[[package]] +name = "wasip2" +version = "1.0.4+wasi-0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" +dependencies = [ + "wit-bindgen", +] + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + [[package]] name = "zmij" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/crates/anonymize-adapter-contract/Cargo.toml b/crates/anonymize-adapter-contract/Cargo.toml index 14a109fe..804516be 100644 --- a/crates/anonymize-adapter-contract/Cargo.toml +++ b/crates/anonymize-adapter-contract/Cargo.toml @@ -8,8 +8,11 @@ publish.workspace = true repository.workspace = true [dependencies] +bincode = { version = "2", features = ["serde"] } +blake3 = "1" serde = { version = "1", features = ["derive"] } stella-anonymize-core = { path = "../anonymize-core" } +zstd = "0.13" [dev-dependencies] serde_json = "1" diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 484119ce..c5b0015f 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -1,21 +1,36 @@ +use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet}; use serde::{Deserialize, Serialize}; use stella_anonymize_core::{ - CountryMatchData, DenyListFilterData, DenyListMatchData, DetectionSource, - DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, FuzzySearchOptions, - GazetteerMatchData, LiteralSearchOptions, OperatorConfig, OperatorType, - PatternSlice, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, - RegexSearchOptions, SearchEngine, SearchOptions, SearchPattern, SourceDetail, - StaticRedactionDiagnosticResult, StaticRedactionDiagnostics, - StaticRedactionResult, + AddressSeedData, AmountWordsData, CountryMatchData, CurrencyData, DateData, + DenyListFilterData, DenyListMatchData, DetectionSource, DiagnosticEvent, + DiagnosticEventKind, DiagnosticStage, FuzzySearchOptions, GazetteerMatchData, + 
LegalFormData, LiteralSearchOptions, MagnitudeSuffixData, MonetaryData, + OperatorConfig, OperatorType, PatternSlice, PreparedSearchConfig, + PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchEngine, + SearchOptions, SearchPattern, ShareQuantityTermData, SigningPlaceGuardData, + SourceDetail, StaticRedactionDiagnosticResult, StaticRedactionDiagnostics, + StaticRedactionResult, StringGroups, TriggerData, TriggerRule, + TriggerStrategy, TriggerValidation, WrittenAmountPatternData, }; pub type Result = std::result::Result; +const PREPARED_SEARCH_PACKAGE_HEADER: [u8; 8] = *b"ANONPKG1"; +const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 3; +const PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONPKZ1"; +const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 1; +const PREPARED_SEARCH_PACKAGE_DIGEST_BYTES: usize = 32; +const PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL: i32 = 3; + #[derive(Clone, Debug, Eq, PartialEq)] pub enum ContractError { + CompactStringIndexOutOfRange { field: &'static str, index: u32 }, FuzzyDistanceOutOfRange { distance: u32 }, + InvalidCompactStringGroups { field: &'static str, reason: String }, + InvalidPreparedSearchPackage { reason: String }, + MissingDenyListDataForLiteralPatterns, UnsupportedOperator { value: String }, UnsupportedSearchPatternKind { kind: String }, UnsupportedSourceDetail { value: String }, @@ -24,9 +39,27 @@ pub enum ContractError { impl std::fmt::Display for ContractError { fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { + Self::CompactStringIndexOutOfRange { field, index } => { + write!( + formatter, + "Compact string index out of range in {field}: {index}" + ) + } Self::FuzzyDistanceOutOfRange { distance } => { write!(formatter, "Fuzzy distance exceeds u8 range: {distance}") } + Self::InvalidCompactStringGroups { field, reason } => { + write!( + formatter, + "Compact string groups are invalid in {field}: {reason}" + ) + } + Self::InvalidPreparedSearchPackage { reason } 
=> { + write!(formatter, "Prepared search package is invalid: {reason}") + } + Self::MissingDenyListDataForLiteralPatterns => formatter.write_str( + "Deny-list data is required when literal patterns are derived from it", + ), Self::UnsupportedOperator { value } => { write!(formatter, "Unsupported anonymization operator: {value}") } @@ -93,6 +126,8 @@ pub struct BindingRegexMatchMeta { pub score: f64, pub source_detail: Option, pub requires_validation: Option, + pub validator_id: Option, + pub validator_input: Option, pub min_byte_length: Option, } @@ -107,12 +142,190 @@ pub struct BindingCountryMatchData { pub labels: Vec, } +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingTriggerData { + pub rules: Vec, + #[serde(default)] + pub address_stop_keywords: Vec, + #[serde(default)] + pub party_position_terms: Vec, +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingTriggerRule { + pub trigger: String, + pub label: String, + pub strategy: BindingTriggerStrategy, + pub validations: Vec, + pub include_trigger: bool, +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +#[serde(tag = "type", rename_all = "kebab-case")] +pub enum BindingTriggerStrategy { + ToNextComma { + #[serde(default)] + stop_words: Vec, + max_length: Option, + }, + ToEndOfLine, + NWords { + count: u32, + }, + CompanyIdValue, + Address { + max_chars: Option, + }, + MatchPattern { + pattern: String, + flags: Option, + }, +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +#[serde(tag = "type", rename_all = "kebab-case")] +pub enum BindingTriggerValidation { + StartsUppercase, + MinLength { + min: u32, + }, + MaxLength { + max: u32, + }, + NoDigits, + HasDigits, + MatchesPattern { + pattern: String, + flags: Option, + }, + ValidId { + validator: String, + }, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingLegalFormData { + #[serde(default)] + pub 
suffixes: Vec, + #[serde(default)] + pub normalized_boundary_suffixes: Vec, + #[serde(default)] + pub normalized_in_name_words: Vec, + #[serde(default)] + pub normalized_suffix_words: Vec, + #[serde(default)] + pub role_heads: Vec, + #[serde(default)] + pub sentence_verb_indicators: Vec, + #[serde(default)] + pub clause_noun_heads: Vec, + #[serde(default)] + pub connector_prose_heads: Vec, + #[serde(default)] + pub structural_single_cap_prefixes: Vec, + #[serde(default)] + pub leading_clause_phrases: Vec, + #[serde(default)] + pub leading_clause_direct_prefixes: Vec, + #[serde(default)] + pub connector_words: Vec, + #[serde(default)] + pub and_connector_words: Vec, + #[serde(default)] + pub in_name_prepositions: Vec, + #[serde(default)] + pub company_suffix_words: Vec, + #[serde(default)] + pub comma_gated_direct_prefixes: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingDateData { + #[serde(default)] + pub month_names_by_language: BTreeMap>, + #[serde(default)] + pub year_words_by_language: BTreeMap>, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingMonetaryData { + #[serde(default)] + pub currencies: BindingCurrencyData, + #[serde(default)] + pub amount_words: BindingAmountWordsData, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingCurrencyData { + #[serde(default)] + pub codes: Vec, + #[serde(default)] + pub symbols: Vec, + #[serde(default)] + pub local_names: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingAmountWordsData { + #[serde(default)] + pub written_amount_patterns: Vec, + #[serde(default)] + pub magnitude_suffixes: Vec, + #[serde(default)] + pub share_quantity_terms: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingWrittenAmountPatternData { + #[serde(default)] + pub keywords: Vec, +} + 
+#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingMagnitudeSuffixData { + #[serde(default)] + pub words: Vec, + #[serde(default)] + pub abbreviations_case_insensitive: Vec, + #[serde(default)] + pub abbreviations_case_sensitive: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingShareQuantityTermData { + #[serde(default)] + pub modifiers: Vec, + #[serde(default)] + pub nouns: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingAddressSeedData { + #[serde(default)] + pub boundary_words: Vec, + #[serde(default)] + pub br_cep_cue_words: Vec, +} + #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] pub struct BindingDenyListMatchData { + #[serde(default)] pub labels: Vec>, + #[serde(default)] + pub label_table: Vec, + #[serde(default)] + pub label_indices: Vec>, + #[serde(default)] pub custom_labels: Vec>, + #[serde(default)] + pub custom_label_indices: Vec>, pub originals: Vec, + #[serde(default)] pub sources: Vec>, + #[serde(default)] + pub source_table: Vec, + #[serde(default)] + pub source_indices: Vec>, pub filters: Option, } @@ -121,13 +334,27 @@ pub struct BindingDenyListFilterData { pub stopwords: Vec, pub allow_list: Vec, pub person_stopwords: Vec, + #[serde(default)] + pub person_trailing_nouns: Vec, pub address_stopwords: Vec, + #[serde(default)] + pub address_jurisdiction_prefixes: Vec, pub street_types: Vec, pub first_names: Vec, pub generic_roles: Vec, pub sentence_starters: Vec, pub trailing_address_word_exclusions: Vec, pub defined_term_cues: Vec, + #[serde(default)] + pub signing_place_guards: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingSigningPlaceGuardData { + #[serde(default)] + pub prefix_phrases: Vec, + #[serde(default)] + pub suffix_phrases: Vec, } #[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] @@ -145,6 
+372,8 @@ pub struct BindingPreparedSearchConfig { #[serde(default)] pub literal_options: Option, #[serde(default)] + pub literal_patterns_from_deny_list_data: bool, + #[serde(default)] pub slices: BindingPreparedSearchSlices, #[serde(default)] pub regex_meta: Vec, @@ -156,6 +385,410 @@ pub struct BindingPreparedSearchConfig { pub gazetteer_data: Option, #[serde(default)] pub country_data: Option, + #[serde(default)] + pub trigger_data: Option, + #[serde(default)] + pub legal_form_data: Option, + #[serde(default)] + pub address_seed_data: Option, + #[serde(default)] + pub date_data: Option, + #[serde(default)] + pub monetary_data: Option, +} + +#[derive(Deserialize)] +struct BinaryPreparedSearchPackageOwned { + config: BinaryPreparedSearchConfig, + artifacts: Vec, +} + +#[derive(Serialize)] +struct BinaryPreparedSearchPackageRef<'a> { + config: BinaryPreparedSearchConfig, + artifacts: &'a [u8], +} + +#[derive(Clone, Debug, PartialEq)] +pub struct BindingPreparedSearchPackage { + pub config: BindingPreparedSearchConfig, + pub artifacts: Vec, +} + +#[derive(Deserialize, Serialize)] +struct BinaryPreparedSearchConfig { + regex_patterns: Vec, + custom_regex_patterns: Vec, + literal_patterns: Vec, + regex_options: Option, + custom_regex_options: Option, + literal_options: Option, + literal_patterns_from_deny_list_data: bool, + slices: BindingPreparedSearchSlices, + regex_meta: Vec, + custom_regex_meta: Vec, + deny_list_data: Option, + gazetteer_data: Option, + country_data: Option, + trigger_data: Option, + legal_form_data: Option, + address_seed_data: Option, + date_data: Option, + monetary_data: Option, +} + +#[derive(Deserialize, Serialize)] +struct BinaryTriggerData { + rules: Vec, + address_stop_keywords: Vec, + party_position_terms: Vec, +} + +#[derive(Deserialize, Serialize)] +struct BinaryTriggerRule { + trigger: String, + label: String, + strategy: BinaryTriggerStrategy, + validations: Vec, + include_trigger: bool, +} + +#[derive(Deserialize, Serialize)] +enum 
BinaryTriggerStrategy { + ToNextComma { + stop_words: Vec, + max_length: Option, + }, + ToEndOfLine, + NWords { + count: u32, + }, + CompanyIdValue, + Address { + max_chars: Option, + }, + MatchPattern { + pattern: String, + flags: Option, + }, +} + +#[derive(Deserialize, Serialize)] +enum BinaryTriggerValidation { + StartsUppercase, + MinLength { + min: u32, + }, + MaxLength { + max: u32, + }, + NoDigits, + HasDigits, + MatchesPattern { + pattern: String, + flags: Option, + }, + ValidId { + validator: String, + }, +} + +pub fn prepared_search_package_to_bytes( + config: &BindingPreparedSearchConfig, + artifacts: &[u8], +) -> Result> { + let payload = prepared_search_package_payload_to_bytes(config, artifacts)?; + let digest = blake3::hash(&payload); + let mut bytes = Vec::with_capacity(raw_package_header_len(&payload)); + write_package_header( + &mut bytes, + PREPARED_SEARCH_PACKAGE_HEADER, + PREPARED_SEARCH_PACKAGE_VERSION, + digest.as_bytes(), + ); + bytes.extend_from_slice(&payload); + Ok(bytes) +} + +pub fn prepared_search_package_to_compressed_bytes( + config: &BindingPreparedSearchConfig, + artifacts: &[u8], +) -> Result> { + let payload = prepared_search_package_payload_to_bytes(config, artifacts)?; + prepared_search_package_compress_payload(&payload) +} + +pub fn prepared_search_package_digest(bytes: &[u8]) -> Result<[u8; 32]> { + Ok(prepared_search_package_parts(bytes)?.digest()) +} + +pub fn prepared_search_package_from_bytes( + bytes: &[u8], +) -> Result { + let parts = prepared_search_package_parts(bytes)?; + let payload = parts.payload()?; + verify_prepared_search_package_digest(parts.digest(), payload.as_ref())?; + let (package, read) = bincode::serde::decode_from_slice::< + BinaryPreparedSearchPackageOwned, + _, + >(payload.as_ref(), package_bincode_config()) + .map_err(|error| invalid_prepared_search_package(error.to_string()))?; + if read != payload.as_ref().len() { + return Err(invalid_prepared_search_package("trailing payload data")); + } + 
Ok(BindingPreparedSearchPackage { + config: BindingPreparedSearchConfig::from(package.config), + artifacts: package.artifacts, + }) +} + +impl From for BinaryPreparedSearchConfig { + fn from(config: BindingPreparedSearchConfig) -> Self { + Self { + regex_patterns: config.regex_patterns, + custom_regex_patterns: config.custom_regex_patterns, + literal_patterns: config.literal_patterns, + regex_options: config.regex_options, + custom_regex_options: config.custom_regex_options, + literal_options: config.literal_options, + literal_patterns_from_deny_list_data: config + .literal_patterns_from_deny_list_data, + slices: config.slices, + regex_meta: config.regex_meta, + custom_regex_meta: config.custom_regex_meta, + deny_list_data: config.deny_list_data, + gazetteer_data: config.gazetteer_data, + country_data: config.country_data, + trigger_data: config.trigger_data.map(BinaryTriggerData::from), + legal_form_data: config.legal_form_data, + address_seed_data: config.address_seed_data, + date_data: config.date_data, + monetary_data: config.monetary_data, + } + } +} + +impl From for BindingPreparedSearchConfig { + fn from(config: BinaryPreparedSearchConfig) -> Self { + Self { + regex_patterns: config.regex_patterns, + custom_regex_patterns: config.custom_regex_patterns, + literal_patterns: config.literal_patterns, + regex_options: config.regex_options, + custom_regex_options: config.custom_regex_options, + literal_options: config.literal_options, + literal_patterns_from_deny_list_data: config + .literal_patterns_from_deny_list_data, + slices: config.slices, + regex_meta: config.regex_meta, + custom_regex_meta: config.custom_regex_meta, + deny_list_data: config.deny_list_data, + gazetteer_data: config.gazetteer_data, + country_data: config.country_data, + trigger_data: config.trigger_data.map(BindingTriggerData::from), + legal_form_data: config.legal_form_data, + address_seed_data: config.address_seed_data, + date_data: config.date_data, + monetary_data: config.monetary_data, 
+ } + } +} + +impl From for BinaryTriggerData { + fn from(data: BindingTriggerData) -> Self { + Self { + rules: data + .rules + .into_iter() + .map(BinaryTriggerRule::from) + .collect(), + address_stop_keywords: data.address_stop_keywords, + party_position_terms: data.party_position_terms, + } + } +} + +impl From for BindingTriggerData { + fn from(data: BinaryTriggerData) -> Self { + Self { + rules: data + .rules + .into_iter() + .map(BindingTriggerRule::from) + .collect(), + address_stop_keywords: data.address_stop_keywords, + party_position_terms: data.party_position_terms, + } + } +} + +impl From for BinaryTriggerRule { + fn from(rule: BindingTriggerRule) -> Self { + Self { + trigger: rule.trigger, + label: rule.label, + strategy: BinaryTriggerStrategy::from(rule.strategy), + validations: rule + .validations + .into_iter() + .map(BinaryTriggerValidation::from) + .collect(), + include_trigger: rule.include_trigger, + } + } +} + +impl From for BindingTriggerRule { + fn from(rule: BinaryTriggerRule) -> Self { + Self { + trigger: rule.trigger, + label: rule.label, + strategy: BindingTriggerStrategy::from(rule.strategy), + validations: rule + .validations + .into_iter() + .map(BindingTriggerValidation::from) + .collect(), + include_trigger: rule.include_trigger, + } + } +} + +impl From for BinaryTriggerStrategy { + fn from(strategy: BindingTriggerStrategy) -> Self { + match strategy { + BindingTriggerStrategy::ToNextComma { + stop_words, + max_length, + } => Self::ToNextComma { + stop_words, + max_length, + }, + BindingTriggerStrategy::ToEndOfLine => Self::ToEndOfLine, + BindingTriggerStrategy::NWords { count } => Self::NWords { count }, + BindingTriggerStrategy::CompanyIdValue => Self::CompanyIdValue, + BindingTriggerStrategy::Address { max_chars } => { + Self::Address { max_chars } + } + BindingTriggerStrategy::MatchPattern { pattern, flags } => { + Self::MatchPattern { pattern, flags } + } + } + } +} + +impl From for BindingTriggerStrategy { + fn from(strategy: 
BinaryTriggerStrategy) -> Self { + match strategy { + BinaryTriggerStrategy::ToNextComma { + stop_words, + max_length, + } => Self::ToNextComma { + stop_words, + max_length, + }, + BinaryTriggerStrategy::ToEndOfLine => Self::ToEndOfLine, + BinaryTriggerStrategy::NWords { count } => Self::NWords { count }, + BinaryTriggerStrategy::CompanyIdValue => Self::CompanyIdValue, + BinaryTriggerStrategy::Address { max_chars } => { + Self::Address { max_chars } + } + BinaryTriggerStrategy::MatchPattern { pattern, flags } => { + Self::MatchPattern { pattern, flags } + } + } + } +} + +impl From for BinaryTriggerValidation { + fn from(validation: BindingTriggerValidation) -> Self { + match validation { + BindingTriggerValidation::StartsUppercase => Self::StartsUppercase, + BindingTriggerValidation::MinLength { min } => Self::MinLength { min }, + BindingTriggerValidation::MaxLength { max } => Self::MaxLength { max }, + BindingTriggerValidation::NoDigits => Self::NoDigits, + BindingTriggerValidation::HasDigits => Self::HasDigits, + BindingTriggerValidation::MatchesPattern { pattern, flags } => { + Self::MatchesPattern { pattern, flags } + } + BindingTriggerValidation::ValidId { validator } => { + Self::ValidId { validator } + } + } + } +} + +impl From for BindingTriggerValidation { + fn from(validation: BinaryTriggerValidation) -> Self { + match validation { + BinaryTriggerValidation::StartsUppercase => Self::StartsUppercase, + BinaryTriggerValidation::MinLength { min } => Self::MinLength { min }, + BinaryTriggerValidation::MaxLength { max } => Self::MaxLength { max }, + BinaryTriggerValidation::NoDigits => Self::NoDigits, + BinaryTriggerValidation::HasDigits => Self::HasDigits, + BinaryTriggerValidation::MatchesPattern { pattern, flags } => { + Self::MatchesPattern { pattern, flags } + } + BinaryTriggerValidation::ValidId { validator } => { + Self::ValidId { validator } + } + } + } +} + +fn prepared_search_package_payload_to_bytes( + config: &BindingPreparedSearchConfig, + 
artifacts: &[u8], +) -> Result> { + bincode::serde::encode_to_vec( + BinaryPreparedSearchPackageRef { + config: BinaryPreparedSearchConfig::from(config.clone()), + artifacts, + }, + package_bincode_config(), + ) + .map_err(|error| invalid_prepared_search_package(error.to_string())) +} + +fn prepared_search_package_compress_payload(payload: &[u8]) -> Result> { + let compressed = + zstd::bulk::compress(payload, PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL) + .map_err(|error| invalid_prepared_search_package(error.to_string()))?; + let digest = blake3::hash(payload); + let mut bytes = Vec::with_capacity( + raw_package_header_len(&compressed) + .saturating_add(std::mem::size_of::()), + ); + write_package_header( + &mut bytes, + PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER, + PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION, + digest.as_bytes(), + ); + let payload_len = u64::try_from(payload.len()) + .map_err(|_| invalid_prepared_search_package("payload length overflow"))?; + bytes.extend_from_slice(&payload_len.to_le_bytes()); + bytes.extend_from_slice(&compressed); + Ok(bytes) +} + +const fn raw_package_header_len(payload: &[u8]) -> usize { + PREPARED_SEARCH_PACKAGE_HEADER + .len() + .saturating_add(std::mem::size_of::()) + .saturating_add(PREPARED_SEARCH_PACKAGE_DIGEST_BYTES) + .saturating_add(payload.len()) +} + +fn write_package_header( + bytes: &mut Vec, + header: [u8; 8], + version: u32, + digest: &[u8; PREPARED_SEARCH_PACKAGE_DIGEST_BYTES], +) { + bytes.extend_from_slice(&header); + bytes.extend_from_slice(&version.to_le_bytes()); + bytes.extend_from_slice(digest); } #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] @@ -249,12 +882,39 @@ pub struct BindingStaticRedactionDiagnosticResult { pub fn prepared_search_config_from_binding( config: BindingPreparedSearchConfig, ) -> Result { + let deny_list_data = config.deny_list_data; + let literal_patterns = literal_patterns_from_binding( + config.literal_patterns, + config.literal_patterns_from_deny_list_data, + 
deny_list_data.as_ref(), + )?; + let legal_form_data = config.legal_form_data.map(|data| LegalFormData { + suffixes: data.suffixes, + normalized_boundary_suffixes: data.normalized_boundary_suffixes, + normalized_in_name_words: data.normalized_in_name_words, + normalized_suffix_words: data.normalized_suffix_words, + role_heads: data.role_heads, + sentence_verb_indicators: data.sentence_verb_indicators, + clause_noun_heads: data.clause_noun_heads, + connector_prose_heads: data.connector_prose_heads, + structural_single_cap_prefixes: data.structural_single_cap_prefixes, + leading_clause_phrases: data.leading_clause_phrases, + leading_clause_direct_prefixes: data.leading_clause_direct_prefixes, + connector_words: data.connector_words, + and_connector_words: data.and_connector_words, + in_name_prepositions: data.in_name_prepositions, + company_suffix_words: data.company_suffix_words, + comma_gated_direct_prefixes: data.comma_gated_direct_prefixes, + }); + let legal_form_suffixes = legal_form_data + .as_ref() + .map_or_else(Vec::new, |data| data.suffixes.clone()); Ok(PreparedSearchConfig { regex_patterns: search_patterns_from_binding(config.regex_patterns)?, custom_regex_patterns: search_patterns_from_binding( config.custom_regex_patterns, )?, - literal_patterns: search_patterns_from_binding(config.literal_patterns)?, + literal_patterns, regex_options: search_options_from_binding(config.regex_options), custom_regex_options: search_options_from_binding( config.custom_regex_options, @@ -263,13 +923,9 @@ pub fn prepared_search_config_from_binding( slices: slices_from_binding(&config.slices), regex_meta: regex_meta_from_binding(config.regex_meta)?, custom_regex_meta: regex_meta_from_binding(config.custom_regex_meta)?, - deny_list_data: config.deny_list_data.map(|data| DenyListMatchData { - labels: data.labels, - custom_labels: data.custom_labels, - originals: data.originals, - sources: data.sources, - filters: data.filters.map(deny_list_filters_from_binding), - }), + 
deny_list_data: deny_list_data + .map(deny_list_data_from_binding) + .transpose()?, gazetteer_data: config.gazetteer_data.map(|data| GazetteerMatchData { labels: data.labels, is_fuzzy: data.is_fuzzy, @@ -277,9 +933,277 @@ pub fn prepared_search_config_from_binding( country_data: config.country_data.map(|data| CountryMatchData { labels: data.labels, }), + trigger_data: config + .trigger_data + .map(|data| trigger_data_from_binding(data, legal_form_suffixes)), + legal_form_data, + address_seed_data: config.address_seed_data.map(|data| AddressSeedData { + boundary_words: data.boundary_words, + br_cep_cue_words: data.br_cep_cue_words, + }), + date_data: config.date_data.map(|data| DateData { + month_names_by_language: data.month_names_by_language, + year_words_by_language: data.year_words_by_language, + }), + monetary_data: config.monetary_data.map(monetary_data_from_binding), + }) +} + +enum PreparedSearchPackageParts<'a> { + Raw { + digest: [u8; 32], + payload: &'a [u8], + }, + Compressed { + digest: [u8; 32], + uncompressed_len: usize, + payload: &'a [u8], + }, +} + +impl PreparedSearchPackageParts<'_> { + const fn digest(&self) -> [u8; 32] { + match self { + Self::Raw { digest, .. } | Self::Compressed { digest, .. } => *digest, + } + } + + fn payload(&self) -> Result> { + match self { + Self::Raw { payload, .. } => Ok(Cow::Borrowed(payload)), + Self::Compressed { + uncompressed_len, + payload, + .. 
+ } => zstd::bulk::decompress(payload, *uncompressed_len) + .map(Cow::Owned) + .map_err(|error| invalid_prepared_search_package(error.to_string())), + } + } +} + +struct RawPackageHeader<'a> { + digest: [u8; 32], + payload: &'a [u8], +} + +fn prepared_search_package_parts( + bytes: &[u8], +) -> Result> { + let header = bytes + .get(..PREPARED_SEARCH_PACKAGE_HEADER.len()) + .ok_or_else(|| invalid_prepared_search_package("truncated header"))?; + if header == PREPARED_SEARCH_PACKAGE_HEADER { + let raw = raw_package_header( + bytes, + PREPARED_SEARCH_PACKAGE_VERSION, + PREPARED_SEARCH_PACKAGE_HEADER.len(), + )?; + return Ok(PreparedSearchPackageParts::Raw { + digest: raw.digest, + payload: raw.payload, + }); + } + if header == PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER { + let raw = raw_package_header( + bytes, + PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION, + PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER.len(), + )?; + let len_end = std::mem::size_of::(); + let len_bytes = raw + .payload + .get(..len_end) + .ok_or_else(|| invalid_prepared_search_package("truncated length"))?; + let len_array = <[u8; 8]>::try_from(len_bytes) + .map_err(|_| invalid_prepared_search_package("malformed length"))?; + let uncompressed_len = usize::try_from(u64::from_le_bytes(len_array)) + .map_err(|_| invalid_prepared_search_package("length overflow"))?; + let payload = raw + .payload + .get(len_end..) 
+ .ok_or_else(|| invalid_prepared_search_package("missing payload"))?; + return Ok(PreparedSearchPackageParts::Compressed { + digest: raw.digest, + uncompressed_len, + payload, + }); + } + Err(invalid_prepared_search_package("unexpected header")) +} + +fn raw_package_header( + bytes: &[u8], + expected_version: u32, + header_len: usize, +) -> Result> { + let version_start = header_len; + let version_end = version_start.saturating_add(std::mem::size_of::()); + let version_bytes = bytes + .get(version_start..version_end) + .ok_or_else(|| invalid_prepared_search_package("truncated version"))?; + let version_array = <[u8; 4]>::try_from(version_bytes) + .map_err(|_| invalid_prepared_search_package("malformed version"))?; + let version = u32::from_le_bytes(version_array); + if version != expected_version { + return Err(invalid_prepared_search_package("unsupported version")); + } + let digest_end = + version_end.saturating_add(PREPARED_SEARCH_PACKAGE_DIGEST_BYTES); + let digest_bytes = bytes + .get(version_end..digest_end) + .ok_or_else(|| invalid_prepared_search_package("truncated digest"))?; + let digest = + <[u8; PREPARED_SEARCH_PACKAGE_DIGEST_BYTES]>::try_from(digest_bytes) + .map_err(|_| invalid_prepared_search_package("malformed digest"))?; + let payload = bytes + .get(digest_end..) 
+ .ok_or_else(|| invalid_prepared_search_package("missing payload"))?; + Ok(RawPackageHeader { digest, payload }) +} + +fn verify_prepared_search_package_digest( + expected: [u8; 32], + payload: &[u8], +) -> Result<()> { + let actual = blake3::hash(payload); + if actual.as_bytes() != &expected { + return Err(invalid_prepared_search_package("digest mismatch")); + } + Ok(()) +} + +fn package_bincode_config() -> impl bincode::config::Config { + bincode::config::standard() + .with_little_endian() + .with_variable_int_encoding() +} + +fn invalid_prepared_search_package(reason: impl Into) -> ContractError { + ContractError::InvalidPreparedSearchPackage { + reason: reason.into(), + } +} + +fn deny_list_data_from_binding( + data: BindingDenyListMatchData, +) -> Result { + let pattern_count = data.originals.len(); + Ok(DenyListMatchData { + labels: string_groups_from_binding( + data.labels, + data.label_indices, + data.label_table.clone(), + pattern_count, + "deny_list.label_indices", + )?, + custom_labels: string_groups_from_binding( + data.custom_labels, + data.custom_label_indices, + data.label_table, + pattern_count, + "deny_list.custom_label_indices", + )?, + originals: data.originals, + sources: string_groups_from_binding( + data.sources, + data.source_indices, + data.source_table, + pattern_count, + "deny_list.source_indices", + )?, + filters: data.filters.map(deny_list_filters_from_binding), }) } +fn string_groups_from_binding( + groups: Vec>, + indices: Vec>, + table: Vec, + pattern_count: usize, + field: &'static str, +) -> Result { + if !indices.is_empty() { + validate_compact_string_indices(&indices, &table, field)?; + return StringGroups::from_table_indices(table, indices, field).map_err( + |error| ContractError::InvalidCompactStringGroups { + field, + reason: error.to_string(), + }, + ); + } + + if !groups.is_empty() { + return Ok(StringGroups::from_groups(groups)); + } + + Ok(StringGroups::empty_groups(pattern_count)) +} + +fn validate_compact_string_indices( 
+ groups: &[Vec], + table: &[String], + field: &'static str, +) -> Result<()> { + for group in groups { + for &index in group { + let Ok(index_usize) = usize::try_from(index) else { + return Err(ContractError::CompactStringIndexOutOfRange { + field, + index, + }); + }; + if index_usize >= table.len() { + return Err(ContractError::CompactStringIndexOutOfRange { + field, + index, + }); + } + } + } + + Ok(()) +} + +fn monetary_data_from_binding(data: BindingMonetaryData) -> MonetaryData { + MonetaryData { + currencies: CurrencyData { + codes: data.currencies.codes, + symbols: data.currencies.symbols, + local_names: data.currencies.local_names, + }, + amount_words: AmountWordsData { + written_amount_patterns: data + .amount_words + .written_amount_patterns + .into_iter() + .map(|entry| WrittenAmountPatternData { + keywords: entry.keywords, + }) + .collect(), + magnitude_suffixes: data + .amount_words + .magnitude_suffixes + .into_iter() + .map(|entry| MagnitudeSuffixData { + words: entry.words, + abbreviations_case_insensitive: entry.abbreviations_case_insensitive, + abbreviations_case_sensitive: entry.abbreviations_case_sensitive, + }) + .collect(), + share_quantity_terms: data + .amount_words + .share_quantity_terms + .into_iter() + .map(|entry| ShareQuantityTermData { + modifiers: entry.modifiers, + nouns: entry.nouns, + }) + .collect(), + }, + } +} + pub fn operator_config_from_binding( config: Option, ) -> Result { @@ -396,7 +1320,11 @@ fn deny_list_filters_from_binding( stopwords: lower_set(filters.stopwords), allow_list: lower_set(filters.allow_list), person_stopwords: lower_set(filters.person_stopwords), + person_trailing_nouns: lower_set(filters.person_trailing_nouns), address_stopwords: lower_set(filters.address_stopwords), + address_jurisdiction_prefixes: lower_set( + filters.address_jurisdiction_prefixes, + ), street_types: lower_set(filters.street_types), first_names: lower_set(filters.first_names), generic_roles: lower_set(filters.generic_roles), @@ 
-405,6 +1333,93 @@ fn deny_list_filters_from_binding( filters.trailing_address_word_exclusions, ), defined_term_cues: lower_set(filters.defined_term_cues), + signing_place_guards: filters + .signing_place_guards + .into_iter() + .map(|guard| SigningPlaceGuardData { + prefix_phrases: lower_set(guard.prefix_phrases), + suffix_phrases: lower_set(guard.suffix_phrases), + }) + .collect(), + } +} + +fn trigger_data_from_binding( + data: BindingTriggerData, + legal_form_suffixes: Vec, +) -> TriggerData { + TriggerData { + rules: data + .rules + .into_iter() + .map(trigger_rule_from_binding) + .collect(), + address_stop_keywords: data.address_stop_keywords, + party_position_terms: data.party_position_terms, + legal_form_suffixes, + } +} + +fn trigger_rule_from_binding(rule: BindingTriggerRule) -> TriggerRule { + TriggerRule { + trigger: rule.trigger, + label: rule.label, + strategy: trigger_strategy_from_binding(rule.strategy), + validations: rule + .validations + .into_iter() + .map(trigger_validation_from_binding) + .collect(), + include_trigger: rule.include_trigger, + } +} + +fn trigger_strategy_from_binding( + strategy: BindingTriggerStrategy, +) -> TriggerStrategy { + match strategy { + BindingTriggerStrategy::ToNextComma { + stop_words, + max_length, + } => TriggerStrategy::ToNextComma { + stop_words, + max_length, + }, + BindingTriggerStrategy::ToEndOfLine => TriggerStrategy::ToEndOfLine, + BindingTriggerStrategy::NWords { count } => { + TriggerStrategy::NWords { count } + } + BindingTriggerStrategy::CompanyIdValue => TriggerStrategy::CompanyIdValue, + BindingTriggerStrategy::Address { max_chars } => { + TriggerStrategy::Address { max_chars } + } + BindingTriggerStrategy::MatchPattern { pattern, flags } => { + TriggerStrategy::MatchPattern { pattern, flags } + } + } +} + +fn trigger_validation_from_binding( + validation: BindingTriggerValidation, +) -> TriggerValidation { + match validation { + BindingTriggerValidation::StartsUppercase => { + 
TriggerValidation::StartsUppercase + } + BindingTriggerValidation::MinLength { min } => { + TriggerValidation::MinLength(min) + } + BindingTriggerValidation::MaxLength { max } => { + TriggerValidation::MaxLength(max) + } + BindingTriggerValidation::NoDigits => TriggerValidation::NoDigits, + BindingTriggerValidation::HasDigits => TriggerValidation::HasDigits, + BindingTriggerValidation::MatchesPattern { pattern, flags } => { + TriggerValidation::MatchesPattern { pattern, flags } + } + BindingTriggerValidation::ValidId { validator } => { + TriggerValidation::ValidId { validator } + } } } @@ -424,6 +1439,27 @@ fn search_patterns_from_binding( .collect() } +fn literal_patterns_from_binding( + patterns: Vec, + from_deny_list_data: bool, + deny_list_data: Option<&BindingDenyListMatchData>, +) -> Result> { + let mut literal_patterns = search_patterns_from_binding(patterns)?; + if !from_deny_list_data { + return Ok(literal_patterns); + } + + let Some(data) = deny_list_data else { + return Err(ContractError::MissingDenyListDataForLiteralPatterns); + }; + let mut from_data = Vec::with_capacity( + data.originals.len().saturating_add(literal_patterns.len()), + ); + from_data.extend(data.originals.iter().cloned().map(SearchPattern::Literal)); + from_data.append(&mut literal_patterns); + Ok(from_data) +} + fn search_pattern_from_binding( pattern: BindingSearchPattern, ) -> Result { @@ -525,6 +1561,8 @@ fn regex_meta_from_binding( .map(|value| source_detail_from_binding(&value)) .transpose()?, requires_validation: entry.requires_validation.unwrap_or(false), + validator_id: entry.validator_id, + validator_input: entry.validator_input, min_byte_length: entry.min_byte_length, }) }) @@ -587,9 +1625,17 @@ fn search_engine_name(engine: SearchEngine) -> String { fn diagnostic_stage_name(stage: DiagnosticStage) -> String { match stage { + DiagnosticStage::PrepareCacheHit => "prepare.cache.hit", + DiagnosticStage::PrepareCacheMiss => "prepare.cache.miss", + 
DiagnosticStage::PrepareBindingParse => "prepare.binding.parse", + DiagnosticStage::PrepareBindingConvert => "prepare.binding.convert", + DiagnosticStage::PrepareArtifactsDecode => "prepare.artifacts.decode", DiagnosticStage::PrepareTotal => "prepare.total", DiagnosticStage::PrepareRegex => "prepare.regex", DiagnosticStage::PrepareCustomRegex => "prepare.custom-regex", + DiagnosticStage::PrepareAnchored => "prepare.anchored", + DiagnosticStage::PrepareLegalFormSearch => "prepare.legal-form-search", + DiagnosticStage::PrepareTriggerSearch => "prepare.trigger-search", DiagnosticStage::PrepareLiteral => "prepare.literal", DiagnosticStage::Normalize => "normalize", DiagnosticStage::FindMatches => "find-matches", @@ -598,12 +1644,19 @@ fn diagnostic_stage_name(stage: DiagnosticStage) -> String { DiagnosticStage::FindLiteral => "find.literal", DiagnosticStage::SearchRegex => "search.regex", DiagnosticStage::SearchCustomRegex => "search.custom-regex", + DiagnosticStage::SearchLegalForm => "search.legal-form", + DiagnosticStage::SearchTrigger => "search.trigger", DiagnosticStage::SearchLiteral => "search.literal", DiagnosticStage::EntityRegex => "entity.regex", DiagnosticStage::EntityCustomRegex => "entity.custom-regex", + DiagnosticStage::EntityAnchored => "entity.anchored", DiagnosticStage::EntityDenyList => "entity.deny-list", DiagnosticStage::EntityGazetteer => "entity.gazetteer", DiagnosticStage::EntityCountry => "entity.country", + DiagnosticStage::EntityTrigger => "entity.trigger", + DiagnosticStage::EntitySignature => "entity.signature", + DiagnosticStage::EntityLegalForm => "entity.legal-form", + DiagnosticStage::EntityAddressSeed => "entity.address-seed", DiagnosticStage::Merge => "resolution.merge", DiagnosticStage::Boundary => "resolution.boundary", DiagnosticStage::Sanitize => "resolution.sanitize", @@ -628,3 +1681,108 @@ fn operator_name(operator: OperatorType) -> String { } .to_owned() } + +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used)] + + use 
super::{ + BindingPreparedSearchConfig, BindingSearchPattern, ContractError, + prepared_search_package_from_bytes, prepared_search_package_to_bytes, + prepared_search_package_to_compressed_bytes, + }; + + #[test] + fn prepared_search_package_roundtrips_config_and_artifacts() { + let config = BindingPreparedSearchConfig { + literal_patterns: vec![BindingSearchPattern { + kind: String::from("literal"), + pattern: String::from("Acme"), + distance: None, + case_insensitive: None, + whole_words: None, + lazy: None, + prefilter_any: None, + prefilter_case_insensitive: None, + prefilter_regex: None, + }], + ..BindingPreparedSearchConfig::default() + }; + let artifacts = b"prepared-artifacts"; + + let bytes = prepared_search_package_to_bytes(&config, artifacts).unwrap(); + let package = prepared_search_package_from_bytes(&bytes).unwrap(); + + assert_eq!(package.config, config); + assert_eq!(package.artifacts, artifacts); + } + + #[test] + fn prepared_search_package_rejects_invalid_bytes() { + let error = prepared_search_package_from_bytes(b"not-valid").unwrap_err(); + + assert!( + matches!(error, ContractError::InvalidPreparedSearchPackage { .. }), + "invalid package bytes should fail before config construction" + ); + } + + #[test] + fn prepared_search_package_rejects_digest_mismatch() { + let config = BindingPreparedSearchConfig::default(); + let mut bytes = + prepared_search_package_to_bytes(&config, b"artifact").unwrap(); + let last = bytes.last_mut().unwrap(); + *last ^= 0x01; + + let error = prepared_search_package_from_bytes(&bytes).unwrap_err(); + + assert!( + matches!(error, ContractError::InvalidPreparedSearchPackage { .. 
}), + "corrupted package payload should fail digest verification" + ); + } + + #[test] + fn prepared_search_compressed_package_roundtrips_config_and_artifacts() { + let config = BindingPreparedSearchConfig { + literal_patterns: vec![BindingSearchPattern { + kind: String::from("literal"), + pattern: String::from("Acme"), + distance: None, + case_insensitive: None, + whole_words: None, + lazy: None, + prefilter_any: None, + prefilter_case_insensitive: None, + prefilter_regex: None, + }], + ..BindingPreparedSearchConfig::default() + }; + let artifacts = b"prepared-artifacts"; + + let bytes = + prepared_search_package_to_compressed_bytes(&config, artifacts).unwrap(); + let package = prepared_search_package_from_bytes(&bytes).unwrap(); + + assert_eq!(package.config, config); + assert_eq!(package.artifacts, artifacts); + } + + #[test] + fn prepared_search_compressed_package_rejects_digest_mismatch() { + let config = BindingPreparedSearchConfig::default(); + let mut bytes = + prepared_search_package_to_compressed_bytes(&config, b"artifact") + .unwrap(); + let last = bytes.last_mut().unwrap(); + *last ^= 0x01; + + let error = prepared_search_package_from_bytes(&bytes).unwrap_err(); + + assert!( + matches!(error, ContractError::InvalidPreparedSearchPackage { .. 
}), + "corrupted compressed package should fail digest verification" + ); + } +} diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index 55871a1e..e1bdbde2 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -10,7 +10,9 @@ keywords = ["anonymization", "pii", "redaction", "text"] categories = ["text-processing"] [dependencies] -stella-text-search-core = { git = "https://github.com/stella/text-search", rev = "e427c5e8f5c13a0edc8503d24c4d4b34cbf46e8e" } +fancy-regex = "0.16" +regex = "1" +stella-text-search-core = { path = "/Users/sok0/coding_projects/text-search-rust-core/crates/core" } [lints] workspace = true diff --git a/crates/anonymize-core/src/address_seeds.rs b/crates/anonymize-core/src/address_seeds.rs new file mode 100644 index 00000000..00ab348b --- /dev/null +++ b/crates/anonymize-core/src/address_seeds.rs @@ -0,0 +1,1202 @@ +use regex::Regex; + +use crate::processors::PatternSlice; +use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; +use crate::search::{SearchIndex, SearchOptions, SearchPattern}; +use crate::types::{Error, Result, SearchEngine, SearchMatch}; + +const ADDRESS_SCORE_BASE: f64 = 0.5; +const ADDRESS_SCORE_MAX: f64 = 0.95; +const ADDRESS_CLUSTER_MAX_GAP: usize = 150; +const ADDRESS_RIGHT_EXPAND_LIMIT: usize = 200; +const BR_CEP_CONTEXT_WINDOW: usize = 200; +const PLAIN_POSTAL_CONTEXT_WINDOW: usize = 120; +const US_ZIP_CONTEXT_WINDOW: usize = 120; + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct AddressSeedData { + pub boundary_words: Vec, + pub br_cep_cue_words: Vec, +} + +pub(crate) struct PreparedAddressSeedData { + boundary_search: Option, + br_cep_cue_search: Option, + postal_code_re: Regex, + br_cep_shape_re: Regex, + us_zip_plus_four_shape_re: Regex, + us_state_before_zip_re: Regex, + house_number_before_street_re: Regex, + house_number_after_street_re: Regex, + italian_cap_re: Regex, + street_number_re: Regex, +} + +impl 
PreparedAddressSeedData { + pub(crate) fn new(data: AddressSeedData) -> Result { + Ok(Self { + boundary_search: literal_search(data.boundary_words)?, + br_cep_cue_search: literal_search(data.br_cep_cue_words)?, + postal_code_re: compile_regex( + r"(?u)(?:\d{3}\s\d{2}|\d{2}[-‐‑‒–—―]\d{3}|\d{5}|\d{5}[-‐‑‒–—―]\d{3}|\d{5}[-‐‑‒–—―]\d{4})", + )?, + br_cep_shape_re: compile_regex(r"(?u)^\d{5}[-‐‑‒–—―]\d{3}$")?, + us_zip_plus_four_shape_re: compile_regex(r"(?u)^\d{5}[-‐‑‒–—―]\d{4}$")?, + us_state_before_zip_re: compile_regex( + r"(?u)(?:^|[^A-Za-z0-9])(?PA[KLRZ]|C[AOT]|D[CE]|F[LM]|G[AU]|HI|I[ADLN]|K[SY]|LA|M[ADEHINOPST]|N[CDEHJMVY]|O[HKR]|P[AR]|RI|S[CD]|T[NX]|UT|V[AIT]|W[AIVY])\s*,?\s*$", + )?, + house_number_before_street_re: compile_regex( + r"(?u)\b\d{1,6}(?:[-/]\d{1,6})?\s+(?:\p{Lu}\p{L}+[^\S\n\t]+){0,4}$", + )?, + house_number_after_street_re: compile_regex( + r"(?u)^[^\S\n\t]+\d{1,6}(?:[-/]\d{1,6})?\b", + )?, + italian_cap_re: compile_regex(r"(?u)\b(?P\d{5})\s+\p{Lu}\p{L}+")?, + street_number_re: compile_regex( + r"(?u)\b(?P\p{Lu}\p{Ll}{2,})\s+(?P\d{1,5}(?:/\d{1,5})?)\s*[,\n]", + )?, + }) + } + + pub(crate) fn process( + &self, + matches: &[SearchMatch], + street_type_slice: PatternSlice, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + let seeds = self.collect_seeds( + matches, + street_type_slice, + full_text, + existing_entities, + )?; + let clusters = cluster_seeds(&seeds, full_text, existing_entities); + let mut results = Vec::new(); + + for cluster in clusters { + let score = score_cluster(&cluster); + if score < 0.6 { + continue; + } + let span = self.expand_cluster(full_text, &cluster, existing_entities); + let Some(raw_text) = full_text.get(span.start..span.end) else { + continue; + }; + let resolution = resolve_newline_boundary(span.start, raw_text, &cluster); + if resolution == NewlineBoundaryResolution::Drop { + continue; + } + let relative_end = match resolution { + NewlineBoundaryResolution::Keep => raw_text.len(), + 
NewlineBoundaryResolution::Drop => 0, + NewlineBoundaryResolution::Trim { relative_end } => relative_end, + }; + let effective_raw = raw_text.get(..relative_end).unwrap_or_default(); + let leading = effective_raw + .len() + .saturating_sub(effective_raw.trim_start().len()); + let start = span.start.saturating_add(leading); + let end = trim_address_tail( + full_text, + start, + span.start.saturating_add(effective_raw.len()), + ); + let effective_text = full_text.get(start..end).unwrap_or_default(); + if effective_text.len() < 5 || effective_text.len() > 300 { + continue; + } + results.push(PipelineEntity::detected( + u32::try_from(start).unwrap_or(u32::MAX), + u32::try_from(end).unwrap_or(u32::MAX), + "address", + effective_text, + score, + DetectionSource::Regex, + )); + } + + Ok(results) + } + + fn collect_seeds( + &self, + matches: &[SearchMatch], + street_type_slice: PatternSlice, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + let mut seeds = + self.collect_street_type_seeds(matches, street_type_slice, full_text)?; + collect_existing_entity_seeds(&mut seeds, full_text, existing_entities); + self.collect_street_number_seeds(&mut seeds, full_text, existing_entities); + self.collect_postal_code_seeds(&mut seeds, full_text); + self.collect_italian_cap_seeds(&mut seeds, full_text); + seeds.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| left.end.cmp(&right.end)) + .then_with(|| left.kind.cmp(&right.kind)) + }); + Ok(seeds) + } + + fn collect_street_type_seeds( + &self, + matches: &[SearchMatch], + street_type_slice: PatternSlice, + full_text: &str, + ) -> Result> { + let mut seeds = Vec::new(); + for found in matches { + if street_type_slice.local_index(found.pattern()).is_none() { + continue; + } + let Some(seed) = seed_from_match(full_text, found, SeedType::StreetWord)? 
+ else { + continue; + }; + if is_lowercase_street_word_in_prose(full_text, &seed, self) { + continue; + } + seeds.push(seed); + } + Ok(seeds) + } + + fn collect_postal_code_seeds(&self, seeds: &mut Vec, full_text: &str) { + for found in self.postal_code_re.find_iter(full_text) { + let start = found.start(); + let end = found.end(); + let text = found.as_str(); + if !postal_boundaries(full_text, start, end) { + continue; + } + let is_plain_five_digit = is_plain_five_digit_postal_code(text); + if seed_covered(seeds, start, end) && !is_plain_five_digit { + continue; + } + if is_plain_five_digit + && !self.has_plain_postal_context(full_text, start, end, seeds) + { + continue; + } + if self.br_cep_shape_re.is_match(text) + && !self.has_br_cue_nearby(full_text, start, end) + { + continue; + } + if self.us_zip_plus_four_shape_re.is_match(text) { + let context = self.us_zip_plus_four_context(full_text, start, seeds); + if !context.has_context { + continue; + } + if let Some(state_seed) = context.state_seed + && !seed_covered(seeds, state_seed.start, state_seed.end) + { + seeds.push(state_seed); + } + } + seeds.push(Seed { + kind: SeedType::PostalCode, + start, + end, + text: text.to_owned(), + }); + } + } + + fn has_plain_postal_context( + &self, + full_text: &str, + start: usize, + end: usize, + seeds: &[Seed], + ) -> bool { + seeds.iter().any(|seed| { + seed.start.abs_diff(start) <= PLAIN_POSTAL_CONTEXT_WINDOW + && match seed.kind { + SeedType::AddressTrigger => true, + SeedType::City | SeedType::State => { + seed.end >= start && seed.start <= end.saturating_add(4) + || seed.end <= start + && full_text.get(seed.end..start).is_some_and(is_city_zip_gap) + } + SeedType::StreetWord => { + has_house_number_near_street_word(full_text, seed, self) + } + SeedType::PostalCode => false, + } + }) + } + + fn collect_italian_cap_seeds(&self, seeds: &mut Vec, full_text: &str) { + for captures in self.italian_cap_re.captures_iter(full_text) { + let Some(found) = captures.name("cap") 
else { + continue; + }; + let start = found.start(); + let end = found.end(); + if seed_covered(seeds, start, end) { + continue; + } + if !has_nearby_italian_cap_evidence(seeds, start) { + continue; + } + seeds.push(Seed { + kind: SeedType::PostalCode, + start, + end, + text: found.as_str().to_owned(), + }); + } + } + + fn collect_street_number_seeds( + &self, + seeds: &mut Vec, + full_text: &str, + existing_entities: &[PipelineEntity], + ) { + for captures in self.street_number_re.captures_iter(full_text) { + let Some(full) = captures.get(0) else { + continue; + }; + let Some(street) = captures.name("street") else { + continue; + }; + let Some(number) = captures.name("num") else { + continue; + }; + let start = full.start(); + let end = number.end(); + if range_overlaps_non_address(start, end, existing_entities) { + continue; + } + seeds.push(Seed { + kind: SeedType::StreetWord, + start, + end, + text: format!("{} {}", street.as_str(), number.as_str()), + }); + } + } + + fn has_br_cue_nearby( + &self, + full_text: &str, + start: usize, + end: usize, + ) -> bool { + let Some(search) = &self.br_cep_cue_search else { + return false; + }; + let window_start = floor_char_boundary( + full_text, + start.saturating_sub(BR_CEP_CONTEXT_WINDOW), + ); + let window_end = ceil_char_boundary( + full_text, + end + .saturating_add(BR_CEP_CONTEXT_WINDOW) + .min(full_text.len()), + ); + full_text + .get(window_start..window_end) + .is_some_and(|window| search.is_match(window).unwrap_or(false)) + } + + fn us_zip_plus_four_context( + &self, + full_text: &str, + start: usize, + seeds: &[Seed], + ) -> UsZipPlusFourContext { + if let Some(state_seed) = self.us_state_seed_before_zip(full_text, start) { + return UsZipPlusFourContext { + state_seed: Some(state_seed), + has_context: true, + }; + } + + let has_context = seeds.iter().any(|seed| { + seed.start.abs_diff(start) <= US_ZIP_CONTEXT_WINDOW + && match seed.kind { + SeedType::AddressTrigger => true, + SeedType::City => { + seed.end <= 
start + && full_text.get(seed.end..start).is_some_and(is_city_zip_gap) + } + SeedType::StreetWord => { + has_house_number_near_street_word(full_text, seed, self) + } + SeedType::PostalCode | SeedType::State => false, + } + }); + + UsZipPlusFourContext { + state_seed: None, + has_context, + } + } + + fn us_state_seed_before_zip( + &self, + full_text: &str, + start: usize, + ) -> Option { + let window_start = floor_char_boundary(full_text, start.saturating_sub(24)); + let window = full_text.get(window_start..start)?; + let captures = self.us_state_before_zip_re.captures(window)?; + let state = captures.name("state")?; + Some(Seed { + kind: SeedType::State, + start: window_start.saturating_add(state.start()), + end: window_start.saturating_add(state.end()), + text: state.as_str().to_owned(), + }) + } + + fn expand_cluster( + &self, + full_text: &str, + cluster: &SeedCluster, + existing_entities: &[PipelineEntity], + ) -> Span { + let left_bound = nearest_left_non_address(cluster.start, existing_entities); + let left_pos = expand_left(full_text, cluster.start, left_bound); + if !cluster.has_expandable_address_context() { + return Span { + start: left_pos.min(cluster.start), + end: cluster.end, + }; + } + + let right_pos = self.expand_right(full_text, cluster, existing_entities); + Span { + start: left_pos.min(cluster.start), + end: right_pos.max(cluster.end), + } + } + + fn expand_right( + &self, + full_text: &str, + cluster: &SeedCluster, + existing_entities: &[PipelineEntity], + ) -> usize { + let right_pos = cluster.end; + let remaining = full_text.get(right_pos..).unwrap_or_default(); + let mut nearest_boundary = + byte_cap_at_char_boundary(remaining, ADDRESS_RIGHT_EXPAND_LIMIT); + + if let Some(boundary) = self.nearest_boundary_word(full_text, right_pos) { + nearest_boundary = nearest_boundary.min(boundary); + } + if let Some(entity_boundary) = + nearest_right_non_address(right_pos, existing_entities) + { + nearest_boundary = nearest_boundary.min(entity_boundary); 
+ } + if let Some(double_newline) = remaining.find("\n\n") { + nearest_boundary = nearest_boundary.min(double_newline); + } + if let Some(sentence_boundary) = sentence_boundary(remaining) { + nearest_boundary = nearest_boundary.min(sentence_boundary); + } + + let end = right_pos.saturating_add(nearest_boundary); + trim_address_tail(full_text, right_pos, end) + } + + fn nearest_boundary_word( + &self, + full_text: &str, + right_pos: usize, + ) -> Option { + let search = self.boundary_search.as_ref()?; + search + .find_iter(full_text) + .ok()? + .into_iter() + .filter_map(|found| { + let start = usize::try_from(found.start()).ok()?; + (start >= right_pos).then_some(start.saturating_sub(right_pos)) + }) + .min() + } +} + +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] +enum SeedType { + StreetWord, + PostalCode, + City, + State, + AddressTrigger, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct Seed { + kind: SeedType, + start: usize, + end: usize, + text: String, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct SeedCluster { + seeds: Vec, + start: usize, + end: usize, +} + +impl SeedCluster { + fn has_expandable_address_context(&self) -> bool { + self.seeds.iter().any(|seed| { + matches!( + seed.kind, + SeedType::StreetWord | SeedType::PostalCode | SeedType::AddressTrigger + ) + }) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct Span { + start: usize, + end: usize, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct UsZipPlusFourContext { + state_seed: Option, + has_context: bool, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum NewlineBoundaryResolution { + Keep, + Drop, + Trim { relative_end: usize }, +} + +fn literal_search(patterns: Vec) -> Result> { + let patterns = patterns + .into_iter() + .filter(|pattern| !pattern.is_empty()) + .map(|pattern| SearchPattern::LiteralWithOptions { + pattern, + case_insensitive: Some(true), + whole_words: Some(true), + }) + .collect::>(); + if patterns.is_empty() { + return 
Ok(None); + } + Ok(Some(SearchIndex::new(patterns, SearchOptions::default())?)) +} + +fn compile_regex(pattern: &str) -> Result { + Regex::new(pattern).map_err(|error| Error::Search { + engine: SearchEngine::Regex, + reason: error.to_string(), + }) +} + +fn seed_from_match( + full_text: &str, + found: &SearchMatch, + kind: SeedType, +) -> Result> { + let start = usize::try_from(found.start()).map_err(|_| { + Error::PatternIndexNotAddressable { + pattern: found.pattern(), + } + })?; + let end = usize::try_from(found.end()).map_err(|_| { + Error::PatternIndexNotAddressable { + pattern: found.pattern(), + } + })?; + let Some(text) = full_text.get(start..end) else { + return Ok(None); + }; + Ok(Some(Seed { + kind, + start, + end, + text: text.to_owned(), + })) +} + +fn collect_existing_entity_seeds( + seeds: &mut Vec, + full_text: &str, + existing_entities: &[PipelineEntity], +) { + for entity in existing_entities { + if entity.label != "address" { + continue; + } + if entity.source_detail == Some(SourceDetail::CustomDenyList) { + continue; + } + if overlaps_non_address(entity, existing_entities) { + continue; + } + let Some(kind) = kind_for_existing_entity(entity) else { + continue; + }; + if let Some(seed) = postal_seed_from_existing_address(full_text, entity) { + seeds.push(seed); + } + seeds.push(Seed { + kind, + start: usize::try_from(entity.start).unwrap_or(usize::MAX), + end: usize::try_from(entity.end).unwrap_or(usize::MAX), + text: entity.text.clone(), + }); + } +} + +fn postal_seed_from_existing_address( + full_text: &str, + entity: &PipelineEntity, +) -> Option { + if entity.source != DetectionSource::DenyList { + return None; + } + let mut start = usize::try_from(entity.start).ok()?; + let entity_end = usize::try_from(entity.end).ok()?; + while let Some((previous_start, ch)) = previous_char(full_text, start) { + if !ch.is_ascii_digit() { + break; + } + start = previous_start; + } + + let mut end = start; + while let Some((next_start, ch)) = 
next_char(full_text, end) { + if !ch.is_ascii_digit() { + break; + } + end = next_start.saturating_add(ch.len_utf8()); + } + if end > entity_end { + return None; + } + let text = full_text.get(start..end)?; + if !is_plain_five_digit_postal_code(text) { + return None; + } + Some(Seed { + kind: SeedType::PostalCode, + start, + end, + text: text.to_owned(), + }) +} + +fn kind_for_existing_entity(entity: &PipelineEntity) -> Option { + match entity.source { + DetectionSource::DenyList => Some(SeedType::City), + DetectionSource::Trigger if starts_with_digit(&entity.text) => { + Some(SeedType::PostalCode) + } + DetectionSource::Trigger => Some(SeedType::AddressTrigger), + _ => None, + } +} + +fn starts_with_digit(text: &str) -> bool { + text.chars().next().is_some_and(|ch| ch.is_ascii_digit()) +} + +fn is_lowercase_street_word_in_prose( + full_text: &str, + seed: &Seed, + data: &PreparedAddressSeedData, +) -> bool { + starts_lowercase(&seed.text) + && full_text + .get(seed.end..) + .is_some_and(starts_with_whitespace_then_lowercase) + && !has_house_number_near_street_word(full_text, seed, data) +} + +fn starts_lowercase(text: &str) -> bool { + text.chars().next().is_some_and(char::is_lowercase) +} + +fn starts_with_whitespace_then_lowercase(text: &str) -> bool { + let mut saw_whitespace = false; + for ch in text.chars() { + if ch.is_whitespace() { + saw_whitespace = true; + continue; + } + return saw_whitespace && ch.is_lowercase(); + } + false +} + +fn has_house_number_near_street_word( + full_text: &str, + seed: &Seed, + data: &PreparedAddressSeedData, +) -> bool { + if seed.text.chars().any(|ch| ch.is_ascii_digit()) { + return true; + } + + let before_start = + floor_char_boundary(full_text, seed.start.saturating_sub(50)); + let before = full_text.get(before_start..seed.start).unwrap_or_default(); + if data.house_number_before_street_re.is_match(before) { + return true; + } + + let after_end = ceil_char_boundary( + full_text, + 
seed.end.saturating_add(24).min(full_text.len()), + ); + let after = full_text.get(seed.end..after_end).unwrap_or_default(); + data.house_number_after_street_re.is_match(after) +} + +fn postal_boundaries(full_text: &str, start: usize, end: usize) -> bool { + let before_ok = previous_char(full_text, start) + .is_none_or(|(_, ch)| !is_postal_adjacent(ch)); + let after_ok = + next_char(full_text, end).is_none_or(|(_, ch)| !is_postal_adjacent(ch)); + before_ok && after_ok +} + +fn is_postal_adjacent(ch: char) -> bool { + ch.is_alphanumeric() || ch == '_' || is_dash(ch) +} + +fn is_plain_five_digit_postal_code(text: &str) -> bool { + text.len() == 5 && text.chars().all(|ch| ch.is_ascii_digit()) +} + +const fn is_dash(ch: char) -> bool { + matches!(ch, '-' | '‐' | '‑' | '‒' | '–' | '—' | '―') +} + +fn seed_covered(seeds: &[Seed], start: usize, end: usize) -> bool { + seeds + .iter() + .any(|seed| seed.start <= start && seed.end >= end) +} + +fn has_nearby_italian_cap_evidence(seeds: &[Seed], start: usize) -> bool { + seeds.iter().any(|seed| { + seed.start.abs_diff(start) <= 80 + && match seed.kind { + SeedType::AddressTrigger | SeedType::City | SeedType::PostalCode => { + true + } + SeedType::StreetWord => seed.text.to_lowercase() != "via", + SeedType::State => false, + } + }) +} + +fn is_city_zip_gap(text: &str) -> bool { + !text.is_empty() && text.chars().all(|ch| ch.is_whitespace() || ch == ',') +} + +fn cluster_seeds( + seeds: &[Seed], + full_text: &str, + existing_entities: &[PipelineEntity], +) -> Vec { + let Some(first) = seeds.first() else { + return Vec::new(); + }; + + let mut clusters = Vec::new(); + let mut current = SeedCluster { + seeds: vec![first.clone()], + start: first.start, + end: first.end, + }; + + for seed in seeds.iter().skip(1) { + let gap_ok = seed.start.saturating_sub(current.end) + <= ADDRESS_CLUSTER_MAX_GAP + && !has_cluster_barrier( + full_text, + current.end, + seed.start, + existing_entities, + ); + if gap_ok { + 
current.seeds.push(seed.clone()); + current.end = current.end.max(seed.end); + continue; + } + clusters.push(current); + current = SeedCluster { + seeds: vec![seed.clone()], + start: seed.start, + end: seed.end, + }; + } + clusters.push(current); + clusters +} + +fn has_cluster_barrier( + full_text: &str, + gap_start: usize, + gap_end: usize, + existing_entities: &[PipelineEntity], +) -> bool { + full_text + .get(gap_start..gap_end) + .is_some_and(has_paragraph_break) + || existing_entities.iter().any(|entity| { + non_address_label(&entity.label) + && usize::try_from(entity.start) + .is_ok_and(|start| start >= gap_start && start < gap_end) + && usize::try_from(entity.end).is_ok_and(|end| end > gap_start) + }) +} + +fn overlaps_non_address( + entity: &PipelineEntity, + existing_entities: &[PipelineEntity], +) -> bool { + let start = usize::try_from(entity.start).unwrap_or(usize::MAX); + let end = usize::try_from(entity.end).unwrap_or(usize::MAX); + range_overlaps_non_address(start, end, existing_entities) +} + +fn range_overlaps_non_address( + start: usize, + end: usize, + existing_entities: &[PipelineEntity], +) -> bool { + existing_entities.iter().any(|existing| { + non_address_label(&existing.label) + && usize::try_from(existing.end).is_ok_and(|existing_end| { + existing_end > start + && usize::try_from(existing.start) + .is_ok_and(|existing_start| existing_start < end) + }) + }) +} + +fn has_paragraph_break(text: &str) -> bool { + let mut saw_newline = false; + for ch in text.chars() { + if ch == '\n' { + if saw_newline { + return true; + } + saw_newline = true; + continue; + } + if !ch.is_whitespace() { + saw_newline = false; + } + } + false +} + +fn score_cluster(cluster: &SeedCluster) -> f64 { + let mut has_street_word = false; + let mut has_postal_code = false; + let mut has_city = false; + let mut has_state = false; + let mut has_address_trigger = false; + + for seed in &cluster.seeds { + match seed.kind { + SeedType::StreetWord => has_street_word = true, + 
SeedType::PostalCode => has_postal_code = true, + SeedType::City => has_city = true, + SeedType::State => has_state = true, + SeedType::AddressTrigger => has_address_trigger = true, + } + } + + let type_count = [ + has_street_word, + has_postal_code, + has_city, + has_state, + has_address_trigger, + ] + .into_iter() + .filter(|seen| *seen) + .count(); + if type_count < 2 { + return 0.0; + } + + let mut score = ADDRESS_SCORE_BASE; + if has_postal_code { + score += 0.15; + } + if has_city { + score += 0.15; + } + if has_state { + score += 0.15; + } + if has_street_word { + score += 0.15; + } + if has_address_trigger { + score += 0.1; + } + score.min(ADDRESS_SCORE_MAX) +} + +fn nearest_left_non_address( + start: usize, + existing_entities: &[PipelineEntity], +) -> usize { + existing_entities + .iter() + .filter(|entity| non_address_label(&entity.label)) + .filter_map(|entity| { + let end = usize::try_from(entity.end).ok()?; + (end <= start).then_some(end) + }) + .max() + .unwrap_or(0) +} + +fn nearest_right_non_address( + right_pos: usize, + existing_entities: &[PipelineEntity], +) -> Option { + existing_entities + .iter() + .filter(|entity| non_address_label(&entity.label)) + .filter_map(|entity| { + let start = usize::try_from(entity.start).ok()?; + let offset = start.saturating_sub(right_pos); + (offset > 0).then_some(offset) + }) + .min() +} + +fn non_address_label(label: &str) -> bool { + matches!( + label, + "registration number" + | "tax identification number" + | "national identification number" + | "social security number" + | "birth number" + | "identity card number" + | "date" + | "date of birth" + | "person" + | "bank account number" + | "email address" + | "phone number" + | "organization" + | "iban" + ) +} + +fn expand_left(full_text: &str, start: usize, left_bound: usize) -> usize { + let mut left_pos = start; + while left_pos > left_bound { + let Some((word_start, word_end, word)) = + word_before_for_address(full_text, left_pos, left_bound) + else { + 
break; + }; + if word.len() < 2 + || !starts_uppercase_or_digit(word) + || is_left_address_label(word) + { + break; + } + if full_text + .get(word_start..left_pos) + .is_some_and(|slice| slice.contains('\n')) + { + break; + } + left_pos = word_start; + if word_end <= left_bound { + break; + } + } + left_pos +} + +fn word_before_for_address( + text: &str, + pos: usize, + left_bound: usize, +) -> Option<(usize, usize, &str)> { + let mut end = pos; + while end > left_bound { + let Some((prev_start, ch)) = previous_char(text, end) else { + break; + }; + if ch == ' ' || ch == ',' { + end = prev_start; + continue; + } + break; + } + if end <= left_bound { + return None; + } + + let mut start = end; + while start > left_bound { + let Some((prev_start, ch)) = previous_char(text, start) else { + break; + }; + if ch.is_whitespace() { + break; + } + start = prev_start; + } + let word = text.get(start..end)?; + Some((start, end, word)) +} + +fn starts_uppercase_or_digit(text: &str) -> bool { + text + .chars() + .next() + .is_some_and(|ch| ch.is_uppercase() || ch.is_ascii_digit()) +} + +fn is_left_address_label(text: &str) -> bool { + text.ends_with(':') +} + +fn trim_address_tail(full_text: &str, start: usize, mut end: usize) -> usize { + while end > start { + let Some((prev_start, ch)) = previous_char(full_text, end) else { + break; + }; + if is_address_trailing_trim(ch) { + end = prev_start; + continue; + } + break; + } + end +} + +fn sentence_boundary(text: &str) -> Option { + let mut iter = text.char_indices().peekable(); + while let Some((index, ch)) = iter.next() { + if !matches!(ch, '.' | '!' 
| '?') { + continue; + } + let mut saw_whitespace = false; + while let Some((_, next)) = iter.peek().copied() { + if !next.is_whitespace() { + break; + } + saw_whitespace = true; + iter.next(); + } + let Some((_, next)) = iter.peek().copied() else { + return Some(index); + }; + if saw_whitespace && (next.is_uppercase() || next.is_ascii_digit()) { + return Some(index); + } + } + None +} + +const fn is_address_trailing_trim(ch: char) -> bool { + ch.is_whitespace() + || matches!( + ch, + ',' + | ';' + | ':' + | '(' + | '[' + | '{' + | '"' + | '\'' + | '“' + | '”' + | '‘' + | '’' + | '′' + ) +} + +fn resolve_newline_boundary( + span_start: usize, + text: &str, + cluster: &SeedCluster, +) -> NewlineBoundaryResolution { + let mut newline_positions = text.match_indices('\n').map(|(index, _)| index); + let Some(relative_newline) = newline_positions.next() else { + return NewlineBoundaryResolution::Keep; + }; + if newline_positions.next().is_some() { + return NewlineBoundaryResolution::Drop; + } + + let newline_abs = span_start.saturating_add(relative_newline); + let mut street_above = false; + let mut street_below = false; + let mut destination_above = false; + let mut destination_below = false; + + for seed in &cluster.seeds { + let is_above = seed.end <= newline_abs; + let is_street = matches!(seed.kind, SeedType::StreetWord); + let is_destination = + matches!(seed.kind, SeedType::PostalCode | SeedType::City); + if is_street && is_above { + street_above = true; + } + if is_street && !is_above { + street_below = true; + } + if is_destination && is_above { + destination_above = true; + } + if is_destination && !is_above { + destination_below = true; + } + } + + if (street_above && destination_below) || (street_below && destination_above) + { + return NewlineBoundaryResolution::Keep; + } + if street_above && destination_above { + return NewlineBoundaryResolution::Trim { + relative_end: relative_newline, + }; + } + NewlineBoundaryResolution::Drop +} + +fn 
byte_cap_at_char_boundary(text: &str, cap: usize) -> usize { + if cap >= text.len() { + return text.len(); + } + floor_char_boundary(text, cap) +} + +fn floor_char_boundary(text: &str, mut byte: usize) -> usize { + byte = byte.min(text.len()); + while byte > 0 && !text.is_char_boundary(byte) { + byte = byte.saturating_sub(1); + } + byte +} + +fn ceil_char_boundary(text: &str, mut byte: usize) -> usize { + byte = byte.min(text.len()); + while byte < text.len() && !text.is_char_boundary(byte) { + byte = byte.saturating_add(1); + } + byte +} + +fn previous_char(text: &str, byte: usize) -> Option<(usize, char)> { + text.get(..byte)?.char_indices().next_back() +} + +fn next_char(text: &str, byte: usize) -> Option<(usize, char)> { + let suffix = text.get(byte..)?; + let (relative, ch) = suffix.char_indices().next()?; + Some((byte.saturating_add(relative), ch)) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn entity( + full_text: &str, + text: &str, + label: &str, + source: DetectionSource, + ) -> Result { + let Some(start) = full_text.find(text) else { + return Err(Error::InvalidStaticData { + field: "address_seed_test_fixture", + reason: String::from("fixture text should exist"), + }); + }; + let end = start.saturating_add(text.len()); + Ok(PipelineEntity::detected( + u32::try_from(start).map_err(|_| Error::InvalidStaticData { + field: "address_seed_test_fixture", + reason: String::from("fixture start should fit u32"), + })?, + u32::try_from(end).map_err(|_| Error::InvalidStaticData { + field: "address_seed_test_fixture", + reason: String::from("fixture end should fit u32"), + })?, + label, + text, + 0.9, + source, + )) + } + + #[test] + fn expands_compound_street_with_plain_postal_city() -> Result<()> { + let data = PreparedAddressSeedData::new(AddressSeedData { + boundary_words: vec![String::from("steuer-id")], + br_cep_cue_words: Vec::new(), + })?; + let full_text = concat!( + "(2) Frau Karoline M. Brentano,\n", + " geboren am 09. 
Juli 1982,\n", + " wohnhaft Bismarckring 18, 65183 Wiesbaden,\n", + " Steuer-ID: 78 123 456 789", + ); + let existing = vec![ + entity( + full_text, + "Frau Karoline M. Brentano", + "person", + DetectionSource::DenyList, + )?, + entity( + full_text, + "09. Juli 1982", + "date of birth", + DetectionSource::Trigger, + )?, + entity( + full_text, + "5183 Wiesbaden", + "address", + DetectionSource::DenyList, + )?, + ]; + + let result = + data.process(&[], PatternSlice::default(), full_text, &existing)?; + + assert!( + result + .iter() + .any(|entity| entity.text == "Bismarckring 18, 65183 Wiesbaden"), + "address seed entities: {result:?}", + ); + Ok(()) + } +} diff --git a/crates/anonymize-core/src/anchored.rs b/crates/anonymize-core/src/anchored.rs new file mode 100644 index 00000000..c91040f9 --- /dev/null +++ b/crates/anonymize-core/src/anchored.rs @@ -0,0 +1,158 @@ +use crate::resolution::PipelineEntity; +use crate::search::{SearchIndex, SearchOptions, SearchPattern}; +use crate::types::{Result, SearchMatch}; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) struct AnchorSpan { + pub start: usize, + pub end: usize, +} + +pub(crate) struct AnchorTerm { + text: String, + case_insensitive: bool, + whole_words: bool, +} + +impl AnchorTerm { + pub(crate) const fn new( + text: String, + case_insensitive: bool, + whole_words: bool, + ) -> Self { + Self { + text, + case_insensitive, + whole_words, + } + } + + pub(crate) const fn word_case_insensitive(text: String) -> Self { + Self { + text, + case_insensitive: true, + whole_words: true, + } + } + + pub(crate) const fn word_case_sensitive(text: String) -> Self { + Self { + text, + case_insensitive: false, + whole_words: true, + } + } + + pub(crate) const fn symbol(text: String) -> Self { + Self { + text, + case_insensitive: false, + whole_words: false, + } + } +} + +pub(crate) trait AnchoredRule { + fn anchor_terms(&self) -> Vec; + + fn extract( + &self, + full_text: &str, + anchor: AnchorSpan, + ) -> Result>; +} + 
+pub(crate) struct AnchoredExtractor { + search: SearchIndex, + rule: R, +} + +impl AnchoredExtractor { + pub(crate) fn new(rule: R) -> Result> { + let anchors = rule.anchor_terms(); + if anchors.is_empty() { + return Ok(None); + } + + Ok(Some(Self { + search: SearchIndex::new( + anchors + .into_iter() + .map(|anchor| SearchPattern::LiteralWithOptions { + pattern: anchor.text, + case_insensitive: Some(anchor.case_insensitive), + whole_words: Some(anchor.whole_words), + }) + .collect(), + SearchOptions::default(), + )?, + rule, + })) + } + + pub(crate) fn extract(&self, full_text: &str) -> Result> { + let mut entities = Vec::new(); + for found in self.search.find_iter(full_text)? { + let anchor = anchor_span(&found); + entities.extend(self.rule.extract(full_text, anchor)?); + } + Ok(select_anchored_entities(entities)) + } +} + +fn anchor_span(found: &SearchMatch) -> AnchorSpan { + AnchorSpan { + start: usize::try_from(found.start()).unwrap_or(usize::MAX), + end: usize::try_from(found.end()).unwrap_or(usize::MAX), + } +} + +fn select_anchored_entities( + mut entities: Vec, +) -> Vec { + if entities.len() < 2 { + return entities; + } + + entities.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| right.end.cmp(&left.end)) + .then_with(|| left.label.cmp(&right.label)) + }); + + let mut selected = Vec::new(); + for entity in entities { + if selected.iter().any(|existing| { + same_bucket(existing, &entity) && contains(existing, &entity) + }) { + continue; + } + + selected.retain(|existing| { + !same_bucket(&entity, existing) || !contains(&entity, existing) + }); + selected.push(entity); + } + + selected.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| left.end.cmp(&right.end)) + .then_with(|| left.label.cmp(&right.label)) + }); + selected +} + +fn same_bucket(left: &PipelineEntity, right: &PipelineEntity) -> bool { + left.label == right.label + && left.source == right.source + && left.source_detail == 
right.source_detail + && left.kind == right.kind +} + +const fn contains(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { + outer.start <= inner.start && outer.end >= inner.end +} diff --git a/crates/anonymize-core/src/artifact_bytes.rs b/crates/anonymize-core/src/artifact_bytes.rs new file mode 100644 index 00000000..350702dd --- /dev/null +++ b/crates/anonymize-core/src/artifact_bytes.rs @@ -0,0 +1,122 @@ +use crate::types::{Error, Result}; + +pub(crate) struct ArtifactWriter { + bytes: Vec, +} + +impl ArtifactWriter { + pub(crate) fn new(header: [u8; 8], version: u32) -> Self { + let mut bytes = Vec::new(); + bytes.extend_from_slice(&header); + write_u32(&mut bytes, version); + Self { bytes } + } + + pub(crate) fn write_len( + &mut self, + len: usize, + field: &'static str, + ) -> Result<()> { + write_u32(&mut self.bytes, checked_len_u32(len, field)?); + Ok(()) + } + + pub(crate) fn write_len_prefixed_bytes( + &mut self, + field: &'static str, + bytes: &[u8], + ) -> Result<()> { + self.write_len(bytes.len(), field)?; + self.bytes.extend_from_slice(bytes); + Ok(()) + } + + pub(crate) fn into_bytes(self) -> Vec { + self.bytes + } +} + +pub(crate) struct ArtifactReader<'a> { + bytes: &'a [u8], + offset: usize, + field: &'static str, +} + +impl<'a> ArtifactReader<'a> { + pub(crate) fn new( + bytes: &'a [u8], + header: [u8; 8], + version: u32, + field: &'static str, + ) -> Result { + let mut reader = Self { + bytes, + offset: 0, + field, + }; + let actual_header = reader.read_bytes(header.len())?; + if actual_header != header { + return Err(invalid_artifact(field, "unexpected header")); + } + let actual_version = reader.read_u32()?; + if actual_version != version { + return Err(invalid_artifact(field, "unsupported version")); + } + Ok(reader) + } + + pub(crate) fn read_usize(&mut self) -> Result { + usize::try_from(self.read_u32()?) 
+ .map_err(|_| invalid_artifact(self.field, "length is not addressable")) + } + + pub(crate) fn read_len_prefixed_bytes(&mut self) -> Result<&'a [u8]> { + let len = self.read_usize()?; + self.read_bytes(len) + } + + pub(crate) fn finish(&self) -> Result<()> { + if self.offset == self.bytes.len() { + return Ok(()); + } + Err(invalid_artifact(self.field, "trailing data")) + } + + fn read_u32(&mut self) -> Result { + let bytes = self.read_bytes(4)?; + let array = <[u8; 4]>::try_from(bytes) + .map_err(|_| invalid_artifact(self.field, "malformed u32"))?; + Ok(u32::from_le_bytes(array)) + } + + fn read_bytes(&mut self, len: usize) -> Result<&'a [u8]> { + let end = self + .offset + .checked_add(len) + .ok_or_else(|| invalid_artifact(self.field, "length overflow"))?; + let bytes = self + .bytes + .get(self.offset..end) + .ok_or_else(|| invalid_artifact(self.field, "truncated data"))?; + self.offset = end; + Ok(bytes) + } +} + +fn write_u32(bytes: &mut Vec, value: u32) { + bytes.extend_from_slice(&value.to_le_bytes()); +} + +fn checked_len_u32(len: usize, field: &'static str) -> Result { + u32::try_from(len).map_err(|_| Error::InvalidStaticData { + field, + reason: format!("length exceeds u32: {len}"), + }) +} + +fn invalid_artifact(field: &'static str, reason: impl Into) -> Error { + Error::InvalidStaticData { + field, + reason: reason.into(), + } +} diff --git a/crates/anonymize-core/src/dates.rs b/crates/anonymize-core/src/dates.rs new file mode 100644 index 00000000..cf3126e0 --- /dev/null +++ b/crates/anonymize-core/src/dates.rs @@ -0,0 +1,459 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::anchored::{ + AnchorSpan, AnchorTerm, AnchoredExtractor, AnchoredRule, +}; +use crate::resolution::{DetectionSource, PipelineEntity}; +use crate::types::Result; + +const DATE_LABEL: &str = "date"; +const DATE_SCORE: f64 = 1.0; + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct DateData { + pub month_names_by_language: BTreeMap>, + pub 
year_words_by_language: BTreeMap>, +} + +pub(crate) struct PreparedDateData { + extractor: AnchoredExtractor, +} + +impl PreparedDateData { + pub(crate) fn new(data: &DateData) -> Result> { + AnchoredExtractor::new(DateRule::new(data)) + .map(|extractor| extractor.map(|extractor| Self { extractor })) + } + + pub(crate) fn process(&self, full_text: &str) -> Result> { + self.extractor.extract(full_text) + } +} + +struct DateRule { + month_names: BTreeSet, + year_words: BTreeSet, +} + +impl DateRule { + fn new(data: &DateData) -> Self { + Self { + month_names: unique_word_set(&data.month_names_by_language, 3), + year_words: unique_word_set(&data.year_words_by_language, 2), + } + } +} + +impl AnchoredRule for DateRule { + fn anchor_terms(&self) -> Vec { + self + .month_names + .iter() + .cloned() + .map(AnchorTerm::word_case_insensitive) + .chain( + self + .year_words + .iter() + .cloned() + .map(AnchorTerm::word_case_insensitive), + ) + .collect() + } + + fn extract( + &self, + full_text: &str, + anchor: AnchorSpan, + ) -> Result> { + let span = word_span(full_text, anchor); + let clean = str_slice(full_text, span.start, span.end) + .unwrap_or_default() + .trim_end_matches('.') + .to_lowercase(); + let mut spans = Vec::new(); + if self.month_names.contains(&clean) { + spans.extend( + date_spans_for_month(full_text, span.start, span.end) + .into_iter() + .map(|(start, end)| (start, end, DetectionSource::Regex)), + ); + } + if self.year_words.contains(&clean) + && let Some(year) = year_after_word_span(full_text, span.end) + { + spans.push((year.0, year.1, DetectionSource::Trigger)); + } + + Ok( + spans + .into_iter() + .filter_map(|(start, end, source)| { + date_entity(full_text, start, end, source) + }) + .collect(), + ) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct Span { + start: usize, + end: usize, +} + +fn unique_word_set( + values_by_language: &BTreeMap>, + min_chars: usize, +) -> BTreeSet { + let mut seen = BTreeSet::new(); + for names in 
values_by_language.values() { + for name in names { + let clean = name.trim().trim_end_matches('.').to_lowercase(); + if clean.chars().count() >= min_chars { + seen.insert(clean); + } + } + } + seen +} + +fn word_span(full_text: &str, anchor: AnchorSpan) -> Span { + let mut end = anchor.end.min(full_text.len()); + if starts_with_at(full_text, end, ".") { + end = end.saturating_add(1); + } + Span { + start: anchor.start, + end, + } +} + +fn year_after_word_span(text: &str, word_end: usize) -> Option<(usize, usize)> { + let after_word = skip_horizontal_ws(text, word_end); + parse_year_forward(text, after_word) +} + +fn date_spans_for_month( + full_text: &str, + month_start: usize, + month_end: usize, +) -> Vec<(usize, usize)> { + let mut spans = Vec::new(); + + if let Some(span) = day_month_year_span(full_text, month_start, month_end) { + spans.push(span); + } + if let Some(span) = ordinal_day_month_span(full_text, month_start, month_end) + { + spans.push(span); + } + if let Some(span) = de_day_month_year_span(full_text, month_start, month_end) + { + spans.push(span); + } + if let Some(span) = month_day_year_span(full_text, month_start, month_end) { + spans.push(span); + } + if let Some(span) = month_year_span(full_text, month_start, month_end) { + spans.push(span); + } + if let Some(span) = year_month_day_span(full_text, month_start, month_end) { + spans.push(span); + } + + spans +} + +fn date_entity( + full_text: &str, + start: usize, + end: usize, + source: DetectionSource, +) -> Option { + let start_u32 = u32::try_from(start).unwrap_or(u32::MAX); + let end_u32 = u32::try_from(end).unwrap_or(u32::MAX); + Some(PipelineEntity::detected( + start_u32, + end_u32, + DATE_LABEL, + str_slice(full_text, start, end)?.to_owned(), + DATE_SCORE, + source, + )) +} + +fn day_month_year_span( + text: &str, + month_start: usize, + month_end: usize, +) -> Option<(usize, usize)> { + let day = day_before_month(text, month_start)?; + let after_month = skip_horizontal_ws(text, 
month_end); + let year = parse_year_forward(text, after_month)?; + let end = parse_time_suffix(text, year.1).unwrap_or(year.1); + Some((day.0, end)) +} + +fn ordinal_day_month_span( + text: &str, + month_start: usize, + month_end: usize, +) -> Option<(usize, usize)> { + let day = ordinal_day_before_month(text, month_start)?; + let after_month = skip_horizontal_ws(text, month_end); + let end = parse_year_forward(text, after_month).map_or(month_end, |year| { + parse_time_suffix(text, year.1).unwrap_or(year.1) + }); + Some((day.0, end)) +} + +fn de_day_month_year_span( + text: &str, + month_start: usize, + month_end: usize, +) -> Option<(usize, usize)> { + let day = de_day_before_month(text, month_start)?; + let after_month = skip_horizontal_ws(text, month_end); + let after_de = parse_de_prefix(text, after_month).unwrap_or(after_month); + let year = parse_year_forward(text, after_de)?; + Some((day.0, year.1)) +} + +fn month_day_year_span( + text: &str, + month_start: usize, + month_end: usize, +) -> Option<(usize, usize)> { + let after_month = skip_horizontal_ws(text, month_end); + let day = parse_digits_forward(text, after_month, 1, 2)?; + let after_day = skip_date_year_separator(text, day.1); + if let Some(year) = parse_year_forward(text, after_day) { + return Some((month_start, year.1)); + } + right_date_boundary(text, day.1).then_some((month_start, day.1)) +} + +fn month_year_span( + text: &str, + month_start: usize, + month_end: usize, +) -> Option<(usize, usize)> { + let after_month = skip_horizontal_ws(text, month_end); + let year = parse_year_forward(text, after_month)?; + Some((month_start, year.1)) +} + +fn year_month_day_span( + text: &str, + month_start: usize, + month_end: usize, +) -> Option<(usize, usize)> { + let before_month = skip_horizontal_ws_backward(text, month_start); + if !ends_with_before(text, before_month, ".") { + return None; + } + let year_end = before_month.saturating_sub(1); + let year = parse_digits_backward(text, year_end, 4, 4)?; + 
if !left_date_boundary(text, year.0) { + return None; + } + + let after_month = skip_horizontal_ws(text, month_end); + let day = parse_digits_forward(text, after_month, 1, 2)?; + let end = if starts_with_at(text, day.1, ".") { + day.1.saturating_add(1) + } else { + day.1 + }; + Some((year.0, end)) +} + +fn day_before_month(text: &str, month_start: usize) -> Option<(usize, usize)> { + let mut end = skip_horizontal_ws_backward(text, month_start); + if end == month_start { + return None; + } + if ends_with_before(text, end, ".") { + end = end.saturating_sub(1); + } + let day = parse_digits_backward(text, end, 1, 2)?; + left_date_boundary(text, day.0).then_some(day) +} + +fn ordinal_day_before_month( + text: &str, + month_start: usize, +) -> Option<(usize, usize)> { + let end = skip_horizontal_ws_backward(text, month_start); + if end == month_start { + return None; + } + for suffix in ["st", "nd", "rd", "th"] { + if !ends_with_before(text, end, suffix) { + continue; + } + let day_end = end.saturating_sub(suffix.len()); + let day = parse_digits_backward(text, day_end, 1, 2)?; + if left_date_boundary(text, day.0) { + return Some((day.0, end)); + } + } + None +} + +fn de_day_before_month( + text: &str, + month_start: usize, +) -> Option<(usize, usize)> { + let end = skip_horizontal_ws_backward(text, month_start); + let de_start = end.checked_sub(2)?; + if !str_slice(text, de_start, end)?.eq_ignore_ascii_case("de") { + return None; + } + let day_end = skip_horizontal_ws_backward(text, de_start); + let day = parse_digits_backward(text, day_end, 1, 2)?; + left_date_boundary(text, day.0).then_some((day.0, end)) +} + +fn parse_de_prefix(text: &str, index: usize) -> Option { + let end = index.saturating_add(2); + if !str_slice(text, index, end)?.eq_ignore_ascii_case("de") { + return None; + } + Some(skip_horizontal_ws(text, end)) +} + +fn parse_year_forward(text: &str, index: usize) -> Option<(usize, usize)> { + let year = parse_digits_forward(text, index, 4, 4)?; + 
right_date_boundary(text, year.1).then_some(year) +} + +fn parse_digits_forward( + text: &str, + index: usize, + min: usize, + max: usize, +) -> Option<(usize, usize)> { + let mut end = index; + let mut count = 0usize; + for ch in str_tail(text, index)?.chars() { + if !ch.is_ascii_digit() || count == max { + break; + } + end = end.saturating_add(ch.len_utf8()); + count = count.saturating_add(1); + } + (count >= min).then_some((index, end)) +} + +fn parse_digits_backward( + text: &str, + index: usize, + min: usize, + max: usize, +) -> Option<(usize, usize)> { + let mut start = index; + let mut count = 0usize; + for (char_start, ch) in str_head(text, index)?.char_indices().rev() { + if !ch.is_ascii_digit() || count == max { + break; + } + start = char_start; + count = count.saturating_add(1); + } + (count >= min).then_some((start, index)) +} + +fn parse_time_suffix(text: &str, index: usize) -> Option { + let start = skip_horizontal_ws(text, index); + let hour = parse_digits_forward(text, start, 1, 2)?; + if !starts_with_at(text, hour.1, ":") { + return None; + } + let minute = parse_digits_forward(text, hour.1.saturating_add(1), 2, 2)?; + if !starts_with_at(text, minute.1, ":") { + return Some(minute.1); + } + parse_digits_forward(text, minute.1.saturating_add(1), 2, 2) + .map(|second| second.1) +} + +fn skip_date_year_separator(text: &str, index: usize) -> usize { + if starts_with_at(text, index, ",") { + return skip_any_ws(text, index.saturating_add(1)); + } + skip_horizontal_ws(text, index) +} + +fn skip_any_ws(text: &str, mut index: usize) -> usize { + while let Some(ch) = + str_tail(text, index).and_then(|value| value.chars().next()) + { + if !ch.is_whitespace() { + break; + } + index = index.saturating_add(ch.len_utf8()); + } + index +} + +fn skip_horizontal_ws(text: &str, mut index: usize) -> usize { + while let Some(ch) = + str_tail(text, index).and_then(|value| value.chars().next()) + { + if ch == '\n' || ch == '\r' || !ch.is_whitespace() { + break; + } + 
index = index.saturating_add(ch.len_utf8()); + } + index +} + +fn skip_horizontal_ws_backward(text: &str, mut index: usize) -> usize { + while let Some((char_start, ch)) = + str_head(text, index).and_then(|value| value.char_indices().next_back()) + { + if ch == '\n' || ch == '\r' || !ch.is_whitespace() { + break; + } + index = char_start; + } + index +} + +fn left_date_boundary(text: &str, index: usize) -> bool { + str_head(text, index) + .and_then(|value| value.chars().next_back()) + .is_none_or(|ch| !is_identifier_char(ch)) +} + +fn right_date_boundary(text: &str, index: usize) -> bool { + str_tail(text, index) + .and_then(|value| value.chars().next()) + .is_none_or(|ch| ch.is_whitespace() || ".,;!?)]".contains(ch)) +} + +fn is_identifier_char(ch: char) -> bool { + ch == '_' || ch.is_alphanumeric() +} + +fn starts_with_at(text: &str, index: usize, needle: &str) -> bool { + str_tail(text, index).is_some_and(|value| value.starts_with(needle)) +} + +fn ends_with_before(text: &str, index: usize, needle: &str) -> bool { + str_head(text, index).is_some_and(|value| value.ends_with(needle)) +} + +fn str_head(text: &str, index: usize) -> Option<&str> { + text.get(..index) +} + +fn str_tail(text: &str, index: usize) -> Option<&str> { + text.get(index..) 
+} + +fn str_slice(text: &str, start: usize, end: usize) -> Option<&str> { + text.get(start..end) +} diff --git a/crates/anonymize-core/src/diagnostics.rs b/crates/anonymize-core/src/diagnostics.rs index 81578156..14ff55cc 100644 --- a/crates/anonymize-core/src/diagnostics.rs +++ b/crates/anonymize-core/src/diagnostics.rs @@ -4,9 +4,17 @@ use crate::types::{RedactionResult, SearchEngine, SearchMatch}; #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum DiagnosticStage { + PrepareCacheHit, + PrepareCacheMiss, + PrepareBindingParse, + PrepareBindingConvert, + PrepareArtifactsDecode, PrepareTotal, PrepareRegex, PrepareCustomRegex, + PrepareAnchored, + PrepareLegalFormSearch, + PrepareTriggerSearch, PrepareLiteral, Normalize, FindMatches, @@ -15,12 +23,19 @@ pub enum DiagnosticStage { FindLiteral, SearchRegex, SearchCustomRegex, + SearchLegalForm, + SearchTrigger, SearchLiteral, EntityRegex, EntityCustomRegex, + EntityAnchored, EntityDenyList, EntityGazetteer, EntityCountry, + EntityTrigger, + EntitySignature, + EntityLegalForm, + EntityAddressSeed, Merge, Boundary, Sanitize, diff --git a/crates/anonymize-core/src/false_positives.rs b/crates/anonymize-core/src/false_positives.rs new file mode 100644 index 00000000..fa35cee7 --- /dev/null +++ b/crates/anonymize-core/src/false_positives.rs @@ -0,0 +1,876 @@ +use std::sync::LazyLock; + +use regex::Regex; + +use crate::byte_offsets::ByteOffsets; +use crate::processors::DenyListFilterData; +use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; +use crate::types::{Error, Result}; + +const ADDRESS_LABEL: &str = "address"; +const ORGANIZATION_LABEL: &str = "organization"; +const PERSON_LABEL: &str = "person"; +const REGISTRATION_NUMBER_LABEL: &str = "registration number"; +const MAX_ORGANIZATION_LENGTH: usize = 80; +const MAX_PERSON_LENGTH: usize = 60; +const MAX_OPEN_ENDED_ORGANIZATION_WORDS: usize = 8; +const ALL_CAPS_LINE_LETTER_THRESHOLD: usize = 5; +const ALL_CAPS_LINE_RATIO: f64 = 0.95; +const 
ALL_CAPS_LINE_PROSE_EXTRA_LETTERS: usize = 20; +const ALL_CAPS_LINE_HEADING_WORD_LIMIT: usize = 5; + +static POSTAL_CODE_RE: LazyLock> = + LazyLock::new(|| Regex::new(r"\d{3}\s?\d{2}").ok()); +static SECTION_NUMBER_RE: LazyLock> = + LazyLock::new(|| Regex::new(r"^(?:§\s*)?\d{1,3}(?:\.\d{1,3}){0,4}\.?$").ok()); + +pub(crate) fn filter_entity_false_positives( + entities: Vec, + full_text: &str, + filters: Option<&DenyListFilterData>, +) -> Result> { + let Some(filters) = filters else { + return Ok(entities); + }; + + let offsets = ByteOffsets::new(full_text); + let mut filtered = Vec::with_capacity(entities.len()); + for entity in entities { + if is_caller_owned(&entity) { + filtered.push(entity); + continue; + } + + let Some(normalized) = + normalize_entity(&entity, full_text, &offsets, filters)? + else { + continue; + }; + if should_reject_entity(&normalized, full_text, &offsets, filters)? { + continue; + } + filtered.push(normalized); + } + + Ok(filtered) +} + +fn normalize_entity( + entity: &PipelineEntity, + full_text: &str, + offsets: &ByteOffsets<'_>, + filters: &DenyListFilterData, +) -> Result> { + let raw_text = offsets.slice(full_text, entity.start, entity.end)?; + let mut start_byte = 0usize; + let mut end_byte = raw_text.len(); + + trim_leading_artifacts(&raw_text, &mut start_byte, end_byte); + trim_leading_whitespace(&raw_text, &mut start_byte, end_byte); + + if entity.label == ADDRESS_LABEL { + if let Some(trimmed) = + address_role_prefix_len(slice(&raw_text, start_byte, end_byte)?, filters) + { + start_byte = start_byte.saturating_add(trimmed); + trim_leading_whitespace(&raw_text, &mut start_byte, end_byte); + } + + let address_text = slice(&raw_text, start_byte, end_byte)?; + if let Some(trimmed_end) = + trim_trailing_address_prose(address_text, filters) + { + end_byte = start_byte.saturating_add(trimmed_end); + } + } + + trim_trailing_separators(&raw_text, start_byte, &mut end_byte); + if start_byte >= end_byte { + return Ok(None); + } + + let 
cleaned_raw = slice(&raw_text, start_byte, end_byte)?; + if !cleaned_raw.chars().any(char::is_alphanumeric) { + return Ok(None); + } + + let mut normalized = entity.clone(); + normalized.start = entity + .start + .saturating_add(byte_len(raw_text.get(..start_byte).unwrap_or_default())); + normalized.end = normalized.start.saturating_add(byte_len(cleaned_raw)); + normalized.text = collapse_display_whitespace(cleaned_raw); + Ok(Some(normalized)) +} + +fn should_reject_entity( + entity: &PipelineEntity, + full_text: &str, + offsets: &ByteOffsets<'_>, + filters: &DenyListFilterData, +) -> Result { + let text = entity.text.trim(); + if is_template_placeholder(text) { + return Ok(true); + } + if exceeds_label_length(entity) { + return Ok(true); + } + if exceeds_open_ended_word_count(entity) { + return Ok(true); + } + if is_section_number(text) && entity.source != DetectionSource::Trigger { + return Ok(true); + } + if is_standalone_year(text) && entity.source != DetectionSource::Trigger { + return Ok(true); + } + if entity.label == REGISTRATION_NUMBER_LABEL && is_short_letter_run(text) { + return Ok(true); + } + if entity.label == PERSON_LABEL && text.chars().any(|ch| ch.is_ascii_digit()) + { + return Ok(true); + } + if entity.label == PERSON_LABEL && is_single_person_stopword(text, filters) { + return Ok(true); + } + if entity.label == PERSON_LABEL + && ends_in_person_trailing_noun(entity, filters) + { + return Ok(true); + } + if role_exact_match(entity, filters) { + return Ok(true); + } + if entity.label == ORGANIZATION_LABEL + && is_all_caps_candidate(text) + && is_all_caps_boilerplate_line(full_text, offsets, entity)? 
+ { + return Ok(true); + } + if entity.label == ADDRESS_LABEL && should_reject_address(entity, filters) { + return Ok(true); + } + + Ok(false) +} + +fn should_reject_address( + entity: &PipelineEntity, + filters: &DenyListFilterData, +) -> bool { + let text = entity.text.trim(); + if is_signing_place_address(text, filters) { + return true; + } + + let has_digits = text.chars().any(|ch| ch.is_ascii_digit()); + let has_component = has_address_component(text, filters); + if is_jurisdiction_address(text, filters) { + return false; + } + if entity.source == DetectionSource::Trigger && !has_digits && !has_component + { + return true; + } + + text.chars().count() > 40 + && !has_digits + && !regex_is_match(&POSTAL_CODE_RE, text) + && !has_component +} + +fn exceeds_label_length(entity: &PipelineEntity) -> bool { + if entity.source == DetectionSource::LegalForm { + return false; + } + let max = match entity.label.as_str() { + ORGANIZATION_LABEL => MAX_ORGANIZATION_LENGTH, + PERSON_LABEL => MAX_PERSON_LENGTH, + _ => return false, + }; + entity.text.chars().count() > max +} + +fn exceeds_open_ended_word_count(entity: &PipelineEntity) -> bool { + entity.label == ORGANIZATION_LABEL + && matches!( + entity.source, + DetectionSource::Trigger | DetectionSource::Coreference + ) + && word_count(&entity.text) > MAX_OPEN_ENDED_ORGANIZATION_WORDS +} + +fn is_template_placeholder(text: &str) -> bool { + let trimmed = text.trim(); + if trimmed.len() >= 3 && trimmed.chars().all(|ch| ch == '.' || ch == '_') { + return true; + } + let Some(inner) = bracketed_inner(trimmed, '[', ']') + .or_else(|| bracketed_inner(trimmed, '{', '}')) + else { + return false; + }; + !inner.is_empty() + && inner + .chars() + .all(|ch| ch == '_' || ch.is_alphanumeric() || ch.is_whitespace()) +} + +fn bracketed_inner(text: &str, open: char, close: char) -> Option<&str> { + let mut chars = text.chars(); + if chars.next()? != open || chars.next_back()? 
!= close { + return None; + } + let start = open.len_utf8(); + let end = text.len().saturating_sub(close.len_utf8()); + text.get(start..end) +} + +fn is_section_number(text: &str) -> bool { + regex_is_match(&SECTION_NUMBER_RE, text.trim()) +} + +fn is_standalone_year(text: &str) -> bool { + let trimmed = text.trim(); + trimmed.len() == 4 + && trimmed.chars().all(|ch| ch.is_ascii_digit()) + && (trimmed.starts_with("19") || trimmed.starts_with("20")) +} + +fn is_short_letter_run(text: &str) -> bool { + let letters = text.trim(); + (1..=2).contains(&letters.chars().count()) + && letters.chars().all(char::is_alphabetic) +} + +fn is_single_person_stopword(text: &str, filters: &DenyListFilterData) -> bool { + let token = trim_token_punctuation(text); + !token.is_empty() + && !token.chars().any(char::is_whitespace) + && filters.person_stopwords.contains(&token.to_lowercase()) +} + +fn ends_in_person_trailing_noun( + entity: &PipelineEntity, + filters: &DenyListFilterData, +) -> bool { + if matches!( + entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) { + return false; + } + + let mut words = entity + .text + .split(|ch: char| !ch.is_alphabetic()) + .filter(|word| !word.is_empty()); + if words.next().is_none() { + return false; + } + let Some(last) = words.next_back() else { + return false; + }; + filters.person_trailing_nouns.contains(&last.to_lowercase()) +} + +fn role_exact_match( + entity: &PipelineEntity, + filters: &DenyListFilterData, +) -> bool { + matches!(entity.label.as_str(), PERSON_LABEL | ORGANIZATION_LABEL) + && filters + .generic_roles + .contains(&entity.text.trim().to_lowercase()) +} + +fn is_all_caps_candidate(text: &str) -> bool { + let mut has_upper = false; + for ch in text.chars().filter(|ch| ch.is_alphabetic()) { + if ch.is_lowercase() { + return false; + } + has_upper |= ch.is_uppercase(); + } + has_upper +} + +fn is_all_caps_boilerplate_line( + full_text: &str, + offsets: &ByteOffsets<'_>, + entity: 
&PipelineEntity, +) -> Result { + let start = offsets.validate_offset(entity.start)?; + let end = offsets.validate_offset(entity.end)?; + let before = full_text.get(..start).ok_or(Error::InvalidSpan { + start: entity.start, + end: entity.end, + })?; + let line_start = before + .rfind('\n') + .map_or(0usize, |index| index.saturating_add('\n'.len_utf8())); + let after = full_text.get(end..).ok_or(Error::InvalidSpan { + start: entity.start, + end: entity.end, + })?; + let line_end = after + .find('\n') + .map_or(full_text.len(), |index| end.saturating_add(index)); + let line = full_text + .get(line_start..line_end) + .ok_or(Error::InvalidSpan { + start: entity.start, + end: entity.end, + })?; + let entity_rel_start = start.saturating_sub(line_start); + let entity_rel_end = end.saturating_sub(line_start); + + let mut letter_count = 0usize; + let mut upper_count = 0usize; + let mut outside_entity_letters = 0usize; + for (index, ch) in line.char_indices() { + if !ch.is_alphabetic() { + continue; + } + letter_count = letter_count.saturating_add(1); + if ch.is_uppercase() { + upper_count = upper_count.saturating_add(1); + } + if index < entity_rel_start || index >= entity_rel_end { + outside_entity_letters = outside_entity_letters.saturating_add(1); + } + } + + if letter_count <= ALL_CAPS_LINE_LETTER_THRESHOLD { + return Ok(false); + } + if !uppercase_ratio_at_least(upper_count, letter_count) { + return Ok(false); + } + if starts_with_section_heading_prefix(line) { + return Ok(true); + } + if outside_entity_letters >= ALL_CAPS_LINE_PROSE_EXTRA_LETTERS { + return Ok(true); + } + Ok( + word_count(&entity.text) > ALL_CAPS_LINE_HEADING_WORD_LIMIT + && !entity.text.contains(','), + ) +} + +fn starts_with_section_heading_prefix(line: &str) -> bool { + let mut chars = line.trim_start().chars().peekable(); + if chars.peek().is_some_and(|ch| *ch == '§') { + chars.next(); + while chars.peek().is_some_and(|ch| ch.is_whitespace()) { + chars.next(); + } + } + + let mut saw_digit = 
false; + let mut group_digits = 0usize; + while let Some(ch) = chars.peek().copied() { + if ch.is_ascii_digit() { + saw_digit = true; + group_digits = group_digits.saturating_add(1); + if group_digits > 3 { + return false; + } + chars.next(); + continue; + } + if ch == '.' && saw_digit { + group_digits = 0; + chars.next(); + continue; + } + break; + } + if !saw_digit { + return false; + } + while chars.peek().is_some_and(|ch| ch.is_whitespace()) { + chars.next(); + } + chars.next().is_some_and(char::is_uppercase) +} + +fn trim_leading_artifacts(text: &str, start: &mut usize, end: usize) { + while let Some(rest) = text.get(*start..end) { + if !rest.starts_with('.') { + break; + } + let after_dot_start = '.'.len_utf8(); + let Some(after_dot) = rest.get(after_dot_start..) else { + break; + }; + let whitespace = leading_whitespace_len(after_dot); + if whitespace == 0 { + break; + } + *start = + (*start).saturating_add(after_dot_start.saturating_add(whitespace)); + } +} + +fn trim_leading_whitespace(text: &str, start: &mut usize, end: usize) { + let Some(rest) = text.get(*start..end) else { + return; + }; + *start = (*start).saturating_add(leading_whitespace_len(rest)); +} + +fn trim_trailing_separators(text: &str, start: usize, end: &mut usize) { + while let Some(slice) = text.get(start..*end) { + let Some((index, ch)) = slice.char_indices().next_back() else { + break; + }; + if ch.is_whitespace() || ch == ',' { + *end = start.saturating_add(index); + continue; + } + break; + } +} + +fn address_role_prefix_len( + text: &str, + filters: &DenyListFilterData, +) -> Option { + let (word_end, word) = first_word(text)?; + if !filters.generic_roles.contains(&word.to_lowercase()) { + return None; + } + let rest = text.get(word_end..)?; + let whitespace = leading_whitespace_len(rest); + if whitespace == 0 { + return None; + } + let candidate = rest.get(whitespace..)?; + if looks_like_address_start(candidate, filters) { + return Some(word_end.saturating_add(whitespace)); + } + 
None +} + +fn looks_like_address_start(text: &str, filters: &DenyListFilterData) -> bool { + let trimmed = text.trim_start(); + trimmed.chars().next().is_some_and(|ch| { + ch.is_ascii_digit() + || ch.is_uppercase() + || has_address_component(trimmed, filters) + }) +} + +fn trim_trailing_address_prose( + text: &str, + filters: &DenyListFilterData, +) -> Option { + for (index, ch) in text.char_indices() { + if ch != '.' { + continue; + } + let before = text.get(..index)?; + if !before.chars().any(|candidate| candidate.is_ascii_digit()) { + continue; + } + if text_ends_with_address_component(before.trim_end(), filters) { + continue; + } + let after = text + .get(index.saturating_add('.'.len_utf8())..)? + .trim_start(); + if after.len() < 5 || has_address_component(after, filters) { + continue; + } + if after.chars().next().is_some_and(char::is_uppercase) { + return Some(before.trim_end().len()); + } + } + None +} + +fn has_address_component(text: &str, filters: &DenyListFilterData) -> bool { + let lower = text.to_lowercase(); + filters + .street_types + .iter() + .any(|component| contains_component(&lower, component)) +} + +fn is_jurisdiction_address(text: &str, filters: &DenyListFilterData) -> bool { + let lower = text.to_lowercase(); + filters.address_jurisdiction_prefixes.iter().any(|prefix| { + let Some(rest) = lower.strip_prefix(prefix) else { + return false; + }; + rest.chars().next().is_some_and(char::is_whitespace) + && rest.chars().any(char::is_alphabetic) + }) +} + +fn text_ends_with_address_component( + text: &str, + filters: &DenyListFilterData, +) -> bool { + let lower = text.to_lowercase(); + filters.street_types.iter().any(|component| { + if component.is_empty() || !lower.ends_with(component) { + return false; + } + let prefix_len = lower.len().saturating_sub(component.len()); + lower + .get(..prefix_len) + .and_then(|prefix| prefix.chars().next_back()) + .is_none_or(is_left_component_boundary) + }) +} + +fn contains_component(text: &str, component: 
&str) -> bool { + if component.is_empty() { + return false; + } + text.match_indices(component).any(|(start, _)| { + let end = start.saturating_add(component.len()); + let left_ok = text + .get(..start) + .and_then(|prefix| prefix.chars().next_back()) + .is_none_or(is_left_component_boundary); + let right_ok = text + .get(end..) + .and_then(|suffix| suffix.chars().next()) + .is_none_or(is_right_component_boundary); + left_ok && right_ok + }) +} + +const fn is_left_component_boundary(ch: char) -> bool { + ch.is_whitespace() || ch == ',' || ch == '(' || ch == '[' +} + +const fn is_right_component_boundary(ch: char) -> bool { + ch.is_whitespace() || matches!(ch, ',' | '.' | '/' | ')' | ']') +} + +fn is_signing_place_address(text: &str, filters: &DenyListFilterData) -> bool { + let lower = text.to_lowercase(); + filters.signing_place_guards.iter().any(|guard| { + guard.prefix_phrases.iter().any(|prefix| { + !prefix.is_empty() + && lower.starts_with(prefix) + && guard + .suffix_phrases + .iter() + .any(|suffix| !suffix.is_empty() && lower.ends_with(suffix)) + }) + }) +} + +fn first_word(text: &str) -> Option<(usize, &str)> { + let mut end = 0usize; + for (index, ch) in text.char_indices() { + if !ch.is_alphabetic() { + break; + } + end = index.saturating_add(ch.len_utf8()); + } + if end == 0 { + return None; + } + text.get(..end).map(|word| (end, word)) +} + +fn word_count(text: &str) -> usize { + let mut count = 0usize; + let mut in_word = false; + for ch in text.chars() { + let word_char = + ch.is_alphanumeric() || matches!(ch, '\'' | '’' | '-' | '.'); + if word_char && !in_word { + count = count.saturating_add(1); + } + in_word = word_char; + } + count +} + +fn trim_token_punctuation(text: &str) -> &str { + text + .trim() + .trim_matches(|ch: char| matches!(ch, '.' | ',' | ';' | ':' | '!' 
| '?')) +} + +fn leading_whitespace_len(text: &str) -> usize { + let mut len = 0usize; + for ch in text.chars() { + if !ch.is_whitespace() { + break; + } + len = len.saturating_add(ch.len_utf8()); + } + len +} + +fn slice(text: &str, start: usize, end: usize) -> Result<&str> { + text.get(start..end).ok_or_else(|| Error::InvalidSpan { + start: u32::try_from(start).unwrap_or(u32::MAX), + end: u32::try_from(end).unwrap_or(u32::MAX), + }) +} + +fn collapse_display_whitespace(text: &str) -> String { + let mut out = String::new(); + let mut whitespace = String::new(); + + for ch in text.chars() { + if ch.is_whitespace() { + whitespace.push(ch); + continue; + } + + flush_whitespace(&mut out, &mut whitespace); + out.push(ch); + } + + flush_whitespace(&mut out, &mut whitespace); + out +} + +fn flush_whitespace(output: &mut String, whitespace: &mut String) { + if whitespace.is_empty() { + return; + } + + if whitespace.chars().any(|ch| matches!(ch, '\n' | '\r')) + || whitespace.chars().count() >= 2 + { + output.push(' '); + } else if let Some(ch) = whitespace.chars().next() { + output.push(ch); + } + + whitespace.clear(); +} + +fn byte_len(text: &str) -> u32 { + u32::try_from(text.len()).unwrap_or(u32::MAX) +} + +fn regex_is_match(regex: &LazyLock>, text: &str) -> bool { + regex + .as_ref() + .is_some_and(|compiled| compiled.is_match(text)) +} + +fn uppercase_ratio_at_least(upper_count: usize, letter_count: usize) -> bool { + let Some(upper) = u32::try_from(upper_count).ok().map(f64::from) else { + return true; + }; + let Some(total) = u32::try_from(letter_count).ok().map(f64::from) else { + return true; + }; + upper / total >= ALL_CAPS_LINE_RATIO +} + +const fn is_caller_owned(entity: &PipelineEntity) -> bool { + matches!( + entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) +} + +#[cfg(test)] +mod tests { + #![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] + + use std::collections::BTreeSet; + + use 
super::*; + + #[test] + fn rejects_template_placeholders() { + let entities = filter_entity_false_positives( + vec![entity( + "[NAME]", + "[NAME]", + PERSON_LABEL, + DetectionSource::Regex, + )], + "[NAME]", + Some(&DenyListFilterData::default()), + ) + .unwrap(); + + assert!(entities.is_empty()); + } + + #[test] + fn trims_address_role_prefix_from_shared_role_data() { + let text = "sídlo prodávajícího Na Květnici 1"; + let start = text.find("prodávajícího").unwrap(); + let filters = DenyListFilterData { + generic_roles: set(["prodávajícího"]), + ..DenyListFilterData::default() + }; + + let entities = filter_entity_false_positives( + vec![PipelineEntity::detected( + u32::try_from(start).unwrap(), + u32::try_from(text.len()).unwrap(), + ADDRESS_LABEL, + "prodávajícího Na Květnici 1", + 0.8, + DetectionSource::Trigger, + )], + text, + Some(&filters), + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Na Květnici 1"); + assert_eq!( + entities[0].start, + u32::try_from("sídlo prodávajícího ".len()).unwrap() + ); + } + + #[test] + fn preserves_single_non_breaking_space_in_entity_text() { + let text = "Městským soudem v\u{00a0}Praze"; + let entities = filter_entity_false_positives( + vec![entity( + text, + text, + ORGANIZATION_LABEL, + DetectionSource::Trigger, + )], + text, + Some(&DenyListFilterData::default()), + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, text); + } + + #[test] + fn rejects_trigger_address_without_digits_or_street_component() { + let entities = filter_entity_false_positives( + vec![entity( + "Nejsme plátci DPH", + "Nejsme plátci DPH", + ADDRESS_LABEL, + DetectionSource::Trigger, + )], + "Nejsme plátci DPH", + Some(&DenyListFilterData::default()), + ) + .unwrap(); + + assert!(entities.is_empty()); + } + + #[test] + fn keeps_trigger_address_with_street_component() { + let filters = DenyListFilterData { + street_types: set(["street"]), + ..DenyListFilterData::default() + }; + let 
entities = filter_entity_false_positives( + vec![entity( + "West Street", + "West Street", + ADDRESS_LABEL, + DetectionSource::Trigger, + )], + "West Street", + Some(&filters), + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + } + + #[test] + fn keeps_configured_jurisdiction_addresses_without_digits() { + let filters = DenyListFilterData { + address_jurisdiction_prefixes: set(["state of"]), + ..DenyListFilterData::default() + }; + let entities = filter_entity_false_positives( + vec![entity( + "State of Delaware", + "State of Delaware", + ADDRESS_LABEL, + DetectionSource::Trigger, + )], + "State of Delaware", + Some(&filters), + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + } + + #[test] + fn rejects_person_stopwords() { + let filters = DenyListFilterData { + person_stopwords: set(["tato"]), + ..DenyListFilterData::default() + }; + let entities = filter_entity_false_positives( + vec![entity("Tato", "Tato", PERSON_LABEL, DetectionSource::Regex)], + "Tato", + Some(&filters), + ) + .unwrap(); + + assert!(entities.is_empty()); + } + + #[test] + fn rejects_all_caps_section_heading_organizations() { + let text = "17. 
NO ASSIGNMENT.\n"; + let start = text.find("NO ASSIGNMENT").unwrap(); + let end = start.saturating_add("NO ASSIGNMENT".len()); + let entities = filter_entity_false_positives( + vec![PipelineEntity::detected( + u32::try_from(start).unwrap(), + u32::try_from(end).unwrap(), + ORGANIZATION_LABEL, + "NO ASSIGNMENT", + 0.8, + DetectionSource::Regex, + )], + text, + Some(&DenyListFilterData::default()), + ) + .unwrap(); + + assert!(entities.is_empty()); + } + + fn entity( + full_text: &str, + text: &str, + label: &str, + source: DetectionSource, + ) -> PipelineEntity { + PipelineEntity::detected( + 0, + u32::try_from(full_text.len()).expect("fixture length fits u32"), + label, + text, + 0.8, + source, + ) + } + + fn set(values: [&str; N]) -> BTreeSet { + values.into_iter().map(String::from).collect() + } +} diff --git a/crates/anonymize-core/src/legal_forms.rs b/crates/anonymize-core/src/legal_forms.rs new file mode 100644 index 00000000..ff169947 --- /dev/null +++ b/crates/anonymize-core/src/legal_forms.rs @@ -0,0 +1,1612 @@ +use std::collections::BTreeSet; + +use crate::byte_offsets::ByteOffsets; +use crate::processors::PatternSlice; +use crate::resolution::{DetectionSource, PipelineEntity}; +use crate::types::{Result, SearchMatch}; + +const LEGAL_FORM_SCORE: f64 = 0.95; +const HEAD_TOKEN_CAP: usize = 20; +const MAX_LOWER_BRIDGE: usize = 4; +const MAX_NAME_LOOKBACK: usize = 32; + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct LegalFormData { + pub suffixes: Vec, + pub normalized_boundary_suffixes: Vec, + pub normalized_in_name_words: Vec, + pub normalized_suffix_words: Vec, + pub role_heads: Vec, + pub sentence_verb_indicators: Vec, + pub clause_noun_heads: Vec, + pub connector_prose_heads: Vec, + pub structural_single_cap_prefixes: Vec, + pub leading_clause_phrases: Vec, + pub leading_clause_direct_prefixes: Vec, + pub connector_words: Vec, + pub and_connector_words: Vec, + pub in_name_prepositions: Vec, + pub company_suffix_words: Vec, + pub 
comma_gated_direct_prefixes: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct PreparedLegalFormData { + suffixes: Vec, + normalized_boundary_suffixes: BTreeSet, + normalized_in_name_words: BTreeSet, + normalized_suffix_words: BTreeSet, + role_heads: BTreeSet, + sentence_verb_indicators: BTreeSet, + clause_noun_heads: BTreeSet, + connector_prose_heads: BTreeSet, + structural_single_cap_prefixes: BTreeSet, + leading_clause_phrases: Vec, + leading_clause_direct_prefixes: Vec, + connector_words: BTreeSet, + and_connector_words: BTreeSet, + in_name_prepositions: BTreeSet, + company_suffix_words: BTreeSet, + comma_gated_direct_prefixes: BTreeSet, +} + +impl PreparedLegalFormData { + pub(crate) fn new(data: LegalFormData) -> Self { + Self { + suffixes: data.suffixes, + normalized_boundary_suffixes: lower_set( + data.normalized_boundary_suffixes, + ), + normalized_in_name_words: lower_set(data.normalized_in_name_words), + normalized_suffix_words: lower_set(data.normalized_suffix_words), + role_heads: lower_set(data.role_heads), + sentence_verb_indicators: lower_set(data.sentence_verb_indicators), + clause_noun_heads: lower_set(data.clause_noun_heads), + connector_prose_heads: lower_set(data.connector_prose_heads), + structural_single_cap_prefixes: lower_set( + data.structural_single_cap_prefixes, + ), + leading_clause_phrases: lower_vec(data.leading_clause_phrases), + leading_clause_direct_prefixes: lower_vec( + data.leading_clause_direct_prefixes, + ), + connector_words: lower_set(data.connector_words), + and_connector_words: lower_set(data.and_connector_words), + in_name_prepositions: lower_set(data.in_name_prepositions), + company_suffix_words: lower_set(data.company_suffix_words), + comma_gated_direct_prefixes: lower_set(data.comma_gated_direct_prefixes), + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct Candidate { + start: usize, + suffix_start: usize, + end: usize, + trimmed: bool, +} + +pub(crate) fn process_legal_form_matches( + 
matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + data: &PreparedLegalFormData, +) -> Result> { + if data.suffixes.is_empty() { + return Ok(Vec::new()); + } + + let offsets = ByteOffsets::new(full_text); + let mut candidates = Vec::new(); + for found in matches { + if slice.local_index(found.pattern()).is_none() { + continue; + } + + let suffix_start = offsets.validate_offset(found.start())?; + let suffix_end = offsets.validate_offset(found.end())?; + let effective_suffix_start = + effective_line_wrapped_suffix_start(full_text, suffix_start); + if !is_leading_separator(full_text, suffix_start) + || !is_trailing_boundary(full_text, suffix_end) + { + continue; + } + + let Some(walker_start) = + walk_backward(full_text, effective_suffix_start, data) + else { + continue; + }; + if walker_start >= effective_suffix_start { + continue; + } + if crosses_sentence_end(full_text, walker_start, effective_suffix_start) { + continue; + } + + let candidate_start = trim_to_first_cap_after_verb( + full_text, + walker_start, + effective_suffix_start, + data, + ); + if candidate_start >= effective_suffix_start { + continue; + } + + candidates.push(Candidate { + start: candidate_start, + suffix_start: effective_suffix_start, + end: suffix_end, + trimmed: candidate_start != walker_start, + }); + } + + let candidates = drop_overlapping(candidates); + let mut entities = Vec::new(); + for candidate in candidates { + process_candidate(&mut entities, full_text, &candidate, data); + } + + Ok(entities) +} + +fn effective_line_wrapped_suffix_start( + text: &str, + suffix_start: usize, +) -> usize { + let mut scan = suffix_start; + while let Some((prev_start, ch)) = previous_char(text, scan) { + if matches!(ch, ' ' | '\t') { + scan = prev_start; + continue; + } + break; + } + + let Some((newline_start, '\n')) = previous_char(text, scan) else { + return suffix_start; + }; + let mut before = newline_start; + while let Some((prev_start, ch)) = previous_char(text, before) { + if 
ch == ' ' { + before = prev_start; + continue; + } + return if ch == '.' { before } else { suffix_start }; + } + + suffix_start +} + +fn is_trailing_boundary(text: &str, end: usize) -> bool { + text + .get(end..) + .and_then(|suffix| suffix.chars().next()) + .is_none_or(|ch| !ch.is_alphanumeric()) +} + +fn is_leading_separator(text: &str, suffix_start: usize) -> bool { + let Some((prev_start, prev)) = previous_char(text, suffix_start) else { + return true; + }; + if prev.is_alphanumeric() { + return false; + } + if prev != '.' { + return true; + } + previous_char(text, prev_start).is_none_or(|(_, ch)| !ch.is_alphabetic()) +} + +fn walk_backward( + text: &str, + suffix_start: usize, + data: &PreparedLegalFormData, +) -> Option { + let mut pos = suffix_start; + let mut steps = 0; + let mut leftmost_cap = None::; + let mut lower_bridge_run = 0_usize; + + while steps < HEAD_TOKEN_CAP { + let Some(token) = token_before(text, pos) else { + break; + }; + if !is_acceptable_token(token.text, data) { + break; + } + + if starts_lower(token.text) && leftmost_cap.is_some() { + let after_token = text.get(token.end..pos).unwrap_or_default(); + if starts_with_list_separator(after_token) + && is_legal_form_suffix_word(token.text, data) + { + break; + } + } + + if data.connector_words.contains(&token.text.to_lowercase()) { + let previous = token_before(text, token.start); + if previous + .as_ref() + .is_some_and(|found| is_legal_form_suffix_word(found.text, data)) + { + break; + } + if data + .and_connector_words + .contains(&token.text.to_lowercase()) + { + let upper_before = count_upper_before(text, token.start); + if upper_before <= 2 || has_middle_initial_before(text, token.start) { + break; + } + } + } + + if starts_upper(token.text) { + leftmost_cap = Some(token.start); + lower_bridge_run = 0; + } else if starts_lower(token.text) { + if leftmost_cap.is_some() { + lower_bridge_run = lower_bridge_run.saturating_add(1); + if lower_bridge_run > MAX_LOWER_BRIDGE { + break; + } + } 
+ } else { + lower_bridge_run = 0; + } + + pos = token.start; + steps = steps.saturating_add(1); + } + + leftmost_cap +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct Token<'a> { + start: usize, + end: usize, + text: &'a str, +} + +fn token_before(text: &str, pos: usize) -> Option> { + let mut end = pos; + while let Some((prev_start, ch)) = previous_char(text, end) { + if ch == '\n' { + return None; + } + if is_inter_token_space(ch) || matches!(ch, ',' | ';') { + end = prev_start; + continue; + } + break; + } + if end == 0 { + return None; + } + + let mut start = end; + while let Some((prev_start, ch)) = previous_char(text, start) { + if ch == '\n' || !is_token_char(ch) { + break; + } + start = prev_start; + } + + (start < end).then(|| Token { + start, + end, + text: text.get(start..end).unwrap_or_default(), + }) +} + +const fn is_inter_token_space(ch: char) -> bool { + matches!(ch, ' ' | '\t' | '\u{00a0}' | '\u{202f}') +} + +fn is_token_char(ch: char) -> bool { + ch.is_alphanumeric() || matches!(ch, '\'' | '’' | '.' | '&' | '-') +} + +fn is_acceptable_token(token: &str, data: &PreparedLegalFormData) -> bool { + token.chars().next().is_some_and(|ch| { + ch.is_uppercase() || ch.is_lowercase() || ch.is_ascii_digit() + }) || data.connector_words.contains(&token.to_lowercase()) +} + +fn starts_upper(text: &str) -> bool { + text.chars().next().is_some_and(char::is_uppercase) +} + +fn starts_lower(text: &str) -> bool { + text.chars().next().is_some_and(char::is_lowercase) +} + +fn starts_with_list_separator(text: &str) -> bool { + text + .chars() + .next() + .is_some_and(|ch| matches!(ch, ',' | ';')) +} + +fn normalize_suffix_token(text: &str) -> String { + text + .chars() + .filter(|ch| { + !matches!(ch, '.' 
| ',' | ' ' | '\t' | '\u{00a0}' | '\u{202f}') + }) + .collect::() + .to_lowercase() +} + +fn is_legal_form_suffix_word(word: &str, data: &PreparedLegalFormData) -> bool { + let normalized = normalize_suffix_token(word); + !normalized.is_empty() && data.normalized_suffix_words.contains(&normalized) +} + +fn is_known_boundary_suffix(word: &str, data: &PreparedLegalFormData) -> bool { + let normalized = normalize_suffix_token(word); + !normalized.is_empty() + && data.normalized_boundary_suffixes.contains(&normalized) +} + +fn is_in_name_legal_form_word( + word: &str, + data: &PreparedLegalFormData, +) -> bool { + let normalized = normalize_suffix_token(word); + !normalized.is_empty() && data.normalized_in_name_words.contains(&normalized) +} + +fn count_upper_before(text: &str, pos: usize) -> usize { + let mut scan = pos; + let mut count = 0_usize; + while let Some(token) = token_before(text, scan) { + if !starts_upper(token.text) { + break; + } + count = count.saturating_add(1); + scan = token.start; + } + count +} + +fn has_middle_initial_before(text: &str, pos: usize) -> bool { + let start = pos.saturating_sub(MAX_NAME_LOOKBACK); + let Some(slice) = text.get(start..pos) else { + return false; + }; + let trimmed = slice.trim_end_matches(is_inter_token_space); + let Some(last_word) = trailing_word(trimmed) else { + return false; + }; + let before_word = trimmed.get(..last_word.start).unwrap_or_default(); + let before_word = before_word.trim_end_matches(is_inter_token_space); + let Some((dot_start, '.')) = previous_char(before_word, before_word.len()) + else { + return false; + }; + previous_char(before_word, dot_start).is_some_and(|(_, ch)| ch.is_uppercase()) +} + +fn trailing_word(text: &str) -> Option> { + let mut end = text.len(); + while let Some((prev_start, ch)) = previous_char(text, end) { + if ch.is_alphabetic() || matches!(ch, '\'' | '’') { + break; + } + end = prev_start; + } + let mut start = end; + while let Some((prev_start, ch)) = previous_char(text, 
start) { + if !(ch.is_alphabetic() || matches!(ch, '\'' | '’')) { + break; + } + start = prev_start; + } + (start < end).then(|| Token { + start, + end, + text: text.get(start..end).unwrap_or_default(), + }) +} + +fn crosses_sentence_end(text: &str, start: usize, suffix_start: usize) -> bool { + let Some(slice) = text.get(start..suffix_start) else { + return false; + }; + let mut previous = None::; + let mut lowercase_run = 0_usize; + let mut uppercase_run = 0_usize; + + for ch in slice.chars() { + if ch.is_uppercase() { + uppercase_run = uppercase_run.saturating_add(1); + lowercase_run = 0; + previous = Some(ch); + continue; + } + if ch.is_lowercase() { + if previous.is_some_and(char::is_uppercase) || lowercase_run > 0 { + lowercase_run = lowercase_run.saturating_add(1); + } + uppercase_run = 0; + previous = Some(ch); + continue; + } + if ch == '.' { + previous = Some(ch); + continue; + } + if ch.is_whitespace() && previous == Some('.') { + if lowercase_run >= 2 || uppercase_run >= 2 { + return true; + } + lowercase_run = 0; + uppercase_run = 0; + } + previous = Some(ch); + } + + false +} + +fn trim_to_first_cap_after_verb( + text: &str, + candidate_start: usize, + suffix_start: usize, + data: &PreparedLegalFormData, +) -> usize { + if candidate_start >= suffix_start { + return candidate_start; + } + let mut last_verb_end = None::; + for token in word_tokens(text, candidate_start, suffix_start) { + if starts_lower(token.text) + && data + .sentence_verb_indicators + .contains(&token.text.to_lowercase()) + { + last_verb_end = Some(token.end); + } + } + + let Some(scan_start) = last_verb_end else { + return candidate_start; + }; + for token in word_tokens(text, scan_start, suffix_start) { + if !starts_upper(token.text) { + continue; + } + let lower = token.text.to_lowercase(); + if data.role_heads.contains(&lower) + || data.clause_noun_heads.contains(&lower) + { + continue; + } + return token.start; + } + + suffix_start +} + +fn word_tokens(text: &str, start: usize, 
end: usize) -> Vec> { + let mut tokens = Vec::new(); + let mut cursor = start; + while cursor < end { + let Some((ch_start, ch)) = next_char(text, cursor) else { + break; + }; + if !is_word_token_char(ch) { + cursor = ch_start.saturating_add(ch.len_utf8()); + continue; + } + + let token_start = ch_start; + let mut token_end = ch_start.saturating_add(ch.len_utf8()); + while token_end < end { + let Some((next_start, next)) = next_char(text, token_end) else { + break; + }; + if !is_word_token_char(next) { + break; + } + token_end = next_start.saturating_add(next.len_utf8()); + } + if let Some(token_text) = text.get(token_start..token_end) { + tokens.push(Token { + start: token_start, + end: token_end, + text: token_text, + }); + } + cursor = token_end; + } + tokens +} + +fn is_word_token_char(ch: char) -> bool { + ch.is_alphanumeric() || matches!(ch, '\'' | '’' | '-') +} + +fn drop_overlapping(candidates: Vec) -> Vec { + let mut sorted = candidates; + sorted.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| right.end.cmp(&left.end)) + }); + + let mut out = Vec::::new(); + for candidate in sorted { + if out.last().is_some_and(|last| { + candidate.start >= last.start && candidate.end <= last.end + }) { + continue; + } + out.push(candidate); + } + out +} + +fn process_candidate( + results: &mut Vec, + full_text: &str, + candidate: &Candidate, + data: &PreparedLegalFormData, +) { + let Some(raw_text) = full_text.get(candidate.start..candidate.end) else { + return; + }; + let processed_end = candidate.start.saturating_add(trim_end_byte(raw_text)); + if processed_end <= candidate.start { + return; + } + let Some(text) = full_text.get(candidate.start..processed_end) else { + return; + }; + if text.len() < 5 { + return; + } + + let mut processed_start = candidate.start; + let mut processed_text = text; + if is_structural_single_cap_match(processed_text, data) + || is_bare_single_cap_structural_inner_match( + full_text, + candidate.start, + 
processed_text, + data, + ) + { + return; + } + + let role_trimmed = if let Some(trimmed) = + trim_role_head(full_text, processed_start, processed_text, data) + { + let Some(next_text) = full_text.get(trimmed.start..processed_end) else { + return; + }; + processed_start = trimmed.start; + processed_text = next_text; + true + } else { + false + }; + + if processed_text.contains('\n') && has_disallowed_line_break(processed_text) + { + return; + } + + let (entity_start, entity_text) = candidate_entity_span( + full_text, + candidate, + processed_start, + processed_end, + processed_text, + role_trimmed, + data, + ); + emit_candidate_segments( + results, + candidate, + text, + entity_start, + entity_text, + data, + ); +} + +fn candidate_entity_span<'a>( + full_text: &'a str, + candidate: &Candidate, + processed_start: usize, + processed_end: usize, + processed_text: &'a str, + role_trimmed: bool, + data: &PreparedLegalFormData, +) -> (usize, &'a str) { + if candidate.trimmed + || role_trimmed + || is_bare_single_cap_legal_form(processed_text) + { + return (processed_start, processed_text); + } + + let extended = extend_backward(full_text, processed_start, data, false); + if extended < processed_start + && let Some(extended_text) = full_text.get(extended..processed_end) + { + return (extended, extended_text.trim_end()); + } + + (processed_start, processed_text) +} + +fn emit_candidate_segments( + results: &mut Vec, + candidate: &Candidate, + original_text: &str, + entity_start: usize, + entity_text: &str, + data: &PreparedLegalFormData, +) { + for segment in split_embedded_legal_form_list(entity_start, entity_text, data) + { + let (mut segment_start, mut segment_text) = + trim_embedded_legal_form_list_prefix(segment.start, segment.text, data); + let leading = trim_leading_clause(segment_text, data); + if leading.offset > 0 + && let Some(trimmed) = segment_text.get(leading.offset..) 
+ { + segment_start = segment_start.saturating_add(leading.offset); + segment_text = trimmed.trim_start(); + segment_start = segment_start.saturating_add(leading_ws_len(trimmed)); + } + + if segment_text.contains('\n') && has_disallowed_line_break(segment_text) { + continue; + } + + let mut emit_start = segment_start; + let mut emit_text = segment_text; + let prefix = prefix_info(emit_text); + let all_caps_match = + prefix.part.len() > 2 && prefix.part == prefix.part.to_uppercase(); + if all_caps_match { + let word_count = if prefix.end > 0 { + emit_text + .get(..prefix.end) + .unwrap_or_default() + .split_whitespace() + .count() + } else { + emit_text.split_whitespace().count() + }; + if word_count > 3 { + emit_start = candidate.start; + emit_text = original_text; + } + } + + if has_roman_numeral_suffix(emit_text) { + continue; + } + if short_ascii_suffix_collides_with_non_ascii_prefix(emit_text) { + continue; + } + + let end = emit_start.saturating_add(emit_text.len()); + results.push(PipelineEntity::detected( + u32::try_from(emit_start).unwrap_or(u32::MAX), + u32::try_from(end).unwrap_or(u32::MAX), + "organization", + emit_text, + LEGAL_FORM_SCORE, + DetectionSource::LegalForm, + )); + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct TrimmedStart { + start: usize, +} + +fn trim_role_head( + full_text: &str, + match_start: usize, + text: &str, + data: &PreparedLegalFormData, +) -> Option { + let first = first_role_word(text)?; + let first_lower = first.text.to_lowercase(); + let first_leading = first + .text + .split('-') + .next() + .unwrap_or_default() + .to_lowercase(); + if !data.role_heads.contains(&first_lower) + && !data.role_heads.contains(&first_leading) + { + return None; + } + + let suffix_offset = suffix_offset_in_text(text, data)?; + let mid_start = first.end; + if mid_start >= suffix_offset { + return None; + } + let mid = text.get(mid_start..suffix_offset).unwrap_or_default(); + let mut last_verb_end = None::; + for token in 
word_tokens(text, mid_start, suffix_offset) { + if data + .sentence_verb_indicators + .contains(&token.text.to_lowercase()) + { + last_verb_end = Some(token.end); + } + } + let digit_after_role = mid + .trim_start() + .chars() + .next() + .is_some_and(|ch| ch.is_ascii_digit()); + let appositive_role_head = !digit_after_role + && last_verb_end.is_none() + && preceding_word_is_sentence_verb(full_text, match_start, data); + + if last_verb_end.is_none() && !digit_after_role && !appositive_role_head { + return None; + } + + let scan_start = last_verb_end.unwrap_or(mid_start); + for token in word_tokens(text, scan_start, suffix_offset) { + if !starts_upper(token.text) { + continue; + } + let lower = token.text.to_lowercase(); + if data.role_heads.contains(&lower) + || data.clause_noun_heads.contains(&lower) + { + continue; + } + return Some(TrimmedStart { + start: match_start.saturating_add(token.start), + }); + } + + Some(TrimmedStart { + start: match_start.saturating_add(suffix_offset), + }) +} + +fn first_role_word(text: &str) -> Option> { + let mut end = 0_usize; + let mut saw = false; + let mut previous_was_hyphen = false; + while let Some((start, ch)) = next_char(text, end) { + if ch.is_alphabetic() { + saw = true; + previous_was_hyphen = false; + end = start.saturating_add(ch.len_utf8()); + continue; + } + if ch == '-' && saw { + previous_was_hyphen = true; + end = start.saturating_add(ch.len_utf8()); + continue; + } + if previous_was_hyphen { + end = start.saturating_sub('-'.len_utf8()); + } + break; + } + (saw && end > 0).then(|| Token { + start: 0, + end, + text: text.get(..end).unwrap_or_default(), + }) +} + +fn suffix_offset_in_text( + text: &str, + data: &PreparedLegalFormData, +) -> Option { + for suffix in &data.suffixes { + let Some(offset) = text.rfind(suffix) else { + continue; + }; + if offset.saturating_add(suffix.len()) >= text.len().saturating_sub(1) { + return Some(offset); + } + } + None +} + +fn preceding_word_is_sentence_verb( + full_text: &str, 
+ match_start: usize, + data: &PreparedLegalFormData, +) -> bool { + let window_start = match_start.saturating_sub(40); + let Some(before) = full_text.get(window_start..match_start) else { + return false; + }; + trailing_word(before).is_some_and(|word| { + data + .sentence_verb_indicators + .contains(&word.text.to_lowercase()) + }) +} + +fn is_structural_single_cap_match( + text: &str, + data: &PreparedLegalFormData, +) -> bool { + let mut tokens = text.split_whitespace(); + let Some(first) = tokens.next() else { + return false; + }; + let Some(second) = tokens.next() else { + return false; + }; + data + .structural_single_cap_prefixes + .contains(&first.to_lowercase()) + && is_single_cap_token(second.trim_matches(',')) +} + +fn is_bare_single_cap_structural_inner_match( + full_text: &str, + match_start: usize, + text: &str, + data: &PreparedLegalFormData, +) -> bool { + if !is_bare_single_cap_legal_form(text) { + return false; + } + token_before(full_text, match_start).is_some_and(|token| { + data + .structural_single_cap_prefixes + .contains(&token.text.to_lowercase()) + }) +} + +fn is_bare_single_cap_legal_form(text: &str) -> bool { + let Some(first) = text.chars().next() else { + return false; + }; + if !first.is_uppercase() { + return false; + } + let after_first = text.get(first.len_utf8()..).unwrap_or_default(); + after_first + .chars() + .next() + .is_some_and(|ch| is_inter_token_space(ch) || ch == ',') +} + +fn is_single_cap_token(text: &str) -> bool { + let mut chars = text.chars(); + let Some(first) = chars.next() else { + return false; + }; + first.is_uppercase() && chars.next().is_none() +} + +fn has_disallowed_line_break(text: &str) -> bool { + let mut search_start = 0_usize; + while let Some(relative) = + text.get(search_start..).and_then(|tail| tail.find('\n')) + { + let index = search_start.saturating_add(relative); + let before = text.get(..index).unwrap_or_default(); + let after = text.get(index.saturating_add(1)..).unwrap_or_default(); + let 
dotted_designator_before = + before.trim_end_matches(is_inter_token_space).ends_with('.'); + let after_trimmed = after.trim_matches(is_inter_token_space); + let legal_suffix_after = is_dotted_upper_suffix(after_trimmed); + let all_caps_suffix_after = after_trimmed + .trim_end_matches('.') + .chars() + .all(char::is_uppercase) + && after_trimmed.chars().any(char::is_uppercase); + if !dotted_designator_before + || (!legal_suffix_after && !all_caps_suffix_after) + { + return true; + } + search_start = index.saturating_add(1); + } + false +} + +fn is_dotted_upper_suffix(text: &str) -> bool { + let mut saw_upper = false; + for part in text.split('.') { + if part.is_empty() { + continue; + } + if !part.chars().all(char::is_uppercase) { + return false; + } + saw_upper = true; + } + saw_upper +} + +fn extend_backward( + full_text: &str, + match_start: usize, + data: &PreparedLegalFormData, + force_suffix_mode: bool, +) -> usize { + let head_word = leading_entity_word(full_text, match_start); + let suffix_mode = force_suffix_mode + || head_word.as_ref().is_some_and(|word| { + data.company_suffix_words.contains(&word.to_lowercase()) + }); + let mut pos = match_start; + + while let Some(found) = simple_word_before(full_text, pos) { + let word = found.text; + let lower = word.to_lowercase(); + let is_upper = starts_upper(word); + let is_connector = data.connector_words.contains(&lower); + let is_in_name_prep = + suffix_mode && data.in_name_prepositions.contains(&lower); + + if is_upper { + pos = found.start; + continue; + } + + if is_connector { + let Some(previous) = simple_word_before(full_text, found.start) else { + break; + }; + if !starts_upper(previous.text) + || is_known_boundary_suffix(previous.text, data) + { + break; + } + if data.and_connector_words.contains(&lower) { + let upper_before = + count_upper_words_before(full_text, found.start, suffix_mode, data); + let middle_initial = has_middle_initial_before(full_text, found.start); + if upper_before <= 1 + && (data + 
.clause_noun_heads + .contains(&previous.text.to_lowercase()) + || data + .connector_prose_heads + .contains(&previous.text.to_lowercase())) + { + break; + } + let person_name_boundary = if suffix_mode { + middle_initial && has_single_cap_prefix_before(full_text, match_start) + } else { + (upper_before == 2 + && !is_in_name_legal_form_word(previous.text, data)) + || middle_initial + }; + if person_name_boundary { + break; + } + } + pos = previous.start; + continue; + } + + if is_in_name_prep { + let Some(previous) = simple_word_before(full_text, found.start) else { + break; + }; + if !starts_upper(previous.text) { + break; + } + pos = previous.start; + continue; + } + + break; + } + + skip_initials_backward(full_text, pos) +} + +fn simple_word_before(text: &str, pos: usize) -> Option> { + let mut end = pos; + while let Some((prev_start, ch)) = previous_char(text, end) { + if ch == '\n' { + return None; + } + if ch.is_whitespace() { + end = prev_start; + continue; + } + break; + } + + let mut start = end; + while let Some((prev_start, ch)) = previous_char(text, start) { + if !(ch.is_alphabetic() || ch == '&') { + break; + } + start = prev_start; + } + + (start < end).then(|| Token { + start, + end, + text: text.get(start..end).unwrap_or_default(), + }) +} + +fn leading_entity_word(text: &str, start: usize) -> Option { + let mut end = start; + while let Some((ch_start, ch)) = next_char(text, end) { + if !(ch.is_alphabetic() || ch == '&') { + break; + } + end = ch_start.saturating_add(ch.len_utf8()); + } + (end > start).then(|| text.get(start..end).unwrap_or_default().to_owned()) +} + +fn count_upper_words_before( + full_text: &str, + pos: usize, + cross_in_name_preps: bool, + data: &PreparedLegalFormData, +) -> usize { + let mut count = 0_usize; + let mut scan = pos; + while scan > 0 { + let Some(found) = simple_word_before(full_text, scan) else { + break; + }; + if starts_upper(found.text) { + count = count.saturating_add(1); + scan = found.start; + continue; + } + 
if cross_in_name_preps + && data + .in_name_prepositions + .contains(&found.text.to_lowercase()) + { + let Some(previous) = simple_word_before(full_text, found.start) else { + break; + }; + if !starts_upper(previous.text) { + break; + } + scan = found.start; + continue; + } + break; + } + count +} + +fn has_single_cap_prefix_before(full_text: &str, match_start: usize) -> bool { + simple_word_before(full_text, match_start) + .is_some_and(|word| is_single_cap_token(word.text)) +} + +fn skip_initials_backward(full_text: &str, pos: usize) -> usize { + let mut scan = pos; + while let Some((prev_start, ch)) = previous_char(full_text, scan) { + if ch == '\n' || !ch.is_whitespace() { + break; + } + scan = prev_start; + } + let Some((dot_start, '.')) = previous_char(full_text, scan) else { + return pos; + }; + + let mut cursor = dot_start; + let mut start = dot_start; + let mut saw_two = false; + while let Some((letter_start, letter)) = previous_char(full_text, cursor) { + if !letter.is_uppercase() { + break; + } + start = letter_start; + let before_letter = previous_char(full_text, letter_start); + match before_letter { + Some((space_start, ch)) if is_inter_token_space(ch) => { + cursor = space_start; + } + Some((prev_dot_start, '.')) => { + saw_two = true; + cursor = prev_dot_start; + } + _ => break, + } + } + + if saw_two + && previous_char(full_text, start) + .is_none_or(|(_, ch)| !ch.is_alphanumeric()) + { + return start; + } + pos +} + +#[derive(Clone, Copy, Debug)] +struct Segment<'a> { + start: usize, + text: &'a str, +} + +fn split_embedded_legal_form_list<'a>( + entity_start: usize, + entity_text: &'a str, + data: &PreparedLegalFormData, +) -> Vec> { + let mut cuts = vec![0_usize]; + for suffix in &data.suffixes { + if is_roman_numeral(&clean_suffix(suffix)) { + continue; + } + let mut search_from = 0_usize; + while let Some(relative) = entity_text + .get(search_from..) 
+ .and_then(|tail| tail.find(suffix)) + { + let suffix_start = search_from.saturating_add(relative); + let suffix_end = suffix_start.saturating_add(suffix.len()); + search_from = suffix_end; + if suffix_end >= entity_text.len().saturating_sub(1) { + continue; + } + let Some(after) = entity_text.get(suffix_end..) else { + continue; + }; + let boundary_len = legal_list_boundary_len(after); + if boundary_len > 0 { + cuts.push(suffix_end.saturating_add(boundary_len)); + } + } + } + + cuts.sort_unstable(); + cuts.dedup(); + if cuts.len() == 1 { + return vec![Segment { + start: entity_start, + text: entity_text, + }]; + } + + let mut segments = Vec::new(); + for (index, start) in cuts.iter().enumerate() { + let end = cuts + .get(index.saturating_add(1)) + .copied() + .unwrap_or(entity_text.len()); + if *start >= end { + continue; + } + let Some(segment) = entity_text.get(*start..end) else { + continue; + }; + let trimmed = segment.trim_end_matches(|ch: char| { + ch.is_whitespace() || matches!(ch, ',' | ';') + }); + if trimmed.is_empty() || !ends_with_legal_suffix(trimmed, data) { + continue; + } + segments.push(Segment { + start: entity_start.saturating_add(*start), + text: trimmed, + }); + } + + segments +} + +fn legal_list_boundary_len(text: &str) -> usize { + let mut chars = text.char_indices(); + let Some((_, first)) = chars.next() else { + return 0; + }; + if !matches!(first, ',' | ';') { + return 0; + } + let mut end = first.len_utf8(); + let mut saw_space = false; + for (index, ch) in chars { + if ch.is_whitespace() { + saw_space = true; + end = index.saturating_add(ch.len_utf8()); + continue; + } + if saw_space && (ch.is_uppercase() || ch == '.') { + return end; + } + return 0; + } + 0 +} + +fn ends_with_legal_suffix(text: &str, data: &PreparedLegalFormData) -> bool { + data.suffixes.iter().any(|suffix| text.ends_with(suffix)) +} + +fn trim_embedded_legal_form_list_prefix<'a>( + entity_start: usize, + entity_text: &'a str, + data: &PreparedLegalFormData, +) -> 
(usize, &'a str) { + let mut cut = 0_usize; + for suffix in &data.suffixes { + if is_roman_numeral(&clean_suffix(suffix)) { + continue; + } + let mut search_from = 0_usize; + while let Some(relative) = entity_text + .get(search_from..) + .and_then(|tail| tail.find(suffix)) + { + let suffix_start = search_from.saturating_add(relative); + let suffix_end = suffix_start.saturating_add(suffix.len()); + search_from = suffix_end; + if suffix_end >= entity_text.len().saturating_sub(1) { + continue; + } + let Some(after) = entity_text.get(suffix_end..) else { + continue; + }; + let boundary_len = comma_upper_boundary_len(after); + if boundary_len == 0 { + continue; + } + let next_start = suffix_end.saturating_add(boundary_len); + if entity_text + .get(next_start..) + .is_some_and(|remainder| ends_with_legal_suffix(remainder, data)) + { + cut = cut.max(next_start); + } + } + } + + if cut == 0 { + return (entity_start, entity_text); + } + ( + entity_start.saturating_add(cut), + entity_text.get(cut..).unwrap_or_default(), + ) +} + +fn comma_upper_boundary_len(text: &str) -> usize { + let Some(stripped) = text.strip_prefix(',') else { + return 0; + }; + let ws_len = leading_ws_len(stripped); + if ws_len == 0 { + return 0; + } + let after_ws = stripped.get(ws_len..).unwrap_or_default(); + if after_ws.chars().next().is_some_and(char::is_uppercase) { + return ','.len_utf8().saturating_add(ws_len); + } + 0 +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct LeadingTrim { + offset: usize, +} + +fn trim_leading_clause( + text: &str, + data: &PreparedLegalFormData, +) -> LeadingTrim { + let lower = text.to_lowercase(); + let mut cut = 0_usize; + + for phrase in &data.leading_clause_phrases { + let mut search_from = 0_usize; + while let Some(relative) = + lower.get(search_from..).and_then(|tail| tail.find(phrase)) + { + let start = search_from.saturating_add(relative); + let end = start.saturating_add(phrase.len()); + search_from = end; + let before_ok = start == 0 + || lower + 
.get(..start) + .and_then(|prefix| prefix.chars().next_back()) + .is_some_and(char::is_whitespace); + let after_ws = lower.get(end..).map(leading_ws_len).unwrap_or_default(); + if before_ok && after_ws > 0 { + cut = cut.max(end.saturating_add(after_ws)); + } + } + } + + for prefix in &data.leading_clause_direct_prefixes { + let mut search_from = 0_usize; + while let Some(relative) = lower + .get(search_from..) + .and_then(|tail| find_word_at_boundary(tail, prefix)) + { + let start = search_from.saturating_add(relative); + let end = start.saturating_add(prefix.len()); + search_from = end; + let after_ws = lower.get(end..).map(leading_ws_len).unwrap_or_default(); + let after = lower + .get(end.saturating_add(after_ws)..) + .and_then(|suffix| suffix.chars().next()); + if after_ws == 0 || !after.is_some_and(char::is_uppercase) { + continue; + } + + let before = text.get(..start).unwrap_or_default(); + let prefix_lower = prefix.to_lowercase(); + if data.comma_gated_direct_prefixes.contains(&prefix_lower) { + let has_comma = before.trim_end().ends_with(','); + let has_sentence_verb = + word_tokens(before, 0, before.len()).iter().any(|word| { + starts_lower(word.text) + && data + .sentence_verb_indicators + .contains(&word.text.to_lowercase()) + }); + if !has_comma && !has_sentence_verb { + continue; + } + } + + let words = word_tokens(before, 0, before.len()); + let has_prose_prefix = + words.len() >= 3 && words.iter().any(|word| starts_lower(word.text)); + if has_prose_prefix { + cut = cut.max(end.saturating_add(after_ws)); + } + } + } + + for (comma, _) in text.match_indices(',') { + let before = text.get(..comma).unwrap_or_default(); + if !before.chars().any(|ch| ch.is_ascii_digit()) { + continue; + } + let after = text.get(comma.saturating_add(1)..).unwrap_or_default(); + let ws = leading_ws_len(after); + let candidate = after.get(ws..).unwrap_or_default(); + let upper_words = word_tokens(candidate, 0, candidate.len()) + .into_iter() + .filter(|word| 
starts_upper(word.text)) + .count(); + if upper_words >= 3 { + cut = cut.max(comma.saturating_add(1).saturating_add(ws)); + } + } + + LeadingTrim { offset: cut } +} + +fn find_word_at_boundary(haystack: &str, needle: &str) -> Option { + let mut from = 0_usize; + while let Some(relative) = haystack.get(from..)?.find(needle) { + let start = from.saturating_add(relative); + let end = start.saturating_add(needle.len()); + let left_ok = previous_char(haystack, start) + .is_none_or(|(_, ch)| !ch.is_alphanumeric()); + let right_ok = haystack + .get(end..) + .and_then(|suffix| suffix.chars().next()) + .is_none_or(|ch| !ch.is_alphanumeric()); + if left_ok && right_ok { + return Some(start); + } + from = end; + } + None +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct PrefixInfo { + end: usize, + part: String, +} + +fn prefix_info(text: &str) -> PrefixInfo { + let end = text.rfind(',').or_else(|| text.rfind(' ')).unwrap_or(0); + let source = if end > 0 { + text.get(..end).unwrap_or_default() + } else { + text + }; + PrefixInfo { + end, + part: source.chars().filter(|ch| ch.is_alphabetic()).collect(), + } +} + +fn has_roman_numeral_suffix(text: &str) -> bool { + let separator = last_suffix_separator(text); + let raw_suffix = separator + .and_then(|index| text.get(index.saturating_add(1)..)) + .unwrap_or_default(); + let suffix = clean_suffix(raw_suffix); + !suffix.is_empty() && is_roman_numeral(&suffix) +} + +fn short_ascii_suffix_collides_with_non_ascii_prefix(text: &str) -> bool { + let separator = last_suffix_separator(text); + let raw_suffix = separator + .and_then(|index| text.get(index.saturating_add(1)..)) + .unwrap_or_default(); + let suffix = clean_suffix(raw_suffix); + if suffix.len() > 2 || raw_suffix.contains('.') { + return false; + } + let prefix = separator + .and_then(|index| text.get(..index)) + .unwrap_or(text) + .chars() + .filter(|ch| !matches!(ch, '\u{00a0}' | '\u{202f}')) + .collect::(); + !prefix.is_ascii() +} + +fn last_suffix_separator(text: &str) 
-> Option { + text + .char_indices() + .filter_map(|(index, ch)| { + matches!(ch, ' ' | '\t' | '\u{00a0}' | '\u{202f}' | ',').then_some(index) + }) + .next_back() +} + +fn clean_suffix(text: &str) -> String { + text.chars().filter(|ch| !matches!(ch, '.' | ',')).collect() +} + +fn is_roman_numeral(text: &str) -> bool { + if text.is_empty() + || !text.chars().next().is_some_and(|ch| { + ch == 'I' + || ch == 'V' + || ch == 'X' + || ch == 'L' + || ch == 'C' + || ch == 'D' + || ch == 'M' + }) + { + return false; + } + + let bytes = text.as_bytes(); + let mut index = 0_usize; + + let _ = take_repeated(bytes, &mut index, b'M', 3); + + if consume_pair(bytes, &mut index, b'C', b'M') + || consume_pair(bytes, &mut index, b'C', b'D') + { + } else { + let _ = consume(bytes, &mut index, b'D'); + let _ = take_repeated(bytes, &mut index, b'C', 3); + } + + if consume_pair(bytes, &mut index, b'X', b'C') + || consume_pair(bytes, &mut index, b'X', b'L') + { + } else { + let _ = consume(bytes, &mut index, b'L'); + let _ = take_repeated(bytes, &mut index, b'X', 3); + } + + if consume_pair(bytes, &mut index, b'I', b'X') + || consume_pair(bytes, &mut index, b'I', b'V') + { + } else { + let _ = consume(bytes, &mut index, b'V'); + let _ = take_repeated(bytes, &mut index, b'I', 3); + } + + index == bytes.len() +} + +fn take_repeated( + bytes: &[u8], + index: &mut usize, + target: u8, + max: usize, +) -> usize { + let mut count = 0_usize; + while count < max && bytes.get(*index) == Some(&target) { + *index = index.saturating_add(1); + count = count.saturating_add(1); + } + count +} + +fn consume_pair( + bytes: &[u8], + index: &mut usize, + first: u8, + second: u8, +) -> bool { + if bytes.get(*index) != Some(&first) + || bytes.get(index.saturating_add(1)) != Some(&second) + { + return false; + } + *index = index.saturating_add(2); + true +} + +fn consume(bytes: &[u8], index: &mut usize, target: u8) -> bool { + if bytes.get(*index) != Some(&target) { + return false; + } + *index = 
index.saturating_add(1); + true +} + +fn trim_end_byte(text: &str) -> usize { + text.trim_end().len() +} + +fn leading_ws_len(text: &str) -> usize { + let mut len = 0_usize; + for ch in text.chars() { + if !ch.is_whitespace() { + break; + } + len = len.saturating_add(ch.len_utf8()); + } + len +} + +fn previous_char(text: &str, pos: usize) -> Option<(usize, char)> { + text.get(..pos)?.char_indices().next_back() +} + +fn next_char(text: &str, pos: usize) -> Option<(usize, char)> { + text + .get(pos..)? + .char_indices() + .next() + .map(|(relative, ch)| (pos.saturating_add(relative), ch)) +} + +fn lower_set(values: Vec) -> BTreeSet { + values + .into_iter() + .filter(|value| !value.is_empty()) + .map(|value| value.to_lowercase()) + .collect() +} + +fn lower_vec(values: Vec) -> Vec { + values + .into_iter() + .filter(|value| !value.is_empty()) + .map(|value| value.to_lowercase()) + .collect() +} diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index 0a0e9db5..5f8f077a 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -2,8 +2,15 @@ //! Core anonymization contracts shared by host-language bindings. 
+mod address_seeds; +mod anchored; +mod artifact_bytes; pub(crate) mod byte_offsets; +mod dates; mod diagnostics; +mod false_positives; +mod legal_forms; +mod money; pub(crate) mod normalize; mod placeholders; mod prepared; @@ -11,23 +18,35 @@ mod processors; mod redact; mod resolution; mod search; +mod signatures; +mod triggers; mod types; +mod validators; +pub use address_seeds::AddressSeedData; +pub use dates::DateData; pub use diagnostics::{ DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, StaticRedactionDiagnostics, }; +pub use legal_forms::LegalFormData; +pub use money::{ + AmountWordsData, CurrencyData, MagnitudeSuffixData, MonetaryData, + ShareQuantityTermData, WrittenAmountPatternData, +}; pub use normalize::normalize_for_search; pub use placeholders::build_placeholder_map; pub use prepared::{ - PreparedSearch, PreparedSearchBuildResult, PreparedSearchConfig, - PreparedSearchMatches, PreparedSearchSlices, StaticDetectionResult, - StaticRedactionDiagnosticResult, StaticRedactionResult, + PreparedSearch, PreparedSearchArtifacts, PreparedSearchBuildResult, + PreparedSearchConfig, PreparedSearchMatches, PreparedSearchSlices, + StaticDetectionResult, StaticRedactionDiagnosticResult, + StaticRedactionResult, }; pub use processors::{ CountryMatchData, DenyListFilterData, DenyListMatchData, GazetteerMatchData, - PatternSlice, RegexMatchMeta, process_country_matches, - process_deny_list_matches, process_gazetteer_matches, process_regex_matches, + PatternSlice, RegexMatchMeta, SigningPlaceGuardData, StringGroups, + process_country_matches, process_deny_list_matches, + process_gazetteer_matches, process_regex_matches, }; pub use redact::{deanonymise, redact_text}; pub use resolution::{ @@ -36,7 +55,10 @@ pub use resolution::{ }; pub use search::{ FuzzySearchOptions, LiteralSearchOptions, RegexSearchOptions, SearchIndex, - SearchOptions, SearchPattern, + SearchIndexArtifacts, SearchOptions, SearchPattern, +}; +pub use triggers::{ + TriggerData, TriggerRule, 
TriggerStrategy, TriggerValidation, }; pub use types::{ Entity, EntityKind, Error, OperatorConfig, OperatorEntry, OperatorType, diff --git a/crates/anonymize-core/src/money.rs b/crates/anonymize-core/src/money.rs new file mode 100644 index 00000000..559937fe --- /dev/null +++ b/crates/anonymize-core/src/money.rs @@ -0,0 +1,687 @@ +use std::collections::BTreeSet; + +use crate::anchored::{ + AnchorSpan, AnchorTerm, AnchoredExtractor, AnchoredRule, +}; +use crate::resolution::{DetectionSource, PipelineEntity}; +use crate::types::Result; + +const MONEY_LABEL: &str = "monetary amount"; +const MONEY_SCORE: f64 = 0.9; +const MAX_LEFT_SCAN_BYTES: usize = 96; + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct MonetaryData { + pub currencies: CurrencyData, + pub amount_words: AmountWordsData, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct CurrencyData { + pub codes: Vec, + pub symbols: Vec, + pub local_names: Vec, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct AmountWordsData { + pub written_amount_patterns: Vec, + pub magnitude_suffixes: Vec, + pub share_quantity_terms: Vec, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct WrittenAmountPatternData { + pub keywords: Vec, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct MagnitudeSuffixData { + pub words: Vec, + pub abbreviations_case_insensitive: Vec, + pub abbreviations_case_sensitive: Vec, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct ShareQuantityTermData { + pub modifiers: Vec, + pub nouns: Vec, +} + +pub(crate) struct PreparedMonetaryData { + extractor: AnchoredExtractor, +} + +impl PreparedMonetaryData { + pub(crate) fn new(data: MonetaryData) -> Result> { + AnchoredExtractor::new(MonetaryRule::new(data)) + .map(|extractor| extractor.map(|extractor| Self { extractor })) + } + + pub(crate) fn process(&self, full_text: &str) -> Result> { + self.extractor.extract(full_text) + } +} + +#[derive(Clone, Copy, Debug, Eq, 
PartialEq)] +enum AnchorKind { + Code, + Symbol, + LocalName, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct MagnitudeTerm { + text: String, + folded: String, + case_insensitive: bool, +} + +struct MonetaryRule { + codes: BTreeSet, + symbols: BTreeSet, + local_names: Vec, + magnitudes: Vec, + quantity_followers: Vec, + written_amount_keywords: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct CurrencyName { + text: String, + folded: String, + case_insensitive: bool, + whole_words: bool, +} + +impl MonetaryRule { + fn new(data: MonetaryData) -> Self { + let codes = clean_terms(data.currencies.codes) + .into_iter() + .collect::>(); + let symbols = clean_terms(data.currencies.symbols) + .into_iter() + .collect::>(); + let local_names = clean_terms(data.currencies.local_names) + .into_iter() + .map(currency_name) + .collect::>(); + let mut magnitudes = Vec::new(); + for entry in data.amount_words.magnitude_suffixes { + magnitudes.extend( + clean_terms(entry.words) + .into_iter() + .map(|text| magnitude_term(text, true)), + ); + magnitudes.extend( + clean_terms(entry.abbreviations_case_insensitive) + .into_iter() + .map(|text| magnitude_term(text, true)), + ); + magnitudes.extend( + clean_terms(entry.abbreviations_case_sensitive) + .into_iter() + .map(|text| magnitude_term(text, false)), + ); + } + magnitudes.sort_by_key(|term| std::cmp::Reverse(term.text.len())); + + let mut quantity_followers = Vec::new(); + for entry in data.amount_words.share_quantity_terms { + quantity_followers.extend(clean_terms(entry.modifiers)); + quantity_followers.extend(clean_terms(entry.nouns)); + } + quantity_followers.sort_by_key(|term| std::cmp::Reverse(term.len())); + + let mut written_amount_keywords = Vec::new(); + for entry in data.amount_words.written_amount_patterns { + written_amount_keywords.extend( + clean_terms(entry.keywords) + .into_iter() + .map(|term| term.to_lowercase()), + ); + } + written_amount_keywords.sort_by_key(|term| std::cmp::Reverse(term.len())); + 
+ Self { + codes, + symbols, + local_names, + magnitudes, + quantity_followers, + written_amount_keywords, + } + } + + fn classify_anchor(&self, text: &str) -> Option { + if self.symbols.contains(text) { + return Some(AnchorKind::Symbol); + } + if self.codes.contains(text) { + return Some(AnchorKind::Code); + } + + let folded = text.to_lowercase(); + self.local_names.iter().find_map(|name| { + if name.case_insensitive && name.folded == folded { + return Some(AnchorKind::LocalName); + } + (!name.case_insensitive && name.text == text) + .then_some(AnchorKind::LocalName) + }) + } +} + +impl AnchoredRule for MonetaryRule { + fn anchor_terms(&self) -> Vec { + let mut anchors = Vec::new(); + anchors.extend( + self + .codes + .iter() + .cloned() + .map(AnchorTerm::word_case_sensitive), + ); + anchors.extend(self.symbols.iter().cloned().map(AnchorTerm::symbol)); + anchors.extend(self.local_names.iter().map(|name| { + AnchorTerm::new( + name.text.clone(), + name.case_insensitive, + name.whole_words, + ) + })); + anchors + } + + fn extract( + &self, + full_text: &str, + anchor: AnchorSpan, + ) -> Result> { + let Some(anchor_text) = str_slice(full_text, anchor.start, anchor.end) + else { + return Ok(Vec::new()); + }; + let Some(kind) = self.classify_anchor(anchor_text) else { + return Ok(Vec::new()); + }; + + let mut entities = Vec::new(); + if let Some((start, end)) = + self.leading_amount_span(full_text, anchor, kind) + && let Some(entity) = money_entity(full_text, start, end) + { + entities.push(entity); + } + if let Some((start, end)) = + self.trailing_amount_span(full_text, anchor, kind) + && let Some(entity) = money_entity(full_text, start, end) + { + entities.push(entity); + } + + Ok(entities) + } +} + +impl MonetaryRule { + fn leading_amount_span( + &self, + text: &str, + anchor: AnchorSpan, + kind: AnchorKind, + ) -> Option<(usize, usize)> { + if !left_money_boundary(text, anchor.start, kind) { + return None; + } + + let number_start = skip_horizontal_ws_limit(text, 
anchor.end, 2); + let number = parse_number_forward(text, number_start)?; + let (end, _) = self + .parse_magnitude_forward(text, number.end) + .unwrap_or((number.end, false)); + right_money_boundary(text, end) + .then(|| (anchor.start, self.extend_written_amount(text, end))) + } + + fn trailing_amount_span( + &self, + text: &str, + anchor: AnchorSpan, + kind: AnchorKind, + ) -> Option<(usize, usize)> { + if !right_money_boundary(text, anchor.end) { + return None; + } + + let scan_start = char_boundary_before( + text, + anchor.start.saturating_sub(MAX_LEFT_SCAN_BYTES), + ); + let window = str_slice(text, scan_start, anchor.start)?; + let mut best = None; + + for (offset, ch) in window.char_indices() { + if !ch.is_ascii_digit() { + continue; + } + let number_start = scan_start.saturating_add(offset); + let number = parse_number_forward(text, number_start)?; + let (after_number, has_magnitude) = self + .parse_magnitude_forward(text, number.end) + .unwrap_or((number.end, false)); + let after_gap = skip_horizontal_ws_limit(text, after_number, 4); + if after_gap != anchor.start { + continue; + } + + let start = leading_symbol_start(text, number.start) + .filter(|value| left_money_boundary(text, *value, AnchorKind::Symbol)) + .unwrap_or(number.start); + if !left_money_boundary(text, start, kind) { + continue; + } + if has_magnitude + && kind != AnchorKind::Symbol + && self.has_quantity_follower(text, anchor.end) + { + continue; + } + let end = self.extend_written_amount(text, anchor.end); + if best.is_none_or(|(best_start, _)| start < best_start) { + best = Some((start, end)); + } + } + + best + } + + fn parse_magnitude_forward( + &self, + text: &str, + index: usize, + ) -> Option<(usize, bool)> { + let start = skip_horizontal_ws_limit(text, index, 8); + self.match_magnitude_at(text, start).map(|end| (end, true)) + } + + fn match_magnitude_at(&self, text: &str, index: usize) -> Option { + for term in &self.magnitudes { + let end = index.saturating_add(term.text.len()); + 
let Some(candidate) = str_slice(text, index, end) else { + continue; + }; + let matches = if term.case_insensitive { + candidate.to_lowercase() == term.folded + } else { + candidate == term.text + }; + if matches && right_word_boundary(text, end) { + return Some(end); + } + } + None + } + + fn has_quantity_follower(&self, text: &str, index: usize) -> bool { + let start = skip_horizontal_ws_limit(text, index, 16); + self.quantity_followers.iter().any(|term| { + let end = start.saturating_add(term.len()); + str_slice(text, start, end).is_some_and(|candidate| { + candidate.to_lowercase() == *term && right_word_boundary(text, end) + }) + }) + } + + fn extend_written_amount(&self, text: &str, index: usize) -> usize { + if self.written_amount_keywords.is_empty() { + return index; + } + + self.match_written_amount_at(text, index).unwrap_or(index) + } + + fn match_written_amount_at(&self, text: &str, index: usize) -> Option { + let after = str_tail(text, index)?; + let mut cursor = 0usize; + + if let Some(ch) = after.chars().next() + && matches!(ch, ',' | ';') + { + cursor = cursor.saturating_add(ch.len_utf8()); + } + + cursor = skip_horizontal_ws_limit(after, cursor, usize::MAX); + if after.get(cursor..)?.chars().next()? 
!= '(' { + return None; + } + + cursor = cursor.saturating_add('('.len_utf8()); + let keyword_end = self.match_written_amount_keyword(after, cursor)?; + cursor = keyword_end; + let separator = after.get(cursor..)?.chars().next()?; + if separator == '\n' || separator == '\r' { + return None; + } + if separator != ':' && !separator.is_whitespace() { + return None; + } + cursor = cursor.saturating_add(separator.len_utf8()); + + let mut content_chars = 0usize; + for (offset, ch) in after.get(cursor..)?.char_indices() { + if ch == '\n' || ch == '\r' { + return None; + } + if ch == ')' { + if content_chars == 0 || content_chars > 120 { + return None; + } + return Some( + index + .saturating_add(cursor) + .saturating_add(offset) + .saturating_add(ch.len_utf8()), + ); + } + content_chars = content_chars.saturating_add(1); + if content_chars > 120 { + return None; + } + } + + None + } + + fn match_written_amount_keyword( + &self, + text: &str, + index: usize, + ) -> Option { + for keyword in &self.written_amount_keywords { + let end = index.saturating_add(keyword.len()); + let Some(candidate) = str_slice(text, index, end) else { + continue; + }; + if candidate.to_lowercase() == *keyword { + return Some(end); + } + } + None + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct NumberSpan { + start: usize, + end: usize, +} + +fn parse_number_forward(text: &str, index: usize) -> Option { + let mut digits = 0usize; + let mut end = index; + let mut value_end = index; + + for (offset, ch) in str_tail(text, index)?.char_indices() { + let char_start = index.saturating_add(offset); + if char_start.saturating_sub(index) > 48 { + break; + } + + if ch.is_ascii_digit() { + digits = digits.saturating_add(1); + end = char_start.saturating_add(ch.len_utf8()); + value_end = end; + continue; + } + + if is_dash(ch) && digits > 0 { + value_end = char_start.saturating_add(ch.len_utf8()); + break; + } + + if is_number_separator(ch) + && number_separator_continues( + text, + 
char_start.saturating_add(ch.len_utf8()), + ch, + ) + { + end = char_start.saturating_add(ch.len_utf8()); + continue; + } + + break; + } + + if digits == 0 { + return None; + } + + Some(NumberSpan { + start: index, + end: value_end.max(end), + }) +} + +fn number_separator_continues( + text: &str, + index: usize, + separator: char, +) -> bool { + let mut saw_space = false; + for ch in str_tail(text, index) + .into_iter() + .flat_map(str::chars) + .take(2) + { + if ch == '\n' || ch == '\r' { + return false; + } + if ch.is_whitespace() { + saw_space = true; + continue; + } + if separator.is_whitespace() { + return ch.is_ascii_digit(); + } + return (!saw_space && ch.is_ascii_digit()) || is_dash(ch); + } + false +} + +fn money_entity( + full_text: &str, + start: usize, + end: usize, +) -> Option { + let start_u32 = u32::try_from(start).unwrap_or(u32::MAX); + let end_u32 = u32::try_from(end).unwrap_or(u32::MAX); + Some(PipelineEntity::detected( + start_u32, + end_u32, + MONEY_LABEL, + str_slice(full_text, start, end)?.to_owned(), + MONEY_SCORE, + DetectionSource::Regex, + )) +} + +fn leading_symbol_start(text: &str, number_start: usize) -> Option { + let before_number = skip_horizontal_ws_backward_limit(text, number_start, 2); + let (symbol_start, ch) = previous_char(text, before_number)?; + is_currency_symbol(ch).then_some(symbol_start) +} + +fn currency_name(text: String) -> CurrencyName { + let case_insensitive = is_ascii_phrase(&text) && text.chars().count() >= 3; + let whole_words = text + .chars() + .all(|ch| ch.is_alphanumeric() || ch.is_whitespace()); + CurrencyName { + folded: text.to_lowercase(), + text, + case_insensitive, + whole_words, + } +} + +fn magnitude_term(text: String, case_insensitive: bool) -> MagnitudeTerm { + MagnitudeTerm { + folded: text.to_lowercase(), + text, + case_insensitive, + } +} + +fn clean_terms(values: Vec) -> Vec { + values + .into_iter() + .map(|value| value.trim().to_owned()) + .filter(|value| !value.is_empty()) + .collect() +} + 
+fn left_money_boundary(text: &str, index: usize, kind: AnchorKind) -> bool { + if kind == AnchorKind::Symbol { + return true; + } + previous_char(text, index).is_none_or(|(_, ch)| !is_identifier_char(ch)) +} + +fn right_money_boundary(text: &str, index: usize) -> bool { + str_tail(text, index) + .and_then(|value| value.chars().next()) + .is_none_or(|ch| ch.is_whitespace() || ".,;!?)]}".contains(ch)) +} + +fn right_word_boundary(text: &str, index: usize) -> bool { + str_tail(text, index) + .and_then(|value| value.chars().next()) + .is_none_or(|ch| !is_identifier_char(ch)) +} + +fn is_ascii_phrase(text: &str) -> bool { + text + .chars() + .all(|ch| ch.is_ascii_alphabetic() || ch.is_whitespace()) +} + +fn is_identifier_char(ch: char) -> bool { + ch == '_' || ch.is_alphanumeric() +} + +const fn is_number_separator(ch: char) -> bool { + ch == ',' + || ch == '.' + || ch == '\'' + || (ch.is_whitespace() && ch != '\n' && ch != '\r') +} + +const fn is_dash(ch: char) -> bool { + matches!( + ch, + '-' + | '‐' + | '‑' + | '‒' + | '–' + | '—' + | '―' + | '⸺' + | '⸻' + | '⁃' + | '־' + | '−' + ) +} + +const fn is_currency_symbol(ch: char) -> bool { + matches!( + ch, + '$' + | '£' + | '¥' + | '৳' + | '₡' + | '₦' + | '₩' + | '₪' + | '₫' + | '€' + | '₭' + | '₮' + | '₱' + | '₲' + | '₴' + | '₵' + | '₸' + | '₹' + | '₺' + | '₼' + | '₽' + | '₾' + ) +} + +fn skip_horizontal_ws_limit( + text: &str, + mut index: usize, + max_chars: usize, +) -> usize { + let mut skipped = 0usize; + while skipped < max_chars { + let Some(ch) = str_tail(text, index).and_then(|value| value.chars().next()) + else { + break; + }; + if ch == '\n' || ch == '\r' || !ch.is_whitespace() { + break; + } + index = index.saturating_add(ch.len_utf8()); + skipped = skipped.saturating_add(1); + } + index +} + +fn skip_horizontal_ws_backward_limit( + text: &str, + mut index: usize, + max_chars: usize, +) -> usize { + let mut skipped = 0usize; + while skipped < max_chars { + let Some((char_start, ch)) = previous_char(text, 
index) else { + break; + }; + if ch == '\n' || ch == '\r' || !ch.is_whitespace() { + break; + } + index = char_start; + skipped = skipped.saturating_add(1); + } + index +} + +fn previous_char(text: &str, index: usize) -> Option<(usize, char)> { + str_head(text, index)?.char_indices().next_back() +} + +const fn char_boundary_before(text: &str, mut index: usize) -> usize { + while !text.is_char_boundary(index) { + index = index.saturating_sub(1); + } + index +} + +fn str_head(text: &str, index: usize) -> Option<&str> { + text.get(..index) +} + +fn str_tail(text: &str, index: usize) -> Option<&str> { + text.get(index..) +} + +fn str_slice(text: &str, start: usize, end: usize) -> Option<&str> { + text.get(start..end) +} diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index f46b2f5e..7a1a5e00 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -1,6 +1,14 @@ use std::time::Instant; +use crate::address_seeds::{AddressSeedData, PreparedAddressSeedData}; +use crate::artifact_bytes::{ArtifactReader, ArtifactWriter}; +use crate::dates::{DateData, PreparedDateData}; use crate::diagnostics::{DiagnosticStage, StaticRedactionDiagnostics}; +use crate::false_positives::filter_entity_false_positives; +use crate::legal_forms::{ + LegalFormData, PreparedLegalFormData, process_legal_form_matches, +}; +use crate::money::{MonetaryData, PreparedMonetaryData}; use crate::normalize::normalize_for_search_with_byte_map; use crate::processors::{ CountryMatchData, DenyListMatchData, GazetteerMatchData, PatternSlice, @@ -12,15 +20,27 @@ use crate::resolution::{ PipelineEntity, enforce_boundary_consistency, merge_and_dedup, sanitize_entities_with_source, }; -use crate::search::{SearchIndex, SearchOptions, SearchPattern}; +use crate::search::{ + LiteralSearchOptions, SearchIndex, SearchIndexArtifacts, SearchOptions, + SearchPattern, +}; +use crate::signatures::detect_signatures; +use crate::triggers::{ + 
PreparedTriggerData, TriggerData, process_trigger_matches, +}; use crate::types::{ Entity, EntityKind, Error, OperatorConfig, RedactionResult, Result, SearchMatch, }; +const PREPARED_SEARCH_ARTIFACTS_HEADER: [u8; 8] = *b"ANONPSR1"; +const PREPARED_SEARCH_ARTIFACTS_VERSION: u32 = 1; + pub struct PreparedSearch { regex: SearchIndex, custom_regex: SearchIndex, + legal_forms: SearchIndex, + triggers: SearchIndex, literals: SearchIndex, slices: PreparedSearchSlices, regex_meta: Vec, @@ -28,6 +48,11 @@ pub struct PreparedSearch { deny_list_data: Option, gazetteer_data: Option, country_data: Option, + trigger_data: Option, + legal_form_data: Option, + address_seed_data: Option, + date_data: Option, + monetary_data: Option, } #[derive(Clone, Debug, Default, Eq, PartialEq)] @@ -56,6 +81,75 @@ pub struct PreparedSearchConfig { pub deny_list_data: Option, pub gazetteer_data: Option, pub country_data: Option, + pub trigger_data: Option, + pub legal_form_data: Option, + pub address_seed_data: Option, + pub date_data: Option, + pub monetary_data: Option, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct PreparedSearchArtifacts { + pub regex: SearchIndexArtifacts, + pub custom_regex: SearchIndexArtifacts, + pub legal_forms: SearchIndexArtifacts, + pub triggers: SearchIndexArtifacts, + pub literals: SearchIndexArtifacts, +} + +impl PreparedSearchArtifacts { + pub fn to_bytes(&self) -> Result> { + let mut writer = ArtifactWriter::new( + PREPARED_SEARCH_ARTIFACTS_HEADER, + PREPARED_SEARCH_ARTIFACTS_VERSION, + ); + write_index_artifacts(&mut writer, "prepared.regex", &self.regex)?; + write_index_artifacts( + &mut writer, + "prepared.custom_regex", + &self.custom_regex, + )?; + write_index_artifacts( + &mut writer, + "prepared.legal_forms", + &self.legal_forms, + )?; + write_index_artifacts(&mut writer, "prepared.triggers", &self.triggers)?; + write_index_artifacts(&mut writer, "prepared.literals", &self.literals)?; + Ok(writer.into_bytes()) + } + + pub fn 
from_bytes(bytes: &[u8]) -> Result { + let mut reader = ArtifactReader::new( + bytes, + PREPARED_SEARCH_ARTIFACTS_HEADER, + PREPARED_SEARCH_ARTIFACTS_VERSION, + "prepared_search_artifacts", + )?; + let artifacts = Self { + regex: read_index_artifacts(&mut reader)?, + custom_regex: read_index_artifacts(&mut reader)?, + legal_forms: read_index_artifacts(&mut reader)?, + triggers: read_index_artifacts(&mut reader)?, + literals: read_index_artifacts(&mut reader)?, + }; + reader.finish()?; + Ok(artifacts) + } +} + +fn write_index_artifacts( + writer: &mut ArtifactWriter, + field: &'static str, + artifacts: &SearchIndexArtifacts, +) -> Result<()> { + writer.write_len_prefixed_bytes(field, &artifacts.to_bytes()?) +} + +fn read_index_artifacts( + reader: &mut ArtifactReader<'_>, +) -> Result { + SearchIndexArtifacts::from_bytes(reader.read_len_prefixed_bytes()?) } #[derive(Clone, Debug, Eq, PartialEq)] @@ -73,6 +167,11 @@ pub struct StaticDetectionResult { pub deny_list_entities: Vec, pub gazetteer_entities: Vec, pub country_entities: Vec, + pub anchored_entities: Vec, + pub trigger_entities: Vec, + pub signature_entities: Vec, + pub legal_form_entities: Vec, + pub address_seed_entities: Vec, } #[derive(Clone, Debug, PartialEq)] @@ -88,21 +187,126 @@ pub struct StaticRedactionDiagnosticResult { pub diagnostics: StaticRedactionDiagnostics, } +struct TimedEntities { + entities: Vec, + elapsed_us: u64, +} + +struct StaticEntityPasses { + regex: TimedEntities, + custom_regex: TimedEntities, + deny_list: TimedEntities, + gazetteer: TimedEntities, + country: TimedEntities, + anchored: TimedEntities, + trigger: TimedEntities, + signature: TimedEntities, + legal_form: TimedEntities, + address_seed: TimedEntities, +} + pub struct PreparedSearchBuildResult { pub prepared: PreparedSearch, pub diagnostics: StaticRedactionDiagnostics, } +struct RegexPatternGroups { + regex: Vec, + legal_forms: Vec, + triggers: Vec, +} + +type TimedSearchIndex = (SearchIndex, u64); + +struct 
PreparedSearchIndexes { + regex: TimedSearchIndex, + custom_regex: TimedSearchIndex, + legal_forms: TimedSearchIndex, + triggers: TimedSearchIndex, + literals: TimedSearchIndex, +} + +struct SearchIndexBuildInputs { + regex_patterns: Vec, + regex_options: SearchOptions, + custom_regex_patterns: Vec, + custom_regex_options: SearchOptions, + legal_form_patterns: Vec, + trigger_patterns: Vec, + literal_patterns: Vec, + literal_options: SearchOptions, +} + +#[derive(Clone, Copy)] +struct SearchIndexPrepareMetrics { + regex: (usize, u64), + custom_regex: (usize, u64), + legal_forms: (usize, u64), + triggers: (usize, u64), + literals: (usize, u64), +} + impl PreparedSearch { pub fn new(config: PreparedSearchConfig) -> Result { - Self::new_inner(config, None) + Self::new_inner(config, None, None) + } + + pub fn prepare_artifacts( + config: PreparedSearchConfig, + ) -> Result { + validate_supported_config(&config)?; + let regex_groups = + split_regex_patterns(config.regex_patterns, &config.slices)?; + Ok(PreparedSearchArtifacts { + regex: SearchIndex::prepare_artifacts( + regex_groups.regex, + config.regex_options, + )?, + custom_regex: SearchIndex::prepare_artifacts( + config.custom_regex_patterns, + config.custom_regex_options, + )?, + legal_forms: SearchIndex::prepare_artifacts( + regex_groups.legal_forms, + legal_form_search_options(), + )?, + triggers: SearchIndex::prepare_artifacts( + promote_case_insensitive_literals(regex_groups.triggers), + trigger_search_options(), + )?, + literals: SearchIndex::prepare_artifacts( + config.literal_patterns, + config.literal_options, + )?, + }) + } + + pub fn new_with_artifacts( + config: PreparedSearchConfig, + artifacts: &PreparedSearchArtifacts, + ) -> Result { + Self::new_inner(config, None, Some(artifacts)) + } + + pub fn new_with_artifacts_diagnostics( + config: PreparedSearchConfig, + artifacts: &PreparedSearchArtifacts, + ) -> Result { + let mut diagnostics = StaticRedactionDiagnostics::default(); + let prepared = + 
Self::new_inner(config, Some(&mut diagnostics), Some(artifacts))?; + + Ok(PreparedSearchBuildResult { + prepared, + diagnostics, + }) } pub fn new_with_diagnostics( config: PreparedSearchConfig, ) -> Result { let mut diagnostics = StaticRedactionDiagnostics::default(); - let prepared = Self::new_inner(config, Some(&mut diagnostics))?; + let prepared = Self::new_inner(config, Some(&mut diagnostics), None)?; Ok(PreparedSearchBuildResult { prepared, @@ -113,70 +317,100 @@ impl PreparedSearch { fn new_inner( config: PreparedSearchConfig, mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + artifacts: Option<&PreparedSearchArtifacts>, ) -> Result { let total_start = Instant::now(); validate_supported_config(&config)?; - let regex_len = config.regex_patterns.len(); + let slices = config.slices.clone(); + let regex_groups = split_regex_patterns(config.regex_patterns, &slices)?; + let regex_len = regex_groups.regex.len(); let custom_regex_len = config.custom_regex_patterns.len(); + let anchored_len = anchored_config_len( + config.date_data.as_ref(), + config.monetary_data.as_ref(), + ); + let legal_form_len = regex_groups.legal_forms.len(); + let trigger_len = regex_groups.triggers.len(); let literal_len = config.literal_patterns.len(); - let regex_start = Instant::now(); - let regex = SearchIndex::new(config.regex_patterns, config.regex_options)?; - if let Some(diagnostics) = &mut diagnostics { - diagnostics.record_stage( - DiagnosticStage::PrepareRegex, - Some(regex_len), - Some(elapsed_us(regex_start)), - None, - ); - } - - let custom_regex_start = Instant::now(); - let custom_regex = SearchIndex::new( - config.custom_regex_patterns, - config.custom_regex_options, + let (date_data, monetary_data) = prepare_anchored_data( + config.date_data.as_ref(), + config.monetary_data, + anchored_len, + diagnostics.as_deref_mut(), )?; - if let Some(diagnostics) = &mut diagnostics { - diagnostics.record_stage( - DiagnosticStage::PrepareCustomRegex, - Some(custom_regex_len), - 
Some(elapsed_us(custom_regex_start)), - None, - ); - } - let literal_start = Instant::now(); - let literals = - SearchIndex::new(config.literal_patterns, config.literal_options)?; - if let Some(diagnostics) = &mut diagnostics { - diagnostics.record_stage( - DiagnosticStage::PrepareLiteral, - Some(literal_len), - Some(elapsed_us(literal_start)), - None, - ); - diagnostics.record_stage( - DiagnosticStage::PrepareTotal, - Some( - regex_len - .saturating_add(custom_regex_len) - .saturating_add(literal_len), + let indexes = build_search_indexes( + SearchIndexBuildInputs { + regex_patterns: regex_groups.regex, + regex_options: config.regex_options, + custom_regex_patterns: config.custom_regex_patterns, + custom_regex_options: config.custom_regex_options, + legal_form_patterns: regex_groups.legal_forms, + trigger_patterns: promote_case_insensitive_literals( + regex_groups.triggers, ), - Some(elapsed_us(total_start)), - None, - ); - } + literal_patterns: config.literal_patterns, + literal_options: config.literal_options, + }, + artifacts, + )?; + let ( + (regex, regex_elapsed), + (custom_regex, custom_regex_elapsed), + (legal_forms, legal_forms_elapsed), + (triggers, triggers_elapsed), + (literals, literals_elapsed), + ) = ( + indexes.regex, + indexes.custom_regex, + indexes.legal_forms, + indexes.triggers, + indexes.literals, + ); + record_search_index_prepare_stages( + &mut diagnostics, + &SearchIndexPrepareMetrics { + regex: (regex_len, regex_elapsed), + custom_regex: (custom_regex_len, custom_regex_elapsed), + legal_forms: (legal_form_len, legal_forms_elapsed), + triggers: (trigger_len, triggers_elapsed), + literals: (literal_len, literals_elapsed), + }, + ); + record_prepare_total( + &mut diagnostics, + [ + regex_len, + custom_regex_len, + anchored_len, + legal_form_len, + trigger_len, + literal_len, + ], + total_start, + ); Ok(Self { regex, custom_regex, + legal_forms, + triggers, literals, - slices: config.slices, + slices, regex_meta: config.regex_meta, 
custom_regex_meta: config.custom_regex_meta, deny_list_data: config.deny_list_data, gazetteer_data: config.gazetteer_data, country_data: config.country_data, + trigger_data: config + .trigger_data + .map(PreparedTriggerData::new) + .transpose()?, + legal_form_data: config.legal_form_data.map(PreparedLegalFormData::new), + address_seed_data: prepare_address_seed_data(config.address_seed_data)?, + date_data, + monetary_data, }) } @@ -202,7 +436,10 @@ impl PreparedSearch { } let regex_start = Instant::now(); - let regex = self.regex.find_iter(full_text)?; + let regex = offset_matches( + self.regex.find_iter(full_text)?, + self.slices.regex.start, + )?; if let Some(diagnostics) = &mut diagnostics { diagnostics.record_search_matches( DiagnosticStage::SearchRegex, @@ -212,8 +449,39 @@ impl PreparedSearch { ); } + let legal_form_start = Instant::now(); + let legal_forms = offset_matches( + self.legal_forms.find_iter(full_text)?, + self.slices.legal_forms.start, + )?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_search_matches( + DiagnosticStage::SearchLegalForm, + &legal_forms, + full_text, + Some(elapsed_us(legal_form_start)), + ); + } + + let trigger_start = Instant::now(); + let triggers = offset_matches( + self.triggers.find_iter(full_text)?, + self.slices.triggers.start, + )?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_search_matches( + DiagnosticStage::SearchTrigger, + &triggers, + full_text, + Some(elapsed_us(trigger_start)), + ); + } + let custom_regex_start = Instant::now(); - let custom_regex = self.custom_regex.find_iter(full_text)?; + let custom_regex = offset_matches( + self.custom_regex.find_iter(full_text)?, + self.slices.custom_regex.start, + )?; if let Some(diagnostics) = &mut diagnostics { diagnostics.record_search_matches( DiagnosticStage::SearchCustomRegex, @@ -230,6 +498,7 @@ impl PreparedSearch { .into_iter() .map(|found| remap_normalized_match(&normalized, found)) .collect::>>()?; + let regex = 
combine_regex_matches(regex, legal_forms, triggers); if let Some(diagnostics) = &mut diagnostics { diagnostics.record_search_matches( DiagnosticStage::SearchLiteral, @@ -271,104 +540,231 @@ impl PreparedSearch { ) -> Result { let matches = self.find_matches_inner(full_text, diagnostics.as_deref_mut())?; + let passes = self.process_static_entity_passes(&matches, full_text)?; + + if let Some(diagnostics) = &mut diagnostics { + record_static_entity_diagnostics(diagnostics, full_text, &passes); + } + + Ok(StaticDetectionResult { + matches, + regex_entities: passes.regex.entities, + custom_regex_entities: passes.custom_regex.entities, + deny_list_entities: passes.deny_list.entities, + gazetteer_entities: passes.gazetteer.entities, + country_entities: passes.country.entities, + anchored_entities: passes.anchored.entities, + trigger_entities: passes.trigger.entities, + signature_entities: passes.signature.entities, + legal_form_entities: passes.legal_form.entities, + address_seed_entities: passes.address_seed.entities, + }) + } + fn process_static_entity_passes( + &self, + matches: &PreparedSearchMatches, + full_text: &str, + ) -> Result { let regex_start = Instant::now(); - let regex_entities = process_regex_matches( - &matches.regex, - self.slices.regex, - full_text, - &self.regex_meta, - )?; - let regex_elapsed_us = elapsed_us(regex_start); + let regex = TimedEntities { + entities: process_regex_matches( + &matches.regex, + self.slices.regex, + full_text, + &self.regex_meta, + )?, + elapsed_us: elapsed_us(regex_start), + }; let custom_regex_start = Instant::now(); - let custom_regex_entities = process_regex_matches( - &matches.custom_regex, - self.slices.custom_regex, + let custom_regex = TimedEntities { + entities: process_regex_matches( + &matches.custom_regex, + self.slices.custom_regex, + full_text, + &self.custom_regex_meta, + )?, + elapsed_us: elapsed_us(custom_regex_start), + }; + + let deny_list_start = Instant::now(); + let deny_list = TimedEntities { + 
entities: if let Some(data) = &self.deny_list_data { + process_deny_list_matches( + &matches.literal, + self.slices.deny_list, + full_text, + data, + )? + } else { + Vec::new() + }, + elapsed_us: elapsed_us(deny_list_start), + }; + + let gazetteer_start = Instant::now(); + let gazetteer = TimedEntities { + entities: if let Some(data) = &self.gazetteer_data { + process_gazetteer_matches( + &matches.literal, + self.slices.gazetteer, + full_text, + data, + )? + } else { + Vec::new() + }, + elapsed_us: elapsed_us(gazetteer_start), + }; + + let country = self.process_country_entities(matches, full_text)?; + + let anchored = self.process_anchored_entities(full_text)?; + + let trigger = self.process_trigger_entities(matches, full_text)?; + + let signature = process_signature_entities(full_text); + + let legal_form = self.process_legal_form_entities(matches, full_text)?; + + let address_seed = self.process_address_seed_entities( + matches, full_text, - &self.custom_regex_meta, + &[ + ®ex.entities, + &custom_regex.entities, + &anchored.entities, + &trigger.entities, + &signature.entities, + &legal_form.entities, + &deny_list.entities, + &gazetteer.entities, + ], )?; - let custom_regex_elapsed_us = elapsed_us(custom_regex_start); - let deny_list_start = Instant::now(); - let deny_list_entities = if let Some(data) = &self.deny_list_data { - process_deny_list_matches( - &matches.literal, - self.slices.deny_list, + Ok(StaticEntityPasses { + regex, + custom_regex, + deny_list, + gazetteer, + country, + anchored, + trigger, + signature, + legal_form, + address_seed, + }) + } + + fn process_anchored_entities( + &self, + full_text: &str, + ) -> Result { + let anchored_start = Instant::now(); + let mut entities = Vec::new(); + if let Some(data) = &self.date_data { + entities.extend(data.process(full_text)?); + } + if let Some(data) = &self.monetary_data { + entities.extend(data.process(full_text)?); + } + + Ok(TimedEntities { + entities, + elapsed_us: elapsed_us(anchored_start), + 
}) + } + + fn process_trigger_entities( + &self, + matches: &PreparedSearchMatches, + full_text: &str, + ) -> Result { + let start = Instant::now(); + let entities = if let Some(data) = &self.trigger_data { + process_trigger_matches( + &matches.regex, + self.slices.triggers, full_text, data, )? } else { Vec::new() }; - let deny_list_elapsed_us = elapsed_us(deny_list_start); - let gazetteer_start = Instant::now(); - let gazetteer_entities = if let Some(data) = &self.gazetteer_data { - process_gazetteer_matches( - &matches.literal, - self.slices.gazetteer, + Ok(TimedEntities { + entities, + elapsed_us: elapsed_us(start), + }) + } + + fn process_legal_form_entities( + &self, + matches: &PreparedSearchMatches, + full_text: &str, + ) -> Result { + let start = Instant::now(); + let entities = if let Some(data) = &self.legal_form_data { + process_legal_form_matches( + &matches.regex, + self.slices.legal_forms, full_text, data, )? } else { Vec::new() }; - let gazetteer_elapsed_us = elapsed_us(gazetteer_start); - let country_start = Instant::now(); - let country_entities = if let Some(data) = &self.country_data { - process_country_matches( + Ok(TimedEntities { + entities, + elapsed_us: elapsed_us(start), + }) + } + + fn process_address_seed_entities( + &self, + matches: &PreparedSearchMatches, + full_text: &str, + context_layers: &[&[PipelineEntity]], + ) -> Result { + let start = Instant::now(); + let entities = if let Some(data) = &self.address_seed_data { + let existing_entities = address_seed_context(context_layers); + data.process( &matches.literal, - self.slices.countries, + self.slices.street_types, full_text, - data, + &existing_entities, )? 
} else { Vec::new() }; - let country_elapsed_us = elapsed_us(country_start); - if let Some(diagnostics) = &mut diagnostics { - diagnostics.record_entities( - DiagnosticStage::EntityRegex, - ®ex_entities, - full_text, - Some(regex_elapsed_us), - ); - diagnostics.record_entities( - DiagnosticStage::EntityCustomRegex, - &custom_regex_entities, - full_text, - Some(custom_regex_elapsed_us), - ); - diagnostics.record_entities( - DiagnosticStage::EntityDenyList, - &deny_list_entities, - full_text, - Some(deny_list_elapsed_us), - ); - diagnostics.record_entities( - DiagnosticStage::EntityGazetteer, - &gazetteer_entities, - full_text, - Some(gazetteer_elapsed_us), - ); - diagnostics.record_entities( - DiagnosticStage::EntityCountry, - &country_entities, - full_text, - Some(country_elapsed_us), - ); - } + Ok(TimedEntities { + entities, + elapsed_us: elapsed_us(start), + }) + } - Ok(StaticDetectionResult { - matches, - regex_entities, - custom_regex_entities, - deny_list_entities, - gazetteer_entities, - country_entities, + fn process_country_entities( + &self, + matches: &PreparedSearchMatches, + full_text: &str, + ) -> Result { + let country_start = Instant::now(); + Ok(TimedEntities { + entities: if let Some(data) = &self.country_data { + process_country_matches( + &matches.literal, + self.slices.countries, + full_text, + data, + )? 
+ } else { + Vec::new() + }, + elapsed_us: elapsed_us(country_start), }) } @@ -428,8 +824,16 @@ impl PreparedSearch { ); } let sanitize_start = Instant::now(); - let resolved_entities = + let sanitized_entities = sanitize_entities_with_source(&consistent, full_text)?; + let resolved_entities = filter_entity_false_positives( + sanitized_entities, + full_text, + self + .deny_list_data + .as_ref() + .and_then(|data| data.filters.as_ref()), + )?; if let Some(diagnostics) = &mut diagnostics { diagnostics.record_entities( DiagnosticStage::Sanitize, @@ -460,11 +864,406 @@ impl PreparedSearch { } } +fn process_signature_entities(full_text: &str) -> TimedEntities { + let start = Instant::now(); + TimedEntities { + entities: detect_signatures(full_text), + elapsed_us: elapsed_us(start), + } +} + +fn record_static_entity_diagnostics( + diagnostics: &mut StaticRedactionDiagnostics, + full_text: &str, + passes: &StaticEntityPasses, +) { + diagnostics.record_entities( + DiagnosticStage::EntityRegex, + &passes.regex.entities, + full_text, + Some(passes.regex.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityCustomRegex, + &passes.custom_regex.entities, + full_text, + Some(passes.custom_regex.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityDenyList, + &passes.deny_list.entities, + full_text, + Some(passes.deny_list.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityGazetteer, + &passes.gazetteer.entities, + full_text, + Some(passes.gazetteer.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityCountry, + &passes.country.entities, + full_text, + Some(passes.country.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityAnchored, + &passes.anchored.entities, + full_text, + Some(passes.anchored.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityTrigger, + &passes.trigger.entities, + full_text, + Some(passes.trigger.elapsed_us), + ); + 
diagnostics.record_entities( + DiagnosticStage::EntitySignature, + &passes.signature.entities, + full_text, + Some(passes.signature.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityLegalForm, + &passes.legal_form.entities, + full_text, + Some(passes.legal_form.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityAddressSeed, + &passes.address_seed.entities, + full_text, + Some(passes.address_seed.elapsed_us), + ); +} + +fn address_seed_context(layers: &[&[PipelineEntity]]) -> Vec { + let capacity = layers + .iter() + .map(|layer| layer.len()) + .fold(0usize, usize::saturating_add); + let mut entities = Vec::with_capacity(capacity); + for layer in layers { + entities.extend(layer.iter().cloned()); + } + entities +} + fn elapsed_us(start: Instant) -> u64 { let micros = start.elapsed().as_micros(); u64::try_from(micros).unwrap_or(u64::MAX) } +fn build_search_indexes( + inputs: SearchIndexBuildInputs, + artifacts: Option<&PreparedSearchArtifacts>, +) -> Result { + let SearchIndexBuildInputs { + regex_patterns, + regex_options, + custom_regex_patterns, + custom_regex_options, + legal_form_patterns, + trigger_patterns, + literal_patterns, + literal_options, + } = inputs; + + let regex_artifacts = artifacts.map(|value| &value.regex); + let custom_regex_artifacts = artifacts.map(|value| &value.custom_regex); + let legal_form_artifacts = artifacts.map(|value| &value.legal_forms); + let trigger_artifacts = artifacts.map(|value| &value.triggers); + let literal_artifacts = artifacts.map(|value| &value.literals); + + std::thread::scope(|scope| { + let regex = scope.spawn(move || { + build_search_index(regex_patterns, regex_options, regex_artifacts) + }); + let custom_regex = scope.spawn(move || { + build_search_index( + custom_regex_patterns, + custom_regex_options, + custom_regex_artifacts, + ) + }); + let legal_forms = scope.spawn(move || { + build_search_index( + legal_form_patterns, + legal_form_search_options(), + 
legal_form_artifacts, + ) + }); + let triggers = scope.spawn(move || { + build_search_index( + trigger_patterns, + trigger_search_options(), + trigger_artifacts, + ) + }); + let literals = scope.spawn(move || { + build_search_index(literal_patterns, literal_options, literal_artifacts) + }); + + Ok(PreparedSearchIndexes { + regex: join_search_index(regex, "regex")?, + custom_regex: join_search_index(custom_regex, "custom_regex")?, + legal_forms: join_search_index(legal_forms, "legal_forms")?, + triggers: join_search_index(triggers, "triggers")?, + literals: join_search_index(literals, "literals")?, + }) + }) +} + +fn build_search_index( + patterns: Vec, + options: SearchOptions, + artifacts: Option<&SearchIndexArtifacts>, +) -> Result { + let start = Instant::now(); + let search = if let Some(artifacts) = artifacts { + SearchIndex::new_with_artifacts(patterns, options, artifacts)? + } else { + SearchIndex::new(patterns, options)? + }; + Ok((search, elapsed_us(start))) +} + +fn join_search_index( + handle: std::thread::ScopedJoinHandle<'_, Result>, + field: &'static str, +) -> Result { + handle.join().map_err(|_| Error::InvalidStaticData { + field, + reason: "search index builder panicked".to_owned(), + })? 
+} + +fn record_prepare_stage_elapsed( + diagnostics: &mut Option<&mut StaticRedactionDiagnostics>, + stage: DiagnosticStage, + count: usize, + elapsed_us: u64, +) { + if let Some(diagnostics) = diagnostics { + diagnostics.record_stage(stage, Some(count), Some(elapsed_us), None); + } +} + +fn record_search_index_prepare_stages( + diagnostics: &mut Option<&mut StaticRedactionDiagnostics>, + metrics: &SearchIndexPrepareMetrics, +) { + let stages = [ + (DiagnosticStage::PrepareRegex, metrics.regex), + (DiagnosticStage::PrepareCustomRegex, metrics.custom_regex), + (DiagnosticStage::PrepareLegalFormSearch, metrics.legal_forms), + (DiagnosticStage::PrepareTriggerSearch, metrics.triggers), + (DiagnosticStage::PrepareLiteral, metrics.literals), + ]; + for (stage, (count, elapsed)) in stages { + record_prepare_stage_elapsed(diagnostics, stage, count, elapsed); + } +} + +fn record_prepare_total( + diagnostics: &mut Option<&mut StaticRedactionDiagnostics>, + counts: [usize; 6], + start: Instant, +) { + let Some(diagnostics) = diagnostics else { + return; + }; + let count = counts.into_iter().fold(0usize, usize::saturating_add); + diagnostics.record_stage( + DiagnosticStage::PrepareTotal, + Some(count), + Some(elapsed_us(start)), + None, + ); +} + +fn anchored_config_len( + date_data: Option<&DateData>, + monetary_data: Option<&MonetaryData>, +) -> usize { + let date_len = date_data.map_or(0, |data| { + data.month_names_by_language.values().map(Vec::len).sum() + }); + let monetary_len = monetary_data.map_or(0, |data| { + data + .currencies + .codes + .len() + .saturating_add(data.currencies.symbols.len()) + .saturating_add(data.currencies.local_names.len()) + }); + date_len.saturating_add(monetary_len) +} + +fn prepare_anchored_data( + date_data: Option<&DateData>, + monetary_data: Option, + anchored_len: usize, + diagnostics: Option<&mut StaticRedactionDiagnostics>, +) -> Result<(Option, Option)> { + let anchored_start = Instant::now(); + let prepared_date = if let Some(data) 
= date_data { + PreparedDateData::new(data)? + } else { + None + }; + let prepared_monetary = if let Some(data) = monetary_data { + PreparedMonetaryData::new(data)? + } else { + None + }; + + if let Some(diagnostics) = diagnostics { + diagnostics.record_stage( + DiagnosticStage::PrepareAnchored, + Some(anchored_len), + Some(elapsed_us(anchored_start)), + None, + ); + } + + Ok((prepared_date, prepared_monetary)) +} + +fn prepare_address_seed_data( + data: Option, +) -> Result> { + data.map(PreparedAddressSeedData::new).transpose() +} + +fn split_regex_patterns( + patterns: Vec, + slices: &PreparedSearchSlices, +) -> Result { + let mut regex = Vec::new(); + let mut legal_forms = Vec::new(); + let mut triggers = Vec::new(); + + for (index, pattern) in patterns.into_iter().enumerate() { + let pattern_index = u32::try_from(index) + .map_err(|_| Error::PatternIndexOutOfRange { index })?; + if slices.legal_forms.contains(pattern_index) { + legal_forms.push(pattern); + continue; + } + if slices.triggers.contains(pattern_index) { + triggers.push(pattern); + continue; + } + regex.push(pattern); + } + + Ok(RegexPatternGroups { + regex, + legal_forms, + triggers, + }) +} + +fn legal_form_search_options() -> SearchOptions { + SearchOptions::default() +} + +fn trigger_search_options() -> SearchOptions { + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + } +} + +fn promote_case_insensitive_literals( + patterns: Vec, +) -> Vec { + patterns + .into_iter() + .map(|entry| match entry { + SearchPattern::LiteralWithOptions { + pattern: value, + case_insensitive: Some(true), + whole_words, + } if whole_words != Some(true) => SearchPattern::Literal(value), + other => other, + }) + .collect() +} + +fn offset_matches( + matches: Vec, + offset: u32, +) -> Result> { + if offset == 0 { + return Ok(matches); + } + + matches + .into_iter() + .map(|found| offset_match(found, offset)) + .collect() +} + +fn 
offset_match(found: SearchMatch, offset: u32) -> Result { + let pattern = found.pattern().checked_add(offset).ok_or_else(|| { + Error::PatternIndexNotAddressable { + pattern: found.pattern(), + } + })?; + + Ok(match found { + SearchMatch::Literal { start, end, .. } => SearchMatch::Literal { + pattern, + start, + end, + }, + SearchMatch::Regex { start, end, .. } => SearchMatch::Regex { + pattern, + start, + end, + }, + SearchMatch::Fuzzy { + start, + end, + distance, + .. + } => SearchMatch::Fuzzy { + pattern, + start, + end, + distance, + }, + }) +} + +fn combine_regex_matches( + mut regex: Vec, + legal_forms: Vec, + triggers: Vec, +) -> Vec { + regex.extend(legal_forms); + regex.extend(triggers); + sort_matches(&mut regex); + regex +} + +fn sort_matches(matches: &mut [SearchMatch]) { + matches.sort_by(|left, right| { + left + .start() + .cmp(&right.start()) + .then_with(|| left.end().cmp(&right.end())) + .then_with(|| left.pattern().cmp(&right.pattern())) + }); +} + fn remap_normalized_match( normalized: &crate::normalize::NormalizedSearchText, found: SearchMatch, @@ -474,21 +1273,28 @@ fn remap_normalized_match( } fn validate_supported_config(config: &PreparedSearchConfig) -> Result<()> { - reject_unsupported_slice(config.slices.legal_forms, "legal_forms")?; - reject_unsupported_slice(config.slices.triggers, "triggers")?; + validate_legal_form_config(config)?; + validate_trigger_config(config)?; validate_deny_list_config(config)?; - reject_unsupported_slice(config.slices.street_types, "street_types") + validate_address_seed_config(config) } -const fn reject_unsupported_slice( - slice: PatternSlice, - name: &'static str, -) -> Result<()> { - if slice.is_empty() { +fn validate_legal_form_config(config: &PreparedSearchConfig) -> Result<()> { + if config.slices.legal_forms.is_empty() { return Ok(()); } - Err(Error::UnsupportedStaticSlice { slice: name }) + let Some(data) = &config.legal_form_data else { + return Err(Error::MissingStaticData { + field: 
"legal_form_data", + }); + }; + + validate_static_data_length( + "legal_form_data.suffixes", + config.slices.legal_forms, + data.suffixes.len(), + ) } fn validate_deny_list_config(config: &PreparedSearchConfig) -> Result<()> { @@ -523,6 +1329,40 @@ fn validate_deny_list_config(config: &PreparedSearchConfig) -> Result<()> { ensure_supported_deny_list_sources(data) } +const fn validate_address_seed_config( + config: &PreparedSearchConfig, +) -> Result<()> { + if config.slices.street_types.is_empty() { + return Ok(()); + } + + if config.address_seed_data.is_some() { + return Ok(()); + } + + Err(Error::MissingStaticData { + field: "address_seed_data", + }) +} + +fn validate_trigger_config(config: &PreparedSearchConfig) -> Result<()> { + if config.slices.triggers.is_empty() { + return Ok(()); + } + + let Some(data) = &config.trigger_data else { + return Err(Error::MissingStaticData { + field: "trigger_data", + }); + }; + + validate_static_data_length( + "trigger_data.rules", + config.slices.triggers, + data.rules.len(), + ) +} + fn validate_static_data_length( field: &'static str, slice: PatternSlice, @@ -555,13 +1395,23 @@ impl StaticDetectionResult { .saturating_add(self.custom_regex_entities.len()) .saturating_add(self.deny_list_entities.len()) .saturating_add(self.gazetteer_entities.len()) - .saturating_add(self.country_entities.len()); + .saturating_add(self.country_entities.len()) + .saturating_add(self.anchored_entities.len()) + .saturating_add(self.trigger_entities.len()) + .saturating_add(self.signature_entities.len()) + .saturating_add(self.legal_form_entities.len()) + .saturating_add(self.address_seed_entities.len()); let mut entities = Vec::with_capacity(capacity); entities.extend(self.regex_entities.iter().cloned()); entities.extend(self.custom_regex_entities.iter().cloned()); entities.extend(self.deny_list_entities.iter().cloned()); entities.extend(self.gazetteer_entities.iter().cloned()); entities.extend(self.country_entities.iter().cloned()); + 
entities.extend(self.anchored_entities.iter().cloned()); + entities.extend(self.trigger_entities.iter().cloned()); + entities.extend(self.signature_entities.iter().cloned()); + entities.extend(self.legal_form_entities.iter().cloned()); + entities.extend(self.address_seed_entities.iter().cloned()); entities } } diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index 0b27ed59..073a34f3 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -3,6 +3,7 @@ use std::collections::{BTreeMap, BTreeSet}; use crate::byte_offsets::ByteOffsets; use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; use crate::types::{Error, Result, SearchMatch}; +use crate::validators::validate_id; const GAZETTEER_EXACT_SCORE: f64 = 0.9; const GAZETTEER_FUZZY_SCORE: f64 = 0.85; @@ -40,7 +41,7 @@ impl PatternSlice { pattern >= self.start && pattern < self.end } - fn local_index(self, pattern: u32) -> Option { + pub(crate) fn local_index(self, pattern: u32) -> Option { if !self.contains(pattern) { return None; } @@ -54,6 +55,8 @@ pub struct RegexMatchMeta { pub score: f64, pub source_detail: Option, pub requires_validation: bool, + pub validator_id: Option, + pub validator_input: Option, pub min_byte_length: Option, } @@ -65,6 +68,8 @@ impl RegexMatchMeta { score, source_detail: None, requires_validation: false, + validator_id: None, + validator_input: None, min_byte_length: None, } } @@ -83,25 +88,171 @@ pub struct CountryMatchData { #[derive(Clone, Debug, Eq, PartialEq)] pub struct DenyListMatchData { - pub labels: Vec>, - pub custom_labels: Vec>, + pub labels: StringGroups, + pub custom_labels: StringGroups, pub originals: Vec, - pub sources: Vec>, + pub sources: StringGroups, pub filters: Option, } +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct StringGroups { + table: Vec, + groups: Vec>, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct StringGroup<'a> { + 
table: &'a [String], + indexes: &'a [u32], +} + +impl StringGroups { + #[must_use] + pub fn from_groups(groups: Vec>) -> Self { + let mut table = Vec::new(); + let mut table_indexes = BTreeMap::::new(); + let groups = groups + .into_iter() + .map(|group| { + group + .into_iter() + .map(|value| { + string_table_index(value, &mut table, &mut table_indexes) + }) + .collect() + }) + .collect(); + + Self { table, groups } + } + + pub fn from_table_indices( + table: Vec, + groups: Vec>, + field: &'static str, + ) -> Result { + for group in &groups { + for &index in group { + let Ok(index) = usize::try_from(index) else { + return Err(Error::InvalidStaticData { + field, + reason: String::from("string table index exceeds usize range"), + }); + }; + if index >= table.len() { + return Err(Error::InvalidStaticData { + field, + reason: String::from("string table index out of range"), + }); + } + } + } + + Ok(Self { table, groups }) + } + + #[must_use] + pub fn empty_groups(len: usize) -> Self { + Self { + table: Vec::new(), + groups: vec![Vec::new(); len], + } + } + + #[must_use] + pub const fn len(&self) -> usize { + self.groups.len() + } + + #[must_use] + pub const fn is_empty(&self) -> bool { + self.groups.is_empty() + } + + #[must_use] + pub fn get(&self, index: usize) -> Option> { + Some(StringGroup { + table: &self.table, + indexes: self.groups.get(index)?, + }) + } + + pub fn iter(&self) -> impl Iterator> { + self.groups.iter().map(|indexes| StringGroup { + table: &self.table, + indexes, + }) + } +} + +impl From>> for StringGroups { + fn from(groups: Vec>) -> Self { + Self::from_groups(groups) + } +} + +impl<'a> StringGroup<'a> { + #[must_use] + pub const fn is_empty(self) -> bool { + self.indexes.is_empty() + } + + pub fn iter(self) -> impl Iterator + 'a { + self + .indexes + .iter() + .filter_map(|index| usize::try_from(*index).ok()) + .filter_map(|index| self.table.get(index)) + .map(String::as_str) + } + + #[must_use] + pub fn contains(self, value: &str) -> bool { + 
self.iter().any(|entry| entry == value) + } + + #[must_use] + pub fn to_strings(self) -> Vec { + self.iter().map(String::from).collect() + } +} + +fn string_table_index( + value: String, + table: &mut Vec, + table_indexes: &mut BTreeMap, +) -> u32 { + if let Some(index) = table_indexes.get(&value) { + return *index; + } + let index = u32::try_from(table.len()).unwrap_or(u32::MAX); + table_indexes.insert(value.clone(), index); + table.push(value); + index +} + #[derive(Clone, Debug, Default, Eq, PartialEq)] pub struct DenyListFilterData { pub stopwords: BTreeSet, pub allow_list: BTreeSet, pub person_stopwords: BTreeSet, + pub person_trailing_nouns: BTreeSet, pub address_stopwords: BTreeSet, + pub address_jurisdiction_prefixes: BTreeSet, pub street_types: BTreeSet, pub first_names: BTreeSet, pub generic_roles: BTreeSet, pub sentence_starters: BTreeSet, pub trailing_address_word_exclusions: BTreeSet, pub defined_term_cues: BTreeSet, + pub signing_place_guards: Vec, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct SigningPlaceGuardData { + pub prefix_phrases: BTreeSet, + pub suffix_phrases: BTreeSet, } #[derive(Clone, Debug, Eq, PartialEq)] @@ -132,7 +283,11 @@ pub fn process_regex_matches( continue; }; let text = offsets.slice(full_text, found.start(), found.end())?; - if entry.requires_validation { + if let Some(validator_id) = &entry.validator_id { + if !validate_id(validator_id, &text, entry.validator_input.as_deref()) { + continue; + } + } else if entry.requires_validation { return Err(Error::UnsupportedRegexValidation { pattern }); } if entry @@ -260,7 +415,7 @@ fn collect_deny_list_matches( let custom_pattern_labels = data .custom_labels .get(local_index) - .cloned() + .map(StringGroup::to_strings) .unwrap_or_default(); let custom_edges_are_valid = custom_match_has_valid_edges( full_text, @@ -311,7 +466,7 @@ fn collect_deny_list_matches( end: found.end(), labels: curated_labels, custom_labels, - sources: sources.clone(), + sources: 
sources.to_strings(), text: match_text, }); } @@ -326,7 +481,7 @@ struct CuratedDenyListMatch<'a> { match_text: &'a str, keyword: &'a str, pattern: &'a str, - labels: &'a [String], + labels: StringGroup<'a>, custom_pattern_labels: &'a [String], custom_edges_are_valid: bool, filters: &'a DenyListFilterData, @@ -364,8 +519,13 @@ fn curated_labels_for_match( args .labels .iter() - .filter(|label| !args.custom_pattern_labels.contains(label)) - .cloned() + .filter(|label| { + !args + .custom_pattern_labels + .iter() + .any(|custom| custom == label) + }) + .map(String::from) .collect(), ) } @@ -381,6 +541,9 @@ fn should_suppress_address( let Some(filters) = &data.filters else { return Ok(false); }; + if is_signing_place_context(full_text, found.start, found.end, filters)? { + return Ok(true); + } let lower = found.text.to_lowercase(); if !filters.address_stopwords.contains(&lower) { return Ok(false); @@ -394,6 +557,76 @@ fn should_suppress_address( )?) } +fn is_signing_place_context( + full_text: &str, + start: u32, + end: u32, + filters: &DenyListFilterData, +) -> Result { + if filters.signing_place_guards.is_empty() { + return Ok(false); + } + + let offsets = ByteOffsets::new(full_text); + let start_byte = offsets.validate_offset(start)?; + let end_byte = offsets.validate_offset(end)?; + let before = full_text.get(..start_byte).unwrap_or_default(); + let after = full_text.get(end_byte..).unwrap_or_default(); + + Ok(filters.signing_place_guards.iter().any(|guard| { + !guard.prefix_phrases.is_empty() + && !guard.suffix_phrases.is_empty() + && context_before_matches_any_phrase(before, &guard.prefix_phrases) + && context_after_matches_any_phrase(after, &guard.suffix_phrases) + })) +} + +fn context_before_matches_any_phrase( + before: &str, + phrases: &BTreeSet, +) -> bool { + phrases.iter().any(|phrase| { + phrase.is_empty() || context_before_matches_phrase(before, phrase) + }) +} + +fn context_after_matches_any_phrase( + after: &str, + phrases: &BTreeSet, +) -> bool { + 
phrases.iter().any(|phrase| { + phrase.is_empty() || context_after_matches_phrase(after, phrase) + }) +} + +fn context_before_matches_phrase(before: &str, phrase: &str) -> bool { + let trimmed = before.trim_end_matches(char::is_whitespace); + if trimmed.len() < phrase.len() { + return false; + } + let lower = trimmed.to_lowercase(); + if !lower.ends_with(phrase) { + return false; + } + let phrase_start = trimmed.len().saturating_sub(phrase.len()); + char_before_byte(trimmed, phrase_start).is_none_or(|ch| !ch.is_alphanumeric()) +} + +fn context_after_matches_phrase(after: &str, phrase: &str) -> bool { + let trimmed = after.trim_start_matches(char::is_whitespace); + let trimmed = trimmed.strip_prefix(',').map_or(trimmed, |value| { + value.trim_start_matches(char::is_whitespace) + }); + if trimmed.len() < phrase.len() { + return false; + } + let lower = trimmed.to_lowercase(); + if !lower.starts_with(phrase) { + return false; + } + char_after_byte(trimmed, phrase.len()).is_none_or(|ch| !ch.is_alphanumeric()) +} + fn append_person_name_hits( results: &mut Vec, full_text: &str, @@ -596,7 +829,7 @@ pub(crate) fn ensure_supported_deny_list_sources( data: &DenyListMatchData, ) -> Result<()> { let mut needs_filters = false; - for sources in &data.sources { + for sources in data.sources.iter() { validate_deny_list_sources(sources)?; needs_filters |= has_curated_source(sources); } @@ -610,15 +843,15 @@ pub(crate) fn ensure_supported_deny_list_sources( Ok(()) } -fn validate_deny_list_sources(sources: &[String]) -> Result<()> { +fn validate_deny_list_sources(sources: StringGroup<'_>) -> Result<()> { if sources.is_empty() { return Err(Error::UnsupportedDenyListSource { source: String::from(""), }); } - for source in sources { - match source.as_str() { + for source in sources.iter() { + match source { DENY_LIST_SOURCE | CITY_SOURCE | CUSTOM_DENY_LIST_SOURCE @@ -627,7 +860,7 @@ fn validate_deny_list_sources(sources: &[String]) -> Result<()> { | TITLE_SOURCE => {} _ => { return 
Err(Error::UnsupportedDenyListSource { - source: source.clone(), + source: String::from(source), }); } } @@ -636,10 +869,10 @@ fn validate_deny_list_sources(sources: &[String]) -> Result<()> { Ok(()) } -fn has_curated_source(sources: &[String]) -> bool { +fn has_curated_source(sources: StringGroup<'_>) -> bool { sources .iter() - .any(|source| source.as_str() != CUSTOM_DENY_LIST_SOURCE) + .any(|source| source != CUSTOM_DENY_LIST_SOURCE) } fn has_person_name_source(found: &RawDenyListMatch) -> bool { diff --git a/crates/anonymize-core/src/resolution/merge.rs b/crates/anonymize-core/src/resolution/merge.rs index ed2de2a3..ddb16d7a 100644 --- a/crates/anonymize-core/src/resolution/merge.rs +++ b/crates/anonymize-core/src/resolution/merge.rs @@ -131,6 +131,28 @@ fn should_replace( return candidate_len > existing_len; } + if regex_shape_contains_trigger_fragment(candidate, existing) + && candidate_len > existing_len + { + return true; + } + if regex_shape_contains_trigger_fragment(existing, candidate) + && existing_len > candidate_len + { + return false; + } + + if person_regex_contains_name_fragment(candidate, existing) + && candidate_len > existing_len + { + return true; + } + if person_regex_contains_name_fragment(existing, candidate) + && existing_len > candidate_len + { + return false; + } + if country_inside_person_or_org(candidate, existing) && existing_len > candidate_len { @@ -312,6 +334,71 @@ fn same_start_longest_wins( && longest_wins_label(&candidate.label) } +fn regex_shape_contains_trigger_fragment( + outer: &PipelineEntity, + inner: &PipelineEntity, +) -> bool { + outer.label == inner.label + && outer.source == DetectionSource::Regex + && inner.source == DetectionSource::Trigger + && outer.start <= inner.start + && outer.end >= comparable_trigger_fragment_end(inner) + && regex_shape_preferred_label(&outer.label) +} + +fn comparable_trigger_fragment_end(entity: &PipelineEntity) -> u32 { + let mut end = entity.end; + let mut text = entity.text.as_str(); + 
while let Some((index, ch)) = text.char_indices().next_back() { + if !is_trigger_fragment_trailing_trim(ch) { + break; + } + end = end.saturating_sub(u32_char_len(ch)); + text = text.get(..index).unwrap_or_default(); + } + end +} + +const fn is_trigger_fragment_trailing_trim(ch: char) -> bool { + matches!(ch, ',' | ';' | ':' | '!' | '?' | ' ' | '\t' | '\n' | '\r') +} + +fn u32_char_len(ch: char) -> u32 { + u32::try_from(ch.len_utf8()).unwrap_or(u32::MAX) +} + +fn regex_shape_preferred_label(label: &str) -> bool { + matches!( + label, + "phone number" + | "tax identification number" + | "registration number" + | "national identification number" + | "social security number" + | "birth number" + | "identity card number" + | "passport number" + | "credit card number" + | "bank account number" + | "iban" + ) +} + +fn person_regex_contains_name_fragment( + outer: &PipelineEntity, + inner: &PipelineEntity, +) -> bool { + outer.label == "person" + && inner.label == "person" + && outer.source == DetectionSource::Regex + && matches!( + inner.source, + DetectionSource::Trigger | DetectionSource::DenyList + ) + && outer.start <= inner.start + && outer.end >= inner.end +} + fn country_inside_person_or_org( country: &PipelineEntity, container: &PipelineEntity, diff --git a/crates/anonymize-core/src/resolution/sanitize.rs b/crates/anonymize-core/src/resolution/sanitize.rs index 276165b8..01d8d5f4 100644 --- a/crates/anonymize-core/src/resolution/sanitize.rs +++ b/crates/anonymize-core/src/resolution/sanitize.rs @@ -188,6 +188,9 @@ fn should_strip_period( if !text.ends_with('.') || known_period_suffix(text) { return false; } + if entity.source == DetectionSource::LegalForm { + return false; + } if entity.label == "address" && known_address_final_abbrev(text) { return false; } @@ -222,24 +225,38 @@ fn label_allows_colon(label: &str) -> bool { fn collapse_display_whitespace(text: &str) -> String { let mut output = String::new(); - let mut in_whitespace = false; + let mut whitespace 
= String::new(); for ch in text.chars() { if ch.is_whitespace() { - if !in_whitespace { - output.push(' '); - in_whitespace = true; - } + whitespace.push(ch); continue; } + flush_whitespace(&mut output, &mut whitespace); output.push(ch); - in_whitespace = false; } + flush_whitespace(&mut output, &mut whitespace); output } +fn flush_whitespace(output: &mut String, whitespace: &mut String) { + if whitespace.is_empty() { + return; + } + + if whitespace.chars().any(|ch| matches!(ch, '\n' | '\r')) + || whitespace.chars().count() >= 2 + { + output.push(' '); + } else if let Some(ch) = whitespace.chars().next() { + output.push(ch); + } + + whitespace.clear(); +} + fn first_char(text: &str) -> Option<(char, usize)> { text.chars().next().map(|ch| (ch, ch.len_utf8())) } diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs index cbfcd97a..28a8acff 100644 --- a/crates/anonymize-core/src/search.rs +++ b/crates/anonymize-core/src/search.rs @@ -1,7 +1,11 @@ use stella_text_search_core as text_search; +use crate::artifact_bytes::{ArtifactReader, ArtifactWriter}; use crate::types::{Error, Result, SearchEngine, SearchMatch}; +const SEARCH_INDEX_ARTIFACTS_HEADER: [u8; 8] = *b"ANONIDX1"; +const SEARCH_INDEX_ARTIFACTS_VERSION: u32 = 1; + #[derive(Clone, Debug, Eq, PartialEq)] pub enum SearchPattern { Literal(String), @@ -63,6 +67,47 @@ pub struct SearchIndex { slots: Vec, } +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct SearchIndexArtifacts { + pub slots: Vec, +} + +impl SearchIndexArtifacts { + pub fn to_bytes(&self) -> Result> { + let mut writer = ArtifactWriter::new( + SEARCH_INDEX_ARTIFACTS_HEADER, + SEARCH_INDEX_ARTIFACTS_VERSION, + ); + writer.write_len(self.slots.len(), "search_index.slots")?; + for slot in &self.slots { + let slot_bytes = slot.to_bytes().map_err(|error| search_error(&error))?; + writer.write_len_prefixed_bytes("search_index.slot", &slot_bytes)?; + } + Ok(writer.into_bytes()) + } + + pub fn from_bytes(bytes: &[u8]) 
-> Result { + let mut reader = ArtifactReader::new( + bytes, + SEARCH_INDEX_ARTIFACTS_HEADER, + SEARCH_INDEX_ARTIFACTS_VERSION, + "search_index_artifacts", + )?; + let count = reader.read_usize()?; + let mut slots = Vec::with_capacity(count); + for _ in 0..count { + slots.push( + text_search::PreparedTextSearchArtifacts::from_bytes( + reader.read_len_prefixed_bytes()?, + ) + .map_err(|error| search_error(&error))?, + ); + } + reader.finish()?; + Ok(Self { slots }) + } +} + struct SearchSlot { engine: SlotEngine, search: text_search::TextSearch, @@ -76,107 +121,91 @@ enum SlotEngine { Fuzzy, } +struct SearchIndexParts { + literals: Vec, + literal_indexes: Vec, + regex: Vec, + regex_indexes: Vec, + fuzzy: Vec, + fuzzy_indexes: Vec, +} + +struct SearchIndexArtifactCursor<'a> { + slots: &'a [text_search::PreparedTextSearchArtifacts], + index: usize, +} + +impl<'a> SearchIndexArtifactCursor<'a> { + const fn new(slots: &'a [text_search::PreparedTextSearchArtifacts]) -> Self { + Self { slots, index: 0 } + } + + fn next(&mut self) -> Result<&'a text_search::PreparedTextSearchArtifacts> { + let index = self.index; + let Some(artifacts) = self.slots.get(index) else { + return Err(search_message(format!( + "Missing prepared text-search artifact at slot {index}" + ))); + }; + self.index = self.index.saturating_add(1); + Ok(artifacts) + } + + fn finish(&self) -> Result<()> { + if self.index == self.slots.len() { + return Ok(()); + } + Err(search_message(format!( + "Expected {} prepared text-search artifacts, got {}", + self.index, + self.slots.len() + ))) + } +} + impl SearchIndex { pub fn new( patterns: Vec, options: SearchOptions, ) -> Result { - let mut literals = Vec::new(); - let mut literal_indexes = Vec::new(); - let mut regex = Vec::new(); - let mut regex_indexes = Vec::new(); - let mut fuzzy = Vec::new(); - let mut fuzzy_indexes = Vec::new(); - - for (index, entry) in patterns.into_iter().enumerate() { - let pattern_index = pattern_index(index)?; - match entry { - 
SearchPattern::Literal(pattern) => { - literals.push(text_search::PatternEntry::Literal( - text_search::LiteralPattern { - pattern, - name: None, - case_insensitive: None, - whole_words: None, - }, - )); - literal_indexes.push(pattern_index); - } - SearchPattern::LiteralWithOptions { - pattern, - case_insensitive, - whole_words, - } => { - literals.push(text_search::PatternEntry::Literal( - text_search::LiteralPattern { - pattern, - name: None, - case_insensitive, - whole_words, - }, - )); - literal_indexes.push(pattern_index); - } - SearchPattern::Regex(pattern) => { - regex.push(text_search::PatternEntry::Regex( - text_search::RegexPattern::new(pattern), - )); - regex_indexes.push(pattern_index); - } - SearchPattern::RegexWithOptions { - pattern, - lazy, - prefilter_any, - prefilter_case_insensitive, - prefilter_regex, - } => { - let mut regex_pattern = text_search::RegexPattern::new(pattern); - regex_pattern.lazy = lazy; - regex_pattern.prefilter_any = prefilter_any; - regex_pattern.prefilter_case_insensitive = prefilter_case_insensitive; - regex_pattern.prefilter_regex = prefilter_regex; - regex.push(text_search::PatternEntry::Regex(regex_pattern)); - regex_indexes.push(pattern_index); - } - SearchPattern::Fuzzy { pattern, distance } => { - fuzzy.push(text_search::PatternEntry::Fuzzy( - text_search::FuzzyPattern::new( - pattern, - distance.map_or( - text_search::FuzzyDistance::Auto, - text_search::FuzzyDistance::Exact, - ), - ), - )); - fuzzy_indexes.push(pattern_index); - } - } - } + let parts = partition_patterns(patterns)?; + build_search_index(parts, options, None) + } + pub fn prepare_artifacts( + patterns: Vec, + options: SearchOptions, + ) -> Result { + let parts = partition_patterns(patterns)?; let mut slots = Vec::new(); - push_slot( + capture_slot_artifacts( &mut slots, - SlotEngine::Literal, - literals, - literal_indexes, + parts.literals, literal_options(options.literal), )?; - push_slot( + capture_slot_artifacts( &mut slots, - SlotEngine::Regex, - 
regex, - regex_indexes, + parts.regex, regex_options(options.regex), )?; - push_slot( + capture_slot_artifacts( &mut slots, - SlotEngine::Fuzzy, - fuzzy, - fuzzy_indexes, + parts.fuzzy, fuzzy_options(options.fuzzy), )?; + Ok(SearchIndexArtifacts { slots }) + } - Ok(Self { slots }) + pub fn new_with_artifacts( + patterns: Vec, + options: SearchOptions, + artifacts: &SearchIndexArtifacts, + ) -> Result { + let parts = partition_patterns(patterns)?; + let mut cursor = SearchIndexArtifactCursor::new(&artifacts.slots); + let search = build_search_index(parts, options, Some(&mut cursor))?; + cursor.finish()?; + Ok(search) } pub fn find_iter(&self, haystack: &str) -> Result> { @@ -234,19 +263,154 @@ impl SearchIndex { } } +fn partition_patterns( + patterns: Vec, +) -> Result { + let mut literals = Vec::new(); + let mut literal_indexes = Vec::new(); + let mut regex = Vec::new(); + let mut regex_indexes = Vec::new(); + let mut fuzzy = Vec::new(); + let mut fuzzy_indexes = Vec::new(); + + for (index, entry) in patterns.into_iter().enumerate() { + let pattern_index = pattern_index(index)?; + match entry { + SearchPattern::Literal(pattern) => { + literals.push(text_search::PatternEntry::Auto(pattern)); + literal_indexes.push(pattern_index); + } + SearchPattern::LiteralWithOptions { + pattern, + case_insensitive, + whole_words, + } => { + literals.push(text_search::PatternEntry::Literal( + text_search::LiteralPattern { + pattern, + name: None, + case_insensitive, + whole_words, + }, + )); + literal_indexes.push(pattern_index); + } + SearchPattern::Regex(pattern) => { + regex.push(text_search::PatternEntry::Regex( + text_search::RegexPattern::new(pattern), + )); + regex_indexes.push(pattern_index); + } + SearchPattern::RegexWithOptions { + pattern, + lazy, + prefilter_any, + prefilter_case_insensitive, + prefilter_regex, + } => { + let mut regex_pattern = text_search::RegexPattern::new(pattern); + regex_pattern.lazy = lazy; + regex_pattern.prefilter_any = prefilter_any; + 
regex_pattern.prefilter_case_insensitive = prefilter_case_insensitive; + regex_pattern.prefilter_regex = prefilter_regex; + regex.push(text_search::PatternEntry::Regex(regex_pattern)); + regex_indexes.push(pattern_index); + } + SearchPattern::Fuzzy { pattern, distance } => { + fuzzy.push(text_search::PatternEntry::Fuzzy( + text_search::FuzzyPattern::new( + pattern, + distance.map_or( + text_search::FuzzyDistance::Auto, + text_search::FuzzyDistance::Exact, + ), + ), + )); + fuzzy_indexes.push(pattern_index); + } + } + } + + Ok(SearchIndexParts { + literals, + literal_indexes, + regex, + regex_indexes, + fuzzy, + fuzzy_indexes, + }) +} + +fn build_search_index( + parts: SearchIndexParts, + options: SearchOptions, + mut artifacts: Option<&mut SearchIndexArtifactCursor<'_>>, +) -> Result { + let mut slots = Vec::new(); + let literal_artifacts = slot_artifacts(&parts.literals, &mut artifacts)?; + push_slot( + &mut slots, + SlotEngine::Literal, + parts.literals, + parts.literal_indexes, + literal_options(options.literal), + literal_artifacts, + )?; + let regex_artifacts = slot_artifacts(&parts.regex, &mut artifacts)?; + push_slot( + &mut slots, + SlotEngine::Regex, + parts.regex, + parts.regex_indexes, + regex_options(options.regex), + regex_artifacts, + )?; + let fuzzy_artifacts = slot_artifacts(&parts.fuzzy, &mut artifacts)?; + push_slot( + &mut slots, + SlotEngine::Fuzzy, + parts.fuzzy, + parts.fuzzy_indexes, + fuzzy_options(options.fuzzy), + fuzzy_artifacts, + )?; + + Ok(SearchIndex { slots }) +} + +fn slot_artifacts<'a>( + patterns: &[text_search::PatternEntry], + artifacts: &mut Option<&mut SearchIndexArtifactCursor<'a>>, +) -> Result> { + if patterns.is_empty() { + return Ok(None); + } + let Some(cursor) = artifacts else { + return Ok(None); + }; + cursor.next().map(Some) +} + fn push_slot( slots: &mut Vec, engine: SlotEngine, patterns: Vec, pattern_indexes: Vec, options: text_search::TextSearchOptions, + artifacts: 
Option<&text_search::PreparedTextSearchArtifacts>, ) -> Result<()> { if patterns.is_empty() { return Ok(()); } - let search = text_search::TextSearch::new(patterns, options) - .map_err(|error| search_error(&error))?; + let search = if let Some(artifacts) = artifacts { + text_search::TextSearch::with_prepared_artifacts( + patterns, options, artifacts, + ) + } else { + text_search::TextSearch::new(patterns, options) + } + .map_err(|error| search_error(&error))?; slots.push(SearchSlot { engine, search, @@ -255,6 +419,21 @@ fn push_slot( Ok(()) } +fn capture_slot_artifacts( + slots: &mut Vec, + patterns: Vec, + options: text_search::TextSearchOptions, +) -> Result<()> { + if patterns.is_empty() { + return Ok(()); + } + slots.push( + text_search::TextSearch::prepare_artifacts(patterns, options) + .map_err(|error| search_error(&error))?, + ); + Ok(()) +} + fn literal_options( options: LiteralSearchOptions, ) -> text_search::TextSearchOptions { @@ -262,6 +441,7 @@ fn literal_options( case_insensitive: options.case_insensitive, whole_words: options.whole_words, overlap_strategy: text_search::OverlapStrategy::All, + all_literal: true, ..text_search::TextSearchOptions::default() } } @@ -303,9 +483,13 @@ fn remap_pattern(slot: &SearchSlot, local_pattern: u32) -> Result { } fn search_error(error: &text_search::Error) -> Error { + search_message(error.to_string()) +} + +const fn search_message(reason: String) -> Error { Error::Search { engine: SearchEngine::Text, - reason: error.to_string(), + reason, } } diff --git a/crates/anonymize-core/src/signatures.rs b/crates/anonymize-core/src/signatures.rs new file mode 100644 index 00000000..661a9550 --- /dev/null +++ b/crates/anonymize-core/src/signatures.rs @@ -0,0 +1,641 @@ +use crate::resolution::{DetectionSource, PipelineEntity}; + +const PERSON_LABEL: &str = "person"; +const MAX_NAME_LEN: usize = 60; +const MAX_WITNESS_SCAN_BYTES: usize = 600; +const NAME_PARTICLES: &[&str] = &[ + "de", + "del", + "della", + "der", + "den", + 
"di", + "du", + "da", + "das", + "do", + "dos", + "el", + "la", + "le", + "van", + "von", + "y", + "zu", + "af", + "ben", + "bin", + "al", + "d'", + "d\u{2019}", +]; +const POST_NOMINAL_SUFFIXES: &[&str] = &[ + "jr", "sr", "ii", "iii", "iv", "v", "esq", "esquire", "md", "phd", "jd", + "llm", "mba", "cpa", "pe", "rn", "dds", "dvm", "do", "cfa", "cfp", +]; +const ORG_SUFFIXES: &[&str] = &[ + "inc", + "inc.", + "llc", + "llp", + "lp", + "corp", + "corp.", + "corporation", + "ltd", + "ltd.", + "gmbh", + "ag", + "se", + "kg", + "ohg", + "sa", + "sas", + "sarl", + "s.a", + "s.a.", + "s.p.a", + "s.p.a.", + "plc", + "n.a", + "n.a.", + "n.v", + "n.v.", + "b.v", + "b.v.", + "pty ltd", + "pty ltd.", + "co", + "co.", + "s.r.o", + "s.r.o.", + "a.s", + "a.s.", + "z.s", + "z.s.", + "s.p", + "s.p.", + "s. p.", + "ltda", + "ltda.", + "eireli", + "epp", + "s/a", +]; + +#[must_use] +pub(crate) fn detect_signatures(full_text: &str) -> Vec { + let mut results = Vec::new(); + detect_slash_s(full_text, &mut results); + detect_labelled_names(full_text, &mut results); + detect_witness_blocks(full_text, &mut results); + results +} + +fn detect_slash_s(full_text: &str, results: &mut Vec) { + let mut cursor = 0usize; + while let Some(relative) = + full_text.get(cursor..).and_then(|tail| tail.find("/s/")) + { + let mark_start = cursor.saturating_add(relative); + let mut after_mark = mark_start.saturating_add("/s/".len()); + after_mark = skip_horizontal_ws(full_text, after_mark); + let line_end = find_line_end(full_text, after_mark); + let same_line = full_text + .get(after_mark..line_end) + .unwrap_or_default() + .trim(); + if same_line.is_empty() { + try_emit_forward_lines( + results, + full_text, + line_end.saturating_add(1), + 4, + 0.9, + ); + } else { + let first_cell_end = after_mark.saturating_add( + full_text + .get(after_mark..line_end) + .and_then(first_column_end) + .unwrap_or_else(|| line_end.saturating_sub(after_mark)), + ); + try_emit(results, full_text, after_mark, 
first_cell_end, 0.95); + } + + if let Some((prev_start, prev_end)) = find_prev_line(full_text, mark_start) + { + try_emit(results, full_text, prev_start, prev_end, 0.85); + } + cursor = mark_start.saturating_add("/s/".len()); + } +} + +fn detect_labelled_names(full_text: &str, results: &mut Vec) { + let mut line_start = 0usize; + while line_start <= full_text.len() { + let line_end = find_line_end(full_text, line_start); + if let Some(line) = full_text.get(line_start..line_end) { + detect_labelled_names_in_line(full_text, line_start, line, results); + } + if line_end >= full_text.len() { + break; + } + line_start = line_end.saturating_add(1); + } +} + +fn detect_labelled_names_in_line( + full_text: &str, + line_start: usize, + line: &str, + results: &mut Vec, +) { + let mut cursor = 0usize; + while let Some(label) = find_label(line, cursor) { + let mut value_start = label.value_start; + if let Some(after_slash) = slash_s_prefix_end(line, value_start) { + value_start = after_slash; + } + let value_end = value_start.saturating_add( + line + .get(value_start..) 
+ .and_then(first_column_end) + .unwrap_or_else(|| line.len().saturating_sub(value_start)), + ); + let global_start = line_start.saturating_add(value_start); + let global_end = line_start.saturating_add(value_end); + let value_is_empty = line + .get(value_start..value_end) + .unwrap_or_default() + .trim() + .is_empty(); + if value_is_empty { + try_emit_forward_lines( + results, + full_text, + global_end.saturating_add(1), + 3, + 0.9, + ); + } else { + try_emit(results, full_text, global_start, global_end, 0.95); + } + cursor = value_end.max(label.next_cursor); + } +} + +fn detect_witness_blocks(full_text: &str, results: &mut Vec) { + let mut cursor = 0usize; + while let Some(relative) = find_ascii_case_insensitive( + full_text.get(cursor..).unwrap_or_default(), + "in witness whereof", + ) { + let anchor = cursor.saturating_add(relative); + if !has_word_boundaries(full_text, anchor, "in witness whereof".len()) { + cursor = anchor.saturating_add(1); + continue; + } + let anchor_line_end = find_line_end(full_text, anchor); + if anchor_line_end >= full_text.len() { + break; + } + let limit = + advance_char_boundary(full_text, anchor, MAX_WITNESS_SCAN_BYTES); + if let Some(scan_from) = find_witness_sentence_end(full_text, anchor, limit) + { + try_emit_forward_lines(results, full_text, scan_from, 6, 0.85); + } + cursor = anchor.saturating_add("in witness whereof".len()); + } +} + +fn try_emit_forward_lines( + results: &mut Vec, + full_text: &str, + from_pos: usize, + max_lines: usize, + score: f64, +) -> bool { + let mut pos = from_pos; + for _ in 0..max_lines { + if pos >= full_text.len() { + return false; + } + let line_end = find_line_end(full_text, pos); + let line = full_text.get(pos..line_end).unwrap_or_default().trim(); + if !line.is_empty() + && !is_image_stub(line) + && try_emit(results, full_text, pos, line_end, score) + { + return true; + } + pos = line_end.saturating_add(1); + } + false +} + +fn try_emit( + results: &mut Vec, + full_text: &str, + start: 
usize, + end: usize, + score: f64, +) -> bool { + let raw = full_text.get(start..end).unwrap_or_default(); + if contains_org_suffix(raw) { + return false; + } + let candidate = normalise_candidate(raw); + if !is_name_shape(&candidate) { + return false; + } + let Some(offset) = raw.find(&candidate) else { + return false; + }; + let abs_start = start.saturating_add(offset); + let abs_end = abs_start.saturating_add(candidate.len()); + let Ok(start_u32) = u32::try_from(abs_start) else { + return false; + }; + let Ok(end_u32) = u32::try_from(abs_end) else { + return false; + }; + results.push(PipelineEntity::detected( + start_u32, + end_u32, + PERSON_LABEL, + candidate, + score, + DetectionSource::Trigger, + )); + true +} + +fn normalise_candidate(text: &str) -> String { + let stripped = strip_post_nominal_suffix(text.trim()); + let first_cell_end = first_column_end(stripped).unwrap_or(stripped.len()); + stripped + .get(..first_cell_end) + .unwrap_or(stripped) + .trim() + .to_owned() +} + +fn strip_post_nominal_suffix(text: &str) -> &str { + let Some(comma) = text.rfind(',') else { + return text; + }; + let suffix = text + .get(comma.saturating_add(1)..) 
+ .unwrap_or_default() + .trim() + .trim_end_matches('.'); + let compact = suffix + .chars() + .filter(|ch| *ch != '.') + .collect::() + .to_lowercase(); + if POST_NOMINAL_SUFFIXES.contains(&compact.as_str()) { + return text.get(..comma).unwrap_or(text).trim(); + } + text +} + +fn is_name_shape(text: &str) -> bool { + if text.len() < 3 || text.len() > MAX_NAME_LEN { + return false; + } + let tokens = text.split([' ', '\t']).filter(|token| !token.is_empty()); + let tokens = tokens.collect::>(); + if !(2..=5).contains(&tokens.len()) { + return false; + } + let Some(first) = tokens.first() else { + return false; + }; + if !is_cap_token(first) { + return false; + } + tokens + .iter() + .skip(1) + .all(|token| is_name_particle(token) || is_cap_token(token)) +} + +fn is_cap_token(token: &str) -> bool { + let mut chars = token.chars(); + let Some(first) = chars.next() else { + return false; + }; + first.is_uppercase() + && chars.take(30).all(|ch| { + ch.is_alphabetic() + || matches!(ch, '\u{0300}'..='\u{036f}' | '.' 
| '\'' | '-' | '’') + }) +} + +fn is_name_particle(token: &str) -> bool { + NAME_PARTICLES.contains(&token) +} + +fn contains_org_suffix(text: &str) -> bool { + let lower = text.to_lowercase(); + ORG_SUFFIXES + .iter() + .any(|suffix| contains_bounded(&lower, suffix)) +} + +fn contains_bounded(text: &str, needle: &str) -> bool { + let mut cursor = 0usize; + while let Some(relative) = + text.get(cursor..).and_then(|tail| tail.find(needle)) + { + let start = cursor.saturating_add(relative); + let end = start.saturating_add(needle.len()); + if boundary_before(text, start) && boundary_after(text, end) { + return true; + } + cursor = start.saturating_add(1); + } + false +} + +fn boundary_before(text: &str, byte: usize) -> bool { + char_before(text, byte).is_none_or(|ch| !ch.is_alphanumeric()) +} + +fn boundary_after(text: &str, byte: usize) -> bool { + char_after(text, byte).is_none_or(|ch| !ch.is_alphanumeric()) +} + +fn first_column_end(text: &str) -> Option { + let mut run_start = None::; + let mut run_len = 0usize; + for (index, ch) in text.char_indices() { + if ch == '\t' { + return Some(index); + } + if ch.is_whitespace() { + if run_start.is_none() { + run_start = Some(index); + } + run_len = run_len.saturating_add(1); + if run_len >= 3 { + return run_start; + } + continue; + } + run_start = None; + run_len = 0; + } + None +} + +#[derive(Clone, Copy)] +struct LabelMatch { + value_start: usize, + next_cursor: usize, +} + +fn find_label(line: &str, from: usize) -> Option { + let mut cursor = from; + while cursor < line.len() { + if !line.is_char_boundary(cursor) { + cursor = cursor.saturating_add(1); + continue; + } + if let Some(after_label) = label_end_at(line, cursor) { + let mut after_spaces = skip_horizontal_ws(line, after_label); + if line.get(after_spaces..)?.starts_with(':') { + after_spaces = skip_horizontal_ws(line, after_spaces.saturating_add(1)); + return Some(LabelMatch { + value_start: after_spaces, + next_cursor: after_spaces.saturating_add(1), + }); 
+ } + } + cursor = cursor.saturating_add(1); + } + None +} + +fn label_end_at(line: &str, start: usize) -> Option { + if !boundary_before(line, start) { + return None; + } + if starts_with_ascii_ci(line.get(start..)?, "by") { + let end = start.saturating_add("by".len()); + return label_tail_is_valid(line, end).then_some(end); + } + if starts_with_ascii_ci(line.get(start..)?, "name") { + let end = start.saturating_add("name".len()); + return label_tail_is_valid(line, end).then_some(end); + } + None +} + +fn label_tail_is_valid(line: &str, end: usize) -> bool { + line + .get(end..) + .and_then(|tail| tail.chars().next()) + .is_some_and(|ch| ch == ':' || ch == ' ' || ch == '\t') +} + +fn slash_s_prefix_end(line: &str, start: usize) -> Option { + let tail = line.get(start..)?; + if !tail.starts_with("/s/") { + return None; + } + let after = start.saturating_add("/s/".len()); + let has_space = line + .get(after..) + .and_then(|value| value.chars().next()) + .is_some_and(|ch| ch == ' ' || ch == '\t'); + has_space.then(|| skip_horizontal_ws(line, after)) +} + +fn skip_horizontal_ws(text: &str, from: usize) -> usize { + let mut cursor = from; + while let Some(ch) = text.get(cursor..).and_then(|tail| tail.chars().next()) { + if ch != ' ' && ch != '\t' { + break; + } + cursor = cursor.saturating_add(ch.len_utf8()); + } + cursor +} + +fn find_line_end(text: &str, pos: usize) -> usize { + text + .get(pos..) 
+ .and_then(|tail| tail.find('\n')) + .map_or(text.len(), |relative| pos.saturating_add(relative)) +} + +fn find_prev_line(full_text: &str, pos: usize) -> Option<(usize, usize)> { + if pos == 0 { + return None; + } + let bytes = full_text.as_bytes(); + let mut cursor = pos.saturating_sub(1); + while cursor > 0 && bytes.get(cursor).copied() != Some(b'\n') { + cursor = cursor.saturating_sub(1); + } + if bytes.get(cursor).copied() != Some(b'\n') { + return None; + } + + while cursor > 0 { + let line_end = cursor; + let mut line_start = line_end; + while line_start > 0 + && bytes.get(line_start.saturating_sub(1)).copied() != Some(b'\n') + { + line_start = line_start.saturating_sub(1); + } + let line = full_text + .get(line_start..line_end) + .unwrap_or_default() + .trim(); + if !line.is_empty() && !is_image_stub(line) { + return Some((line_start, line_end)); + } + if line_start == 0 { + break; + } + cursor = line_start.saturating_sub(1); + } + None +} + +fn find_witness_sentence_end( + full_text: &str, + from: usize, + limit: usize, +) -> Option { + let mut line_start = from; + while line_start < limit { + let line_end = find_line_end(full_text, line_start).min(limit); + let line = full_text + .get(line_start..line_end) + .unwrap_or_default() + .trim_end(); + if line.ends_with('.') || line.ends_with(':') || line.ends_with(';') { + return Some(line_end.saturating_add(1)); + } + let next_start = line_end.saturating_add(1); + if next_start >= limit { + return None; + } + let next_end = find_line_end(full_text, next_start).min(limit); + let next_line_empty = full_text + .get(next_start..next_end) + .unwrap_or_default() + .trim() + .is_empty(); + if next_line_empty { + return Some(next_end.saturating_add(1)); + } + line_start = next_start; + } + None +} + +fn advance_char_boundary(text: &str, start: usize, max_bytes: usize) -> usize { + let limit = start.saturating_add(max_bytes).min(text.len()); + if text.is_char_boundary(limit) { + return limit; + } + let mut cursor = 
limit; + while cursor > start && !text.is_char_boundary(cursor) { + cursor = cursor.saturating_sub(1); + } + cursor +} + +fn find_ascii_case_insensitive(text: &str, needle: &str) -> Option { + let needle_len = needle.len(); + if needle_len == 0 || text.len() < needle_len { + return None; + } + let mut cursor = 0usize; + while cursor.saturating_add(needle_len) <= text.len() { + if text.is_char_boundary(cursor) + && starts_with_ascii_ci(text.get(cursor..)?, needle) + { + return Some(cursor); + } + cursor = cursor.saturating_add(1); + } + None +} + +fn starts_with_ascii_ci(text: &str, prefix: &str) -> bool { + let Some(candidate) = text.get(..prefix.len()) else { + return false; + }; + candidate.eq_ignore_ascii_case(prefix) +} + +fn has_word_boundaries(text: &str, start: usize, len: usize) -> bool { + boundary_before(text, start) + && boundary_after(text, start.saturating_add(len)) +} + +fn char_before(text: &str, byte: usize) -> Option { + text.get(..byte)?.chars().next_back() +} + +fn char_after(text: &str, byte: usize) -> Option { + text.get(byte..)?.chars().next() +} + +fn is_image_stub(line: &str) -> bool { + let lower = line.trim_start().to_lowercase(); + lower.starts_with("[img") + || lower.starts_with("[image") + || lower.starts_with("[logo") + || lower.starts_with("(logo") +} + +#[cfg(test)] +mod tests { + use super::detect_signatures; + + #[test] + fn detects_slash_signature_same_line() { + let entities = detect_signatures("/s/ Jane Doe Chief Executive Officer"); + + assert_eq!(entities.len(), 1); + assert_eq!( + entities.first().map(|entity| entity.text.as_str()), + Some("Jane Doe") + ); + } + + #[test] + fn detects_multiple_labelled_name_columns() { + let entities = + detect_signatures("Name: Priya Ramanathan Name: Jonathan H. Whitaker"); + + assert_eq!( + entities + .iter() + .map(|entity| entity.text.as_str()) + .collect::>(), + vec!["Priya Ramanathan", "Jonathan H. 
Whitaker"] + ); + } + + #[test] + fn skips_organization_caption_before_signature_mark() { + let entities = detect_signatures("TWITTER, INC.\n/s/ Jane Doe"); + + assert_eq!(entities.len(), 1); + assert_eq!( + entities.first().map(|entity| entity.text.as_str()), + Some("Jane Doe") + ); + } +} diff --git a/crates/anonymize-core/src/triggers.rs b/crates/anonymize-core/src/triggers.rs new file mode 100644 index 00000000..ca3151a1 --- /dev/null +++ b/crates/anonymize-core/src/triggers.rs @@ -0,0 +1,1192 @@ +use fancy_regex::Regex as FancyRegex; +use regex::{Regex, RegexBuilder}; + +use crate::byte_offsets::ByteOffsets; +use crate::resolution::{DetectionSource, PipelineEntity}; +use crate::types::{Error, Result, SearchMatch}; +use crate::validators::validate_named_id; + +use super::processors::PatternSlice; + +const TRIGGER_SCORE: f64 = 0.95; +const MAX_TRIGGER_VALUE_LEN: usize = 100; +const MIN_TRIGGER_PHONE_DIGITS: usize = 5; +const TRIGGER_LOOKAHEAD_MARGIN: usize = 128; +const LINE_TRIGGER_LOOKAHEAD: usize = 2_048; +const MATCH_PATTERN_LOOKAHEAD: usize = 512; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct TriggerData { + pub rules: Vec, + pub address_stop_keywords: Vec, + pub party_position_terms: Vec, + pub legal_form_suffixes: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct TriggerRule { + pub trigger: String, + pub label: String, + pub strategy: TriggerStrategy, + pub validations: Vec, + pub include_trigger: bool, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum TriggerStrategy { + ToNextComma { + stop_words: Vec, + max_length: Option, + }, + ToEndOfLine, + NWords { + count: u32, + }, + CompanyIdValue, + Address { + max_chars: Option, + }, + MatchPattern { + pattern: String, + flags: Option, + }, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum TriggerValidation { + StartsUppercase, + MinLength(u32), + MaxLength(u32), + NoDigits, + HasDigits, + MatchesPattern { + pattern: String, + flags: Option, + }, + ValidId { + validator: String, 
+ }, +} + +pub(crate) struct PreparedTriggerData { + rules: Vec, + address_stop_keywords: Vec, + party_position_terms: Vec, + legal_form_suffixes: Vec, +} + +struct PreparedTriggerRule { + trigger: String, + label: String, + strategy: PreparedTriggerStrategy, + validations: Vec, + include_trigger: bool, + requires_exact_case: bool, +} + +enum PreparedTriggerStrategy { + ToNextComma { + stop_words: Vec, + max_length: Option, + }, + ToEndOfLine, + NWords { + count: usize, + }, + CompanyIdValue, + Address { + max_chars: Option, + }, + MatchPattern { + regex: FancyRegex, + }, +} + +enum PreparedTriggerValidation { + StartsUppercase, + MinLength(usize), + MaxLength(usize), + NoDigits, + HasDigits, + MatchesPattern { regex: Regex }, + ValidId { validator: String }, +} + +#[derive(Clone)] +struct ExtractedValue { + start: u32, + end: u32, + text: String, +} + +impl PreparedTriggerData { + pub(crate) fn new(data: TriggerData) -> Result { + let rules = data + .rules + .into_iter() + .map(PreparedTriggerRule::new) + .collect::>>()?; + Ok(Self { + rules, + address_stop_keywords: data.address_stop_keywords, + party_position_terms: data.party_position_terms, + legal_form_suffixes: data.legal_form_suffixes, + }) + } +} + +impl PreparedTriggerRule { + fn new(rule: TriggerRule) -> Result { + let requires_exact_case = requires_exact_case_trigger(&rule.trigger); + Ok(Self { + trigger: rule.trigger, + label: rule.label, + strategy: PreparedTriggerStrategy::new(rule.strategy)?, + validations: rule + .validations + .into_iter() + .map(PreparedTriggerValidation::new) + .collect::>>()?, + include_trigger: rule.include_trigger, + requires_exact_case, + }) + } +} + +impl PreparedTriggerStrategy { + fn new(strategy: TriggerStrategy) -> Result { + Ok(match strategy { + TriggerStrategy::ToNextComma { + stop_words, + max_length, + } => Self::ToNextComma { + stop_words, + max_length: max_length.and_then(|value| usize::try_from(value).ok()), + }, + TriggerStrategy::ToEndOfLine => 
Self::ToEndOfLine, + TriggerStrategy::NWords { count } => Self::NWords { + count: usize::try_from(count).unwrap_or(usize::MAX), + }, + TriggerStrategy::CompanyIdValue => Self::CompanyIdValue, + TriggerStrategy::Address { max_chars } => Self::Address { + max_chars: max_chars.and_then(|value| usize::try_from(value).ok()), + }, + TriggerStrategy::MatchPattern { pattern, flags } => Self::MatchPattern { + regex: build_fancy_regex(&format!("^(?:{pattern})"), flags.as_deref())?, + }, + }) + } +} + +impl PreparedTriggerValidation { + fn new(validation: TriggerValidation) -> Result { + Ok(match validation { + TriggerValidation::StartsUppercase => Self::StartsUppercase, + TriggerValidation::MinLength(min) => { + Self::MinLength(usize::try_from(min).unwrap_or(usize::MAX)) + } + TriggerValidation::MaxLength(max) => { + Self::MaxLength(usize::try_from(max).unwrap_or(usize::MAX)) + } + TriggerValidation::NoDigits => Self::NoDigits, + TriggerValidation::HasDigits => Self::HasDigits, + TriggerValidation::MatchesPattern { pattern, flags } => { + Self::MatchesPattern { + regex: build_regex(&pattern, flags.as_deref())?, + } + } + TriggerValidation::ValidId { validator } => Self::ValidId { validator }, + }) + } +} + +pub(crate) fn process_trigger_matches( + matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + data: &PreparedTriggerData, +) -> Result> { + let offsets = ByteOffsets::new(full_text); + let mut results = Vec::new(); + + for found in matches { + let Some(local_index) = slice.local_index(found.pattern()) else { + continue; + }; + let Some(rule) = data.rules.get(local_index) else { + continue; + }; + if !has_left_boundary(full_text, &offsets, found.start())? { + continue; + } + if !has_right_boundary(full_text, &offsets, found.end(), &rule.trigger)? { + continue; + } + if rule.requires_exact_case + && !matches_trigger_case(full_text, &offsets, found, rule)? 
+ { + continue; + } + + let Some(raw_value) = extract_value( + full_text, + &offsets, + found.end(), + &rule.strategy, + &rule.label, + &data.address_stop_keywords, + &data.party_position_terms, + )? + else { + continue; + }; + let Some(mut value) = strip_quotes(&raw_value) else { + continue; + }; + if !apply_validations(&value.text, &rule.validations) { + continue; + } + if rule.label == "phone number" + && !is_plausible_phone_trigger_value(&value.text) + { + continue; + } + if rule.label == "phone number" + && char_count(&value.text) > MAX_TRIGGER_VALUE_LEN + && char_at(full_text, &offsets, value.end)? != Some('\n') + && char_at(full_text, &offsets, value.end)? != Some('\t') + { + value = cap_phone_value(&value); + } + + let entity_start = if rule.include_trigger { + found.start() + } else { + value.start + }; + let mut entity_end = value.end; + let mut entity_text = offsets.slice(full_text, entity_start, entity_end)?; + let mut label = if rule.label == "person" + && has_known_legal_form_suffix(&entity_text, &data.legal_form_suffixes) + { + String::from("organization") + } else { + rule.label.clone() + }; + + if label == "person" + && let Some(end) = person_name_run_end(&value.text) + && end < value.text.len() + && let Some(head) = value.text.get(..end) + { + entity_end = value.start.saturating_add(u32_len(head)); + entity_text = offsets.slice(full_text, entity_start, entity_end)?; + } + + if label.is_empty() { + label.clone_from(&rule.label); + } + results.push(PipelineEntity::detected( + entity_start, + entity_end, + label, + entity_text, + TRIGGER_SCORE, + DetectionSource::Trigger, + )); + } + + Ok(results) +} + +fn extract_value( + text: &str, + offsets: &ByteOffsets<'_>, + trigger_end: u32, + strategy: &PreparedTriggerStrategy, + label: &str, + address_stop_keywords: &[String], + party_position_terms: &[String], +) -> Result> { + let trigger_end_byte = offsets.validate_offset(trigger_end)?; + let lookahead = get_trigger_lookahead(strategy); + let 
lookahead_end = + text.len().min(trigger_end_byte.saturating_add(lookahead)); + let remaining = text + .get(trigger_end_byte..lookahead_end) + .unwrap_or_default(); + let stripped = remaining.trim_start_matches(|ch: char| { + ch.is_whitespace() || matches!(ch, ':' | ';') + }); + let trimmed_offset = remaining.len().saturating_sub(stripped.len()); + let value_start_byte = trigger_end_byte.saturating_add(trimmed_offset); + if stripped.is_empty() { + return Ok(None); + } + + let extracted = match strategy { + PreparedTriggerStrategy::ToNextComma { + stop_words, + max_length, + } => extract_to_next_comma( + stripped, + value_start_byte, + label, + stop_words, + max_length.unwrap_or(MAX_TRIGGER_VALUE_LEN), + ), + PreparedTriggerStrategy::ToEndOfLine => { + extract_to_end_of_line(remaining, stripped, value_start_byte, label) + } + PreparedTriggerStrategy::NWords { count } => { + extract_n_words(stripped, value_start_byte, *count) + } + PreparedTriggerStrategy::CompanyIdValue => { + extract_company_id_value(text, trigger_end_byte) + } + PreparedTriggerStrategy::Address { max_chars } => extract_address( + stripped, + value_start_byte, + max_chars.unwrap_or(120), + address_stop_keywords, + party_position_terms, + ), + PreparedTriggerStrategy::MatchPattern { regex } => { + extract_match_pattern(stripped, value_start_byte, regex) + } + }; + Ok(extracted.and_then(|value| byte_value_to_offsets(text, offsets, value))) +} + +fn extract_to_next_comma( + value_text: &str, + value_start_byte: usize, + label: &str, + stop_words: &[String], + length_cap: usize, +) -> Option { + let mut end = 0; + while end < value_text.len() { + let Some((ch, len)) = char_at_byte(value_text, end) else { + break; + }; + if matches!(ch, '\n' | '(' | ')' | '[' | ']' | '\t' | ';') { + break; + } + if ch == '.' 
&& is_sentence_terminator(value_text, end) { + break; + } + if hits_stop_word(value_text, end, stop_words) { + break; + } + if ch == ',' { + let after = value_text.get(end..).unwrap_or_default(); + if is_decimal_comma(after) { + end = end.saturating_add(len); + continue; + } + if label == "person" + && let Some(skip) = post_nominal_len(after) + { + end = end.saturating_add(skip); + continue; + } + break; + } + end = end.saturating_add(len); + } + if prefix_char_count(value_text, end) > length_cap { + end = cap_at_word_boundary(value_text, length_cap); + } + byte_value(value_text, value_start_byte, end) +} + +fn extract_to_end_of_line( + remaining: &str, + value_text: &str, + value_start_byte: usize, + label: &str, +) -> Option { + let consumed = remaining.len().saturating_sub(value_text.len()); + if consumed > 0 && remaining.get(..consumed)?.contains('\n') { + return None; + } + let mut end = value_text.len(); + let mut found_line_stop = false; + for ch in ['\n', '\t'] { + if let Some(index) = value_text.find(ch) + && index < end + { + end = index; + found_line_stop = true; + } + } + if label == "phone number" + && let Some(shape_end) = phone_shape_end(value_text.get(..end)?) 
+ && shape_end < end + { + end = shape_end.min(MAX_TRIGGER_VALUE_LEN); + found_line_stop = true; + } + if !found_line_stop { + end = cap_at_word_boundary(value_text, end.min(MAX_TRIGGER_VALUE_LEN)); + } + byte_value(value_text, value_start_byte, end) +} + +fn extract_n_words( + value_text: &str, + value_start_byte: usize, + count: usize, +) -> Option { + let cell_end = value_text.find('\t').unwrap_or(value_text.len()); + let cell = value_text.get(..cell_end)?; + let mut words = Vec::<&str>::new(); + for word in cell.split_whitespace() { + if punctuation_only(word) || number_marker(word) { + continue; + } + words.push(word); + if words.len() >= count { + break; + } + } + let first = words.first().copied()?; + let first_index = cell.find(first)?; + let mut actual_end = first_index.saturating_add(first.len()); + let mut search_pos = actual_end; + for word in words.iter().skip(1) { + let relative = cell.get(search_pos..)?.find(word)?; + let index = search_pos.saturating_add(relative); + actual_end = index.saturating_add(word.len()); + search_pos = actual_end; + } + byte_value( + cell.get(first_index..actual_end)?, + value_start_byte.saturating_add(first_index), + actual_end.saturating_sub(first_index), + ) +} + +fn extract_company_id_value( + text: &str, + trigger_end_byte: usize, +) -> Option { + let raw = text.get(trigger_end_byte..)?; + let trigger_last = text.get(..trigger_end_byte)?.chars().next_back(); + let allow_empty_sep = matches!(trigger_last, Some('°' | 'º' | '№' | '#')); + let sep_len = separator_len(raw, allow_empty_sep)?; + let mut after_sep = raw.get(sep_len..)?; + let mut label_offset = 0; + if let Some(len) = number_label_len(after_sep) { + label_offset = len; + after_sep = after_sep.get(len..)?; + } + let id_raw = id_value_prefix(after_sep)?; + let id_text = id_raw.trim().trim_end_matches(|ch: char| { + matches!(ch, '.' | ',' | ';' | ':' | '!' 
| '?') + }); + if id_text.is_empty() { + return None; + } + let leading = id_raw.len().saturating_sub(id_raw.trim_start().len()); + Some(ByteValue { + start_byte: trigger_end_byte + .saturating_add(sep_len) + .saturating_add(label_offset) + .saturating_add(leading), + end_byte: trigger_end_byte + .saturating_add(sep_len) + .saturating_add(label_offset) + .saturating_add(leading) + .saturating_add(id_text.len()), + }) +} + +fn extract_address( + mut value_text: &str, + mut value_start_byte: usize, + max_len: usize, + stop_keywords: &[String], + party_position_terms: &[String], +) -> Option { + if let Some(trimmed) = + trim_leading_party_position(value_text, party_position_terms) + { + value_start_byte = value_start_byte.saturating_add(trimmed); + value_text = value_text.get(trimmed..)?; + } + + let mut end = 0; + while end < value_text.len() && prefix_char_count(value_text, end) < max_len { + let Some((ch, len)) = char_at_byte(value_text, end) else { + break; + }; + if matches!(ch, '\n' | '(') { + break; + } + if matches!(ch, ' ' | '\t') + && address_stop_hit(value_text.get(end..)?.trim_start(), stop_keywords) + { + break; + } + if ch == '.' { + let after_period = value_text.get(end.saturating_add(len)..)?; + if address_stop_hit(after_period.trim_start(), stop_keywords) { + break; + } + if let Some((next, _)) = char_at_byte(value_text, end.saturating_add(len)) + && (next.is_alphabetic() || next.is_ascii_digit()) + { + end = end.saturating_add(len); + continue; + } + if value_text + .get(end.saturating_add(len)..) 
+ .is_some_and(|tail| { + tail.starts_with(' ') + && tail.trim_start().chars().next().is_some_and(|next_ch| { + next_ch.is_alphabetic() || next_ch.is_ascii_digit() + }) + }) + && !is_sentence_terminator(value_text, end) + { + end = end.saturating_add(len); + continue; + } + break; + } + if ch == ',' { + let after = value_text.get(end.saturating_add(len)..)?.trim_start(); + if address_stop_hit(after, stop_keywords) { + break; + } + if after.chars().next().is_some_and(|next_ch| { + next_ch.is_ascii_digit() || next_ch.is_uppercase() + }) { + end = end.saturating_add(len); + continue; + } + break; + } + end = end.saturating_add(len); + } + if prefix_char_count(value_text, end) >= max_len + && let Some(last_space) = value_text.get(..end)?.rfind(' ') + && last_space > 0 + { + end = last_space; + } + byte_value(value_text, value_start_byte, end) +} + +fn extract_match_pattern( + value_text: &str, + value_start_byte: usize, + regex: &FancyRegex, +) -> Option { + let line = value_text + .split_once('\n') + .map_or(value_text, |(head, _)| head); + let found = regex.find(line).ok().flatten()?; + if found.start() == found.end() { + return None; + } + Some(ByteValue { + start_byte: value_start_byte.saturating_add(found.start()), + end_byte: value_start_byte.saturating_add(found.end()), + }) +} + +#[derive(Clone, Copy)] +struct ByteValue { + start_byte: usize, + end_byte: usize, +} + +fn byte_value( + value_text: &str, + value_start_byte: usize, + end: usize, +) -> Option { + let raw = value_text.get(..end)?; + let trimmed = raw.trim(); + if trimmed.is_empty() { + return None; + } + let leading = raw.len().saturating_sub(raw.trim_start().len()); + let trailing = raw.len().saturating_sub(raw.trim_end().len()); + Some(ByteValue { + start_byte: value_start_byte.saturating_add(leading), + end_byte: value_start_byte + .saturating_add(end) + .saturating_sub(trailing), + }) +} + +fn byte_value_to_offsets( + full_text: &str, + _offsets: &ByteOffsets<'_>, + value: ByteValue, +) -> Option 
{
+  if !full_text.is_char_boundary(value.start_byte)
+    || !full_text.is_char_boundary(value.end_byte)
+  {
+    return None;
+  }
+  Some(ExtractedValue {
+    start: byte_to_offset(value.start_byte)?,
+    end: byte_to_offset(value.end_byte)?,
+    text: full_text.get(value.start_byte..value.end_byte)?.to_owned(),
+  })
+}
+
+/// Trims surrounding whitespace, quotation marks and parentheses from an
+/// extracted value and shifts its offsets to match.
+///
+/// The leading trim additionally removes the low opening quote `„`. `start`
+/// is advanced by the byte length of the removed prefix and `end` is
+/// recomputed from the byte length of what remains, so both stay aligned
+/// with `text`.
+///
+/// Returns `None` when nothing is left after trimming.
+fn strip_quotes(value: &ExtractedValue) -> Option<ExtractedValue> {
+  let leading = value.text.len().saturating_sub(
+    value
+      .text
+      .trim_start_matches(|ch: char| {
+        ch.is_whitespace()
+          || matches!(ch, '„' | '"' | '»' | '«' | '\'' | '(' | ')')
+      })
+      .len(),
+  );
+  let stripped = value.text.get(leading..)?.trim_end_matches(|ch: char| {
+    ch.is_whitespace() || matches!(ch, '"' | '»' | '«' | '\'' | '(' | ')')
+  });
+  if stripped.is_empty() {
+    return None;
+  }
+  Some(ExtractedValue {
+    start: value
+      .start
+      .saturating_add(u32_len(value.text.get(..leading)?)),
+    end: value
+      .start
+      .saturating_add(u32_len(value.text.get(..leading)?))
+      .saturating_add(u32_len(stripped)),
+    text: stripped.to_owned(),
+  })
+}
+
+/// Returns `true` when `text` satisfies every validation in `validations`
+/// (vacuously `true` for an empty list).
+///
+/// `MinLength` and `MaxLength` are measured in characters, matching the
+/// character-based length caps used by the extraction helpers in this file.
+/// Counting UTF-8 bytes here would over-count every non-ASCII letter and
+/// reject (or wrongly accept) accented, Cyrillic or Greek values.
+fn apply_validations(
+  text: &str,
+  validations: &[PreparedTriggerValidation],
+) -> bool {
+  validations.iter().all(|validation| match validation {
+    PreparedTriggerValidation::StartsUppercase => {
+      text.chars().next().is_some_and(char::is_uppercase)
+    }
+    // Length limits count characters, not bytes.
+    PreparedTriggerValidation::MinLength(min) => text.chars().count() >= *min,
+    PreparedTriggerValidation::MaxLength(max) => text.chars().count() <= *max,
+    PreparedTriggerValidation::NoDigits => {
+      !text.chars().any(|ch| ch.is_ascii_digit())
+    }
+    PreparedTriggerValidation::HasDigits => {
+      text.chars().any(|ch| ch.is_ascii_digit())
+    }
+    PreparedTriggerValidation::MatchesPattern { regex } => regex.is_match(text),
+    PreparedTriggerValidation::ValidId { validator } => {
+      validate_named_id(validator, text)
+    }
+  })
+}
+
+fn build_regex(pattern: &str, flags: Option<&str>) -> Result {
+  let mut builder = RegexBuilder::new(pattern);
+  if flags.is_some_and(|flags| flags.contains('i')) {
+    builder.case_insensitive(true);
+  }
+
builder.build().map_err(|error| Error::Search { + engine: crate::types::SearchEngine::Regex, + reason: error.to_string(), + }) +} + +fn build_fancy_regex(pattern: &str, flags: Option<&str>) -> Result { + let source = if flags.is_some_and(|flags| flags.contains('i')) { + format!("(?i:{pattern})") + } else { + pattern.to_owned() + }; + FancyRegex::new(&source).map_err(|error| Error::Search { + engine: crate::types::SearchEngine::Regex, + reason: error.to_string(), + }) +} + +fn get_trigger_lookahead(strategy: &PreparedTriggerStrategy) -> usize { + match strategy { + PreparedTriggerStrategy::ToNextComma { max_length, .. } => max_length + .unwrap_or(MAX_TRIGGER_VALUE_LEN) + .saturating_add(TRIGGER_LOOKAHEAD_MARGIN), + PreparedTriggerStrategy::ToEndOfLine => LINE_TRIGGER_LOOKAHEAD, + PreparedTriggerStrategy::NWords { count } => count + .saturating_mul(64) + .saturating_add(TRIGGER_LOOKAHEAD_MARGIN), + PreparedTriggerStrategy::CompanyIdValue => 256, + PreparedTriggerStrategy::Address { max_chars } => max_chars + .unwrap_or(120) + .saturating_add(TRIGGER_LOOKAHEAD_MARGIN), + PreparedTriggerStrategy::MatchPattern { .. } => MATCH_PATTERN_LOOKAHEAD, + } +} + +fn has_left_boundary( + text: &str, + offsets: &ByteOffsets<'_>, + start: u32, +) -> Result { + if start == 0 { + return Ok(true); + } + let byte = offsets.validate_offset(start)?; + Ok( + !text + .get(..byte) + .and_then(|prefix| prefix.chars().next_back()) + .is_some_and(char::is_alphabetic), + ) +} + +fn has_right_boundary( + text: &str, + offsets: &ByteOffsets<'_>, + end: u32, + trigger: &str, +) -> Result { + let Some(last) = trigger.chars().next_back() else { + return Ok(false); + }; + if !last.is_alphabetic() { + return Ok(true); + } + let byte = offsets.validate_offset(end)?; + Ok( + !text + .get(byte..) 
+ .and_then(|suffix| suffix.chars().next()) + .is_some_and(char::is_alphabetic), + ) +} + +fn matches_trigger_case( + text: &str, + offsets: &ByteOffsets<'_>, + found: &SearchMatch, + rule: &PreparedTriggerRule, +) -> Result { + Ok(offsets.slice(text, found.start(), found.end())? == rule.trigger) +} + +fn requires_exact_case_trigger(trigger: &str) -> bool { + let mut letters = 0usize; + for ch in trigger.chars() { + if ch.is_whitespace() { + return false; + } + if !ch.is_alphabetic() { + continue; + } + letters = letters.saturating_add(1); + if !ch.is_uppercase() { + return false; + } + } + letters >= 2 +} + +fn char_at( + text: &str, + offsets: &ByteOffsets<'_>, + offset: u32, +) -> Result> { + let byte = offsets.validate_offset(offset)?; + Ok(text.get(byte..).and_then(|suffix| suffix.chars().next())) +} + +fn char_at_byte(text: &str, byte: usize) -> Option<(char, usize)> { + text + .get(byte..) + .and_then(|tail| tail.chars().next()) + .map(|ch| (ch, ch.len_utf8())) +} + +fn cap_at_word_boundary(value_text: &str, cap: usize) -> usize { + let mut capped = byte_index_after_chars(value_text, cap); + while capped > 0 + && previous_char_is_word(value_text, capped) + && is_word_byte(value_text, capped) + { + capped = previous_char_boundary(value_text, capped); + } + capped +} + +fn byte_index_after_chars(value_text: &str, count: usize) -> usize { + value_text + .char_indices() + .nth(count) + .map_or(value_text.len(), |(index, _)| index) +} + +fn prefix_char_count(value_text: &str, end: usize) -> usize { + value_text + .get(..end) + .map_or(usize::MAX, |prefix| prefix.chars().count()) +} + +fn char_count(value_text: &str) -> usize { + value_text.chars().count() +} + +fn previous_char_is_word(text: &str, byte: usize) -> bool { + text + .get(..byte) + .and_then(|prefix| prefix.chars().next_back()) + .is_some_and(char::is_alphanumeric) +} + +fn previous_char_boundary(text: &str, byte: usize) -> usize { + text + .get(..byte) + .and_then(|prefix| 
prefix.char_indices().next_back())
+    .map_or(0, |(index, _)| index)
+}
+
+/// Reports whether the character that starts at byte index `byte` of `text`
+/// is alphanumeric.
+///
+/// Yields `false` when `byte` is past the end of `text`, is not a character
+/// boundary, or sits exactly at the end of the string.
+fn is_word_byte(text: &str, byte: usize) -> bool {
+  text
+    .get(byte..)
+    .and_then(|tail| tail.chars().next())
+    .is_some_and(char::is_alphanumeric)
+}
+
+/// Reports whether one of `stop_words` begins at byte index `byte` of `text`
+/// as a whole word.
+///
+/// A hit requires that the character before `byte` (if any) is not
+/// alphanumeric, that the text at `byte` starts with the stop word (ASCII
+/// case-insensitively), and that the character after the stop word (if any)
+/// is not alphanumeric.
+///
+/// Returns `false` when `stop_words` is empty or when `byte` is not a valid
+/// character boundary inside `text`.
+fn hits_stop_word(text: &str, byte: usize, stop_words: &[String]) -> bool {
+  if stop_words.is_empty() {
+    return false;
+  }
+  // Decode the previous character backwards from `byte`. Probing `byte - 1`
+  // lands inside a multi-byte character (for example after `é`), which made
+  // the guard silently pass and let stop words match in the middle of a word.
+  let preceded_by_word_char = text
+    .get(..byte)
+    .and_then(|head| head.chars().next_back())
+    .is_some_and(char::is_alphanumeric);
+  if preceded_by_word_char {
+    return false;
+  }
+  let Some(tail) = text.get(byte..) else {
+    return false;
+  };
+  stop_words.iter().any(|word| {
+    tail
+      .get(..word.len())
+      .is_some_and(|head| head.eq_ignore_ascii_case(word))
+      && tail
+        .get(word.len()..)
+        .and_then(|after| after.chars().next())
+        .is_none_or(|ch| !ch.is_alphanumeric())
+  })
+}
+
+/// Reports whether `text` starts with a comma that belongs to a number or a
+/// range rather than separating two values: the comma must be followed
+/// directly by an ASCII digit or by `-`, `–` or `—`.
+fn is_decimal_comma(text: &str) -> bool {
+  let mut chars = text.chars();
+  if chars.next() != Some(',') {
+    return false;
+  }
+  chars
+    .next()
+    .is_some_and(|ch| ch.is_ascii_digit() || matches!(ch, '-' | '–' | '—'))
+}
+
+/// Measures a comma-introduced token at the start of `text`.
+///
+/// `text` must begin with `,`. Whitespace after the comma is skipped and the
+/// following run of alphabetic characters and `.` is consumed. The result is
+/// the byte length from the start of `text` to the end of that run.
+///
+/// Returns `None` when `text` does not start with a comma or the run is
+/// empty.
+fn post_nominal_len(text: &str) -> Option<usize> {
+  let trimmed = text.strip_prefix(',')?.trim_start();
+  let len_before = text.len().saturating_sub(trimmed.len());
+  let mut token_end = 0;
+  for (index, ch) in trimmed.char_indices() {
+    if ch.is_alphabetic() || ch == '.' {
+      token_end = index.saturating_add(ch.len_utf8());
+      continue;
+    }
+    break;
+  }
+  (token_end > 0).then_some(len_before.saturating_add(token_end))
+}
+
+fn is_sentence_terminator(text: &str, period_byte: usize) -> bool {
+  let Some(tail) = text.get(period_byte..)
else { + return false; + }; + let starts_next = tail.strip_prefix('.').is_some_and(|after| { + after.trim_start().is_empty() || after.starts_with(char::is_whitespace) + }); + if !starts_next { + return false; + } + let head = text.get(..period_byte).unwrap_or_default(); + head + .chars() + .rev() + .take_while(|ch| ch.is_alphabetic()) + .filter(|ch| ch.is_lowercase()) + .count() + >= 5 + || head + .chars() + .next_back() + .is_some_and(|ch| ch.is_ascii_digit()) +} + +fn punctuation_only(text: &str) -> bool { + text.chars().all(|ch| !ch.is_alphanumeric()) +} + +fn number_marker(text: &str) -> bool { + matches!( + text.to_ascii_lowercase().as_str(), + "nº" | "no" | "n°" | "n." | "№" + ) +} + +fn phone_shape_end(text: &str) -> Option { + let mut chars = text.char_indices(); + let (_, first) = chars.next()?; + if !(first == '+' || first == '(' || first.is_ascii_digit()) { + return None; + } + let mut end = first.len_utf8(); + for (index, ch) in chars { + if ch.is_ascii_digit() + || ch.is_whitespace() + || matches!(ch, '(' | ')' | '.' 
| '/' | '-' | '–' | '—' | '‑') + { + end = index.saturating_add(ch.len_utf8()); + continue; + } + break; + } + while end > 0 + && text + .get(..end) + .and_then(|head| head.chars().next_back()) + .is_some_and(|ch| !ch.is_ascii_digit()) + { + end = end.saturating_sub(next_len_backward(text, end)); + } + (end > 0).then_some(end) +} + +fn next_len_backward(text: &str, byte: usize) -> usize { + text + .get(..byte) + .and_then(|head| head.chars().next_back()) + .map_or(1, char::len_utf8) +} + +fn is_plausible_phone_trigger_value(value: &str) -> bool { + let trimmed = value.trim_start(); + if !trimmed + .chars() + .next() + .is_some_and(|ch| ch == '+' || ch == '(' || ch.is_ascii_digit()) + { + return false; + } + if looks_like_iso_date(trimmed) || inline_field_label(trimmed) { + return false; + } + trimmed.chars().filter(char::is_ascii_digit).count() + >= MIN_TRIGGER_PHONE_DIGITS +} + +fn looks_like_iso_date(text: &str) -> bool { + let bytes = text.as_bytes(); + bytes.len() >= 10 + && bytes + .get(0..4) + .is_some_and(|part| part.iter().all(u8::is_ascii_digit)) + && bytes.get(4) == Some(&b'-') + && bytes + .get(5..7) + .is_some_and(|part| part.iter().all(u8::is_ascii_digit)) + && bytes.get(7) == Some(&b'-') + && bytes + .get(8..10) + .is_some_and(|part| part.iter().all(u8::is_ascii_digit)) +} + +fn inline_field_label(text: &str) -> bool { + let mut letters = 0_usize; + for ch in text.chars().take(40) { + if ch == ':' && letters >= 2 { + return true; + } + if ch.is_alphabetic() || matches!(ch, ' ' | '/' | '-') { + letters = letters.saturating_add(usize::from(ch.is_alphabetic())); + continue; + } + if letters > 0 { + break; + } + } + false +} + +fn cap_phone_value(value: &ExtractedValue) -> ExtractedValue { + let capped_end = cap_at_word_boundary(&value.text, MAX_TRIGGER_VALUE_LEN) + .min(MAX_TRIGGER_VALUE_LEN); + let capped = value.text.get(..capped_end).unwrap_or_default().trim_end(); + ExtractedValue { + start: value.start, + end: 
value.start.saturating_add(u32_len(capped)), + text: capped.to_owned(), + } +} + +fn trim_leading_party_position(text: &str, terms: &[String]) -> Option { + for prefix in terms { + let prefix_len = prefix.len(); + let Some(head) = text.get(..prefix_len) else { + continue; + }; + if head.to_lowercase() != *prefix { + continue; + } + let rest = text.get(prefix_len..)?; + let ws_len = rest.len().saturating_sub(rest.trim_start().len()); + if ws_len == 0 { + continue; + } + let candidate = rest.get(ws_len..)?; + if candidate + .chars() + .next() + .is_some_and(|ch| ch.is_uppercase() || ch.is_ascii_digit()) + { + return Some(prefix_len.saturating_add(ws_len)); + } + } + None +} + +fn address_stop_hit(text: &str, stop_keywords: &[String]) -> bool { + let lower = text.to_lowercase(); + stop_keywords.iter().any(|keyword| { + lower.starts_with(keyword) + && lower + .get(keyword.len()..) + .and_then(|after| after.chars().next()) + .is_none_or(|ch| { + ch.is_whitespace() + || matches!(ch, ':' | ';' | ',' | '.' | '!' | '?' | '(' | ')') + || ch.is_ascii_digit() + }) + }) +} + +fn separator_len(raw: &str, allow_empty: bool) -> Option { + let trimmed_colon = raw.trim_start(); + let leading = raw.len().saturating_sub(trimmed_colon.len()); + if let Some(after_colon) = trimmed_colon.strip_prefix(':') { + return Some( + leading.saturating_add(1).saturating_add( + after_colon + .len() + .saturating_sub(after_colon.trim_start().len()), + ), + ); + } + if leading > 0 || allow_empty { + return Some(leading); + } + None +} + +fn number_label_len(text: &str) -> Option { + let labels = ["nr", "nr.", "numer", "nº", "no", "no.", "n°", "n.", "№"]; + for label in labels { + let Some(rest) = text.get(label.len()..) 
else { + continue; + }; + if text + .get(..label.len()) + .is_some_and(|head| head.eq_ignore_ascii_case(label)) + && (rest.starts_with(char::is_whitespace) || rest.starts_with(':')) + { + return Some(label.len().saturating_add(separator_len(rest, false)?)); + } + } + None +} + +fn id_value_prefix(text: &str) -> Option<&str> { + let mut end = 0; + let mut digits = 0_usize; + let mut previous_was_digit = false; + for (index, ch) in text.char_indices() { + let allowed = if ch.is_ascii_digit() { + digits = digits.saturating_add(1); + previous_was_digit = true; + true + } else if ch.is_ascii_alphabetic() { + let allow = digits == 0 || previous_was_digit; + previous_was_digit = false; + allow + } else if matches!(ch, ' ' | '.' | '-' | '/' | '\t') { + previous_was_digit = false; + true + } else { + false + }; + if !allowed { + break; + } + end = index.saturating_add(ch.len_utf8()); + } + (digits >= 2 && end >= 5).then(|| text.get(..end)).flatten() +} + +fn has_known_legal_form_suffix(text: &str, suffixes: &[String]) -> bool { + suffixes.iter().any(|suffix| { + let mut from = 0; + while let Some(relative) = + text.get(from..).and_then(|tail| tail.find(suffix)) + { + let start = from.saturating_add(relative); + let end = start.saturating_add(suffix.len()); + from = start.saturating_add(1); + if !suffix.chars().all(char::is_alphabetic) { + return true; + } + let left = text + .get(..start) + .and_then(|head| head.chars().next_back()) + .is_none_or(|ch| !ch.is_alphanumeric()); + let right = text + .get(end..) 
+ .and_then(|tail| tail.chars().next()) + .is_none_or(|ch| !ch.is_alphanumeric()); + if left && right { + return true; + } + } + false + }) +} + +fn person_name_run_end(text: &str) -> Option { + let mut end = 0; + let mut saw_token = false; + for token in text.split_whitespace() { + let trimmed = token.trim_matches(','); + if trimmed.chars().next().is_some_and(char::is_uppercase) { + let relative = text.get(end..)?.find(token)?; + end = end.saturating_add(relative).saturating_add(token.len()); + saw_token = true; + continue; + } + break; + } + saw_token.then_some(end) +} + +fn u32_len(text: &str) -> u32 { + u32::try_from(text.len()).unwrap_or(u32::MAX) +} + +fn byte_to_offset(byte: usize) -> Option { + u32::try_from(byte).ok() +} diff --git a/crates/anonymize-core/src/types.rs b/crates/anonymize-core/src/types.rs index 0de292fb..08fb087c 100644 --- a/crates/anonymize-core/src/types.rs +++ b/crates/anonymize-core/src/types.rs @@ -41,6 +41,10 @@ pub enum Error { MissingStaticData { field: &'static str, }, + InvalidStaticData { + field: &'static str, + reason: String, + }, StaticDataLengthMismatch { field: &'static str, expected: usize, @@ -99,6 +103,12 @@ impl fmt::Display for Error { Self::MissingStaticData { field } => { write!(formatter, "Static data field '{field}' is required") } + Self::InvalidStaticData { field, reason } => { + write!( + formatter, + "Static data field '{field}' is invalid: {reason}" + ) + } Self::StaticDataLengthMismatch { field, expected, diff --git a/crates/anonymize-core/src/validators.rs b/crates/anonymize-core/src/validators.rs new file mode 100644 index 00000000..632fb77a --- /dev/null +++ b/crates/anonymize-core/src/validators.rs @@ -0,0 +1,647 @@ +const SPANISH_CHECK_LETTERS: &str = "TRWAGMYFPDXBNJZSQVHLCKE"; +const SPANISH_CIF_LETTERS: &str = "JABCDEFGHI"; + +pub(crate) fn validate_named_id(validator: &str, value: &str) -> bool { + validate_id(validator, value, None) +} + +pub(crate) fn validate_id( + validator: &str, + value: &str, 
+ input: Option<&str>, +) -> bool { + let candidate = validator_candidate(value, input); + match validator { + "au.abn" => validate_au_abn(&candidate), + "br.cnpj" => validate_cnpj(&candidate), + "br.cpf" => validate_cpf(&candidate), + "cz.dic" => validate_cz_dic(&candidate), + "cz.rc" => validate_cz_rc(&candidate), + "es.cif" => validate_es_cif(&candidate), + "es.dni" => validate_es_dni(&candidate), + "es.nie" => validate_es_nie(&candidate), + "gb.nhs" => validate_gb_nhs(&candidate), + "gb.nino" => validate_gb_nino(&candidate), + "no.mva" => validate_no_mva(&candidate), + "no.orgnr" => validate_no_orgnr(&candidate), + "us.ein" => validate_us_ein(&candidate), + "us.rtn" => validate_us_routing(&candidate), + _ => false, + } +} + +fn validator_candidate(value: &str, input: Option<&str>) -> String { + match input { + Some("digits-only") => decimal_digit_chars(value).collect(), + _ => value.to_owned(), + } +} + +fn validate_us_ein(value: &str) -> bool { + let compact = compact_without(value, &[' ', '-']); + if compact.len() != 9 || !is_ascii_digits(&compact) { + return false; + } + let Some(prefix) = compact.get(0..2) else { + return false; + }; + matches!( + prefix, + "01" + | "02" + | "03" + | "04" + | "05" + | "06" + | "10" + | "11" + | "12" + | "13" + | "14" + | "15" + | "16" + | "20" + | "21" + | "22" + | "23" + | "24" + | "25" + | "26" + | "27" + | "30" + | "31" + | "32" + | "33" + | "34" + | "35" + | "36" + | "37" + | "38" + | "39" + | "40" + | "41" + | "42" + | "43" + | "44" + | "45" + | "46" + | "47" + | "48" + | "50" + | "51" + | "52" + | "53" + | "54" + | "55" + | "56" + | "57" + | "58" + | "59" + | "60" + | "61" + | "62" + | "63" + | "64" + | "65" + | "66" + | "67" + | "68" + | "71" + | "72" + | "73" + | "74" + | "75" + | "76" + | "77" + | "80" + | "81" + | "82" + | "83" + | "84" + | "85" + | "86" + | "87" + | "88" + | "90" + | "91" + | "92" + | "93" + | "94" + | "95" + | "98" + | "99" + ) +} + +fn validate_cpf(value: &str) -> bool { + let compact = 
compact_without(value, &[' ', '-', '.']); + let Ok(digits) = <[u32; 11]>::try_from(decimal_digits_strict(&compact)) + else { + return false; + }; + let [d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10] = digits; + if digits.iter().all(|digit| *digit == d0) { + return false; + } + let first = cpf_digit(&[d0, d1, d2, d3, d4, d5, d6, d7, d8], 10); + let second = cpf_digit(&[d0, d1, d2, d3, d4, d5, d6, d7, d8, d9], 11); + d9 == first && d10 == second +} + +fn cpf_digit(digits: &[u32], weight_start: u32) -> u32 { + let sum = digits + .iter() + .enumerate() + .map(|(index, digit)| { + let index = u32::try_from(index).unwrap_or(u32::MAX); + digit.saturating_mul(weight_start.saturating_sub(index)) + }) + .sum::(); + let value = 11_u32.saturating_sub(sum.rem_euclid(11)); + if value >= 10 { 0 } else { value } +} + +fn validate_cnpj(value: &str) -> bool { + let compact = compact_without(value, &[' ', '-', '.', '/']).to_uppercase(); + let chars = compact.chars().collect::>(); + let Ok(chars) = <[char; 14]>::try_from(chars) else { + return false; + }; + if !chars + .iter() + .all(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase()) + { + return false; + } + if chars.iter().take(12).all(|ch| *ch == '0') { + return false; + } + let first = cnpj_digit( + chars.get(..12).unwrap_or(&[]), + &[5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2], + ); + let second = cnpj_digit( + chars.get(..13).unwrap_or(&[]), + &[6, 5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2], + ); + chars.get(12).and_then(|ch| ascii_digit_value(*ch)) == Some(first) + && chars.get(13).and_then(|ch| ascii_digit_value(*ch)) == Some(second) +} + +fn cnpj_digit(chars: &[char], weights: &[u32]) -> u32 { + let sum = chars + .iter() + .zip(weights) + .filter_map(|(ch, weight)| { + cnpj_char_value(*ch).map(|value| value.saturating_mul(*weight)) + }) + .sum::(); + let value = sum.rem_euclid(11); + if value < 2 { + 0 + } else { + 11_u32.saturating_sub(value) + } +} + +fn cnpj_char_value(ch: char) -> Option { + (ch.is_ascii_digit() || 
ch.is_ascii_uppercase()) + .then(|| u32::from(ch).saturating_sub(u32::from('0'))) +} + +fn validate_cz_rc(value: &str) -> bool { + let compact = compact_without(value, &[' ', '/']); + let digits = decimal_digits_strict(&compact); + let len = digits.len(); + if len != 9 && len != 10 { + return false; + } + + let Some(yy) = number_from_digits(digits.get(0..2)) else { + return false; + }; + let Some(raw_month) = number_from_digits(digits.get(2..4)) else { + return false; + }; + let Some(day) = number_from_digits(digits.get(4..6)) else { + return false; + }; + + let mut year = 1900_u32.saturating_add(yy); + if len == 9 { + if year >= 1980 { + year = year.saturating_sub(100); + } + if year > 1953 { + return false; + } + } else if year < 1954 { + year = year.saturating_add(100); + } + + let Some(month) = decode_cz_month(raw_month, year, len) else { + return false; + }; + if !valid_date(year, month, day) { + return false; + } + if len != 10 { + return true; + } + + let Some(front) = number_from_digits(digits.get(0..9)) else { + return false; + }; + let Some(check) = digits.get(9).copied() else { + return false; + }; + (front % 11) % 10 == check +} + +fn decode_cz_month(raw_month: u32, year: u32, len: usize) -> Option { + let offsets: &[u32] = if len == 10 && year >= 2004 { + &[0, 50, 20, 70] + } else { + &[0, 50] + }; + offsets.iter().find_map(|offset| { + let month = raw_month.checked_sub(*offset)?; + (1..=12).contains(&month).then_some(month) + }) +} + +fn validate_cz_dic(value: &str) -> bool { + let mut compact = compact_without(value, &[' ', '-']); + if compact.starts_with("CZ") || compact.starts_with("cz") { + compact = compact.chars().skip(2).collect(); + } + let digits = decimal_digits_strict(&compact); + if !(8..=10).contains(&digits.len()) { + return false; + } + match digits.len() { + 8 => validate_cz_dic_legal(&digits), + 9 if digits.first() == Some(&6) => validate_cz_dic_special(&digits), + 9 | 10 => validate_cz_rc(&compact), + _ => false, + } +} + +fn 
validate_cz_dic_legal(digits: &[u32]) -> bool { + if digits.first() == Some(&9) { + return false; + } + let Some(check) = digits.get(7).copied() else { + return false; + }; + let sum = + weighted_sum(digits.get(0..7).unwrap_or(&[]), &[8, 7, 6, 5, 4, 3, 2]) + .rem_euclid(11); + let v11 = 11_u32.saturating_sub(sum).rem_euclid(11); + let expected = if v11 == 0 { 1 } else { v11 % 10 }; + check == expected +} + +fn validate_cz_dic_special(digits: &[u32]) -> bool { + let Some(check_digit) = digits.get(8).copied() else { + return false; + }; + let sum = + weighted_sum(digits.get(1..8).unwrap_or(&[]), &[8, 7, 6, 5, 4, 3, 2]) + .rem_euclid(11); + let inner = 10_u32.saturating_add(11).saturating_sub(sum).rem_euclid(11); + let check = 8_u32 + .saturating_add(10) + .saturating_sub(inner) + .rem_euclid(10); + check_digit == check +} + +fn validate_gb_nhs(value: &str) -> bool { + let digits = decimal_digits_strict(value); + let Ok(digits) = <[u32; 10]>::try_from(digits) else { + return false; + }; + let [d0, d1, d2, d3, d4, d5, d6, d7, d8, d9] = digits; + let total = weighted_sum( + &[d0, d1, d2, d3, d4, d5, d6, d7, d8], + &[10, 9, 8, 7, 6, 5, 4, 3, 2], + ); + let check = 11_u32.saturating_sub(total.rem_euclid(11)); + let expected = match check { + 10 => return false, + 11 => 0, + candidate => candidate, + }; + d9 == expected +} + +fn validate_gb_nino(value: &str) -> bool { + let compact = compact_without(value, &[' ', '-']).to_uppercase(); + let chars = compact.chars().collect::>(); + let Ok(chars) = <[char; 9]>::try_from(chars) else { + return false; + }; + let [first, second, d0, d1, d2, d3, d4, d5, suffix] = chars; + if !matches!( + first, + 'A' + | 'B' + | 'C' + | 'E' + | 'G' + | 'H' + | 'J' + | 'K' + | 'L' + | 'M' + | 'N' + | 'O' + | 'P' + | 'R' + | 'S' + | 'T' + | 'W' + | 'X' + | 'Y' + | 'Z' + ) { + return false; + } + if !matches!( + second, + 'A' + | 'B' + | 'C' + | 'E' + | 'G' + | 'H' + | 'J' + | 'K' + | 'L' + | 'M' + | 'N' + | 'P' + | 'R' + | 'S' + | 'T' + | 'W' + | 
'X' + | 'Y' + | 'Z' + ) { + return false; + } + if ![d0, d1, d2, d3, d4, d5].iter().all(char::is_ascii_digit) { + return false; + } + if !matches!(suffix, 'A' | 'B' | 'C' | 'D') { + return false; + } + let prefix = [first, second].iter().collect::(); + !matches!( + prefix.as_str(), + "BG" | "GB" | "NK" | "KN" | "TN" | "NT" | "ZZ" + ) +} + +fn validate_es_dni(value: &str) -> bool { + let compact = compact_without(value, &[' ', '-']).to_uppercase(); + let chars = compact.chars().collect::>(); + let Ok(chars) = <[char; 9]>::try_from(chars) else { + return false; + }; + let [d0, d1, d2, d3, d4, d5, d6, d7, letter] = chars; + let digits = [d0, d1, d2, d3, d4, d5, d6, d7]; + let Some(number) = number_from_ascii_digits(&digits) else { + return false; + }; + spanish_check_letter(number) == Some(letter) +} + +fn validate_es_nie(value: &str) -> bool { + let compact = compact_without(value, &[' ', '-']).to_uppercase(); + let chars = compact.chars().collect::>(); + let Ok(chars) = <[char; 9]>::try_from(chars) else { + return false; + }; + let [prefix, d0, d1, d2, d3, d4, d5, d6, letter] = chars; + let prefix_value: u32 = match prefix { + 'X' => 0, + 'Y' => 1, + 'Z' => 2, + _ => return false, + }; + let digits = [d0, d1, d2, d3, d4, d5, d6]; + let Some(number) = number_from_ascii_digits(&digits) else { + return false; + }; + spanish_check_letter( + prefix_value + .saturating_mul(10_000_000) + .saturating_add(number), + ) == Some(letter) +} + +fn validate_es_cif(value: &str) -> bool { + let mut compact = compact_without(value, &[' ', '-', '/', '.']); + if compact.starts_with("ES") || compact.starts_with("es") { + compact = compact.chars().skip(2).collect(); + } + let compact = compact.to_uppercase(); + let chars = compact.chars().collect::>(); + let Ok(chars) = <[char; 9]>::try_from(chars) else { + return false; + }; + let [prefix, d0, d1, d2, d3, d4, d5, d6, check] = chars; + if !matches!( + prefix, + 'A' + | 'B' + | 'C' + | 'D' + | 'E' + | 'F' + | 'G' + | 'H' + | 'J' + | 'N' + 
| 'P' + | 'Q' + | 'R' + | 'S' + | 'U' + | 'V' + | 'W' + ) { + return false; + } + let digits = [d0, d1, d2, d3, d4, d5, d6]; + if !digits.iter().all(char::is_ascii_digit) { + return false; + } + let Some(cif_check) = spanish_cif_checksum(&digits) else { + return false; + }; + ascii_digit_value(check) == Some(cif_check) + || char_at(SPANISH_CIF_LETTERS, cif_check) == Some(check) +} + +fn spanish_check_letter(number: u32) -> Option { + char_at(SPANISH_CHECK_LETTERS, number % 23) +} + +fn spanish_cif_checksum(digits: &[char; 7]) -> Option { + let mut even = 0_u32; + let mut odd = 0_u32; + for (index, ch) in digits.iter().enumerate() { + let digit = ascii_digit_value(*ch)?; + if index.is_multiple_of(2) { + let doubled = digit.saturating_mul(2); + odd = odd.saturating_add( + doubled + .div_euclid(10) + .saturating_add(doubled.rem_euclid(10)), + ); + } else { + even = even.saturating_add(digit); + } + } + Some( + 10_u32 + .saturating_sub(even.saturating_add(odd).rem_euclid(10)) + .rem_euclid(10), + ) +} + +fn validate_au_abn(value: &str) -> bool { + let compact = compact_without(value, &[' ', '-']); + let Ok(mut digits) = <[u32; 11]>::try_from(decimal_digits_strict(&compact)) + else { + return false; + }; + let Some(first) = digits.first_mut() else { + return false; + }; + *first = (*first).saturating_sub(1); + weighted_sum(&digits, &[10, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19]) + .is_multiple_of(89) +} + +fn validate_no_orgnr(value: &str) -> bool { + let compact = compact_without(value, &[' ', '-']); + let digits = decimal_digits_strict(&compact); + if digits.len() != 9 { + return false; + } + weighted_sum(&digits, &[3, 2, 7, 6, 5, 4, 3, 2, 1]).is_multiple_of(11) +} + +fn validate_no_mva(value: &str) -> bool { + let mut compact = compact_without(value, &[' ', '-']).to_uppercase(); + if compact.starts_with("NO") { + compact = compact.chars().skip(2).collect(); + } + if !compact.ends_with("MVA") { + return false; + } + let digits = compact + 
.get(..compact.len().saturating_sub(3)) + .unwrap_or_default(); + validate_no_orgnr(digits) +} + +fn validate_us_routing(value: &str) -> bool { + let compact = compact_without(value, &[' ', '-']); + let Ok(digits) = <[u32; 9]>::try_from(decimal_digits_strict(&compact)) else { + return false; + }; + let [d0, d1, d2, d3, d4, d5, d6, d7, d8] = digits; + let prefix = d0.saturating_mul(10).saturating_add(d1); + if !((1..=12).contains(&prefix) + || (21..=32).contains(&prefix) + || (61..=72).contains(&prefix) + || prefix == 80) + { + return false; + } + let first = d0.saturating_add(d3).saturating_add(d6).saturating_mul(3); + let second = d1.saturating_add(d4).saturating_add(d7).saturating_mul(7); + let checksum = first + .saturating_add(second) + .saturating_add(d2) + .saturating_add(d5) + .saturating_add(d8); + checksum.is_multiple_of(10) +} + +fn compact_without(value: &str, skipped: &[char]) -> String { + value.chars().filter(|ch| !skipped.contains(ch)).collect() +} + +fn decimal_digits(value: &str) -> Vec { + decimal_digit_chars(value) + .filter_map(|ch| ch.to_digit(10)) + .collect() +} + +fn decimal_digits_strict(value: &str) -> Vec { + if !is_ascii_digits(value) { + return Vec::new(); + } + decimal_digits(value) +} + +fn decimal_digit_chars(value: &str) -> impl Iterator + '_ { + value.chars().filter(char::is_ascii_digit) +} + +fn is_ascii_digits(value: &str) -> bool { + !value.is_empty() && value.chars().all(|ch| ch.is_ascii_digit()) +} + +fn ascii_digit_value(ch: char) -> Option { + ch.to_digit(10).filter(|_| ch.is_ascii_digit()) +} + +fn number_from_digits(digits: Option<&[u32]>) -> Option { + digits?.iter().try_fold(0_u32, |total, digit| { + total.checked_mul(10)?.checked_add(*digit) + }) +} + +fn number_from_ascii_digits(chars: &[char]) -> Option { + chars.iter().try_fold(0_u32, |total, ch| { + total.checked_mul(10)?.checked_add(ascii_digit_value(*ch)?) 
+ }) +} + +fn char_at(text: &str, index: u32) -> Option { + usize::try_from(index) + .ok() + .and_then(|index| text.chars().nth(index)) +} + +fn weighted_sum(digits: &[u32], weights: &[u32]) -> u32 { + digits + .iter() + .zip(weights) + .map(|(digit, weight)| digit.saturating_mul(*weight)) + .sum() +} + +fn valid_date(year: u32, month: u32, day: u32) -> bool { + let days = match month { + 1 | 3 | 5 | 7 | 8 | 10 | 12 => 31, + 4 | 6 | 9 | 11 => 30, + 2 if is_leap_year(year) => 29, + 2 => 28, + _ => return false, + }; + (1..=days).contains(&day) +} + +const fn is_leap_year(year: u32) -> bool { + year.is_multiple_of(4) && !year.is_multiple_of(100) + || year.is_multiple_of(400) +} diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 152f17d2..e48ec184 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -1,11 +1,16 @@ #![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] +use std::collections::{BTreeMap, BTreeSet}; + use stella_anonymize_core::{ - CountryMatchData, DenyListFilterData, DenyListMatchData, DetectionSource, - DiagnosticEventKind, DiagnosticStage, Error, FuzzySearchOptions, - GazetteerMatchData, LiteralSearchOptions, OperatorConfig, PatternSlice, - PreparedSearch, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, - RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, + AddressSeedData, AmountWordsData, CountryMatchData, CurrencyData, DateData, + DenyListFilterData, DenyListMatchData, DetectionSource, DiagnosticEventKind, + DiagnosticStage, Error, FuzzySearchOptions, GazetteerMatchData, + LegalFormData, LiteralSearchOptions, MagnitudeSuffixData, MonetaryData, + OperatorConfig, PatternSlice, PreparedSearch, PreparedSearchArtifacts, + PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, + RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, TriggerData, + TriggerRule, TriggerStrategy, TriggerValidation, 
WrittenAmountPatternData, }; fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { @@ -22,9 +27,77 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { deny_list_data: None, gazetteer_data: None, country_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + date_data: None, + monetary_data: None, } } +fn legal_form_prepared_search(suffixes: Vec<&str>) -> PreparedSearch { + let suffix_strings = suffixes + .iter() + .map(|suffix| (*suffix).to_owned()) + .collect::>(); + let regex_patterns = suffixes + .into_iter() + .map(|suffix| SearchPattern::Literal(suffix.to_owned())) + .collect::>(); + + PreparedSearch::new(PreparedSearchConfig { + regex_patterns, + regex_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + legal_forms: PatternSlice { + start: 0, + end: u32::try_from(suffix_strings.len()).unwrap(), + }, + ..PreparedSearchSlices::default() + }, + legal_form_data: Some(LegalFormData { + suffixes: suffix_strings, + normalized_boundary_suffixes: vec![ + String::from("as"), + String::from("co"), + String::from("inc"), + String::from("llc"), + String::from("sro"), + ], + normalized_in_name_words: vec![String::from("co")], + normalized_suffix_words: vec![ + String::from("as"), + String::from("co"), + String::from("inc"), + String::from("llc"), + String::from("sro"), + ], + connector_words: vec![ + String::from("&"), + String::from("a"), + String::from("and"), + ], + and_connector_words: vec![String::from("and")], + in_name_prepositions: vec![String::from("of")], + company_suffix_words: vec![String::from("Company")], + sentence_verb_indicators: vec![ + String::from("include"), + String::from("is"), + String::from("podepsaly"), + ], + ..LegalFormData::default() + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap() +} + #[test] fn 
prepared_search_runs_normalized_literal_pass() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -50,6 +123,11 @@ fn prepared_search_runs_normalized_literal_pass() { is_fuzzy: vec![false], }), country_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + date_data: None, + monetary_data: None, }) .unwrap(); @@ -61,6 +139,107 @@ fn prepared_search_runs_normalized_literal_pass() { assert_eq!(result.gazetteer_entities[0].text, "Acme\u{00a0}Corp"); } +#[test] +fn prepared_search_artifacts_match_direct_prepare() { + let config = PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bID\d{3}\b"))], + custom_regex_patterns: vec![], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Acme Corp"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions::default(), + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("identifier", 1.0)], + custom_regex_meta: vec![], + deny_list_data: None, + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![false], + }), + country_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + date_data: None, + monetary_data: None, + }; + let artifacts = PreparedSearch::prepare_artifacts(config.clone()).unwrap(); + assert!( + !artifacts.literals.slots.is_empty(), + "literal index should produce prepared artifacts" + ); + + let direct = PreparedSearch::new(config.clone()).unwrap(); + let prepared = + PreparedSearch::new_with_artifacts(config.clone(), &artifacts).unwrap(); + let text = "Acme\u{00a0}Corp. 
signed ID123"; + + assert_eq!( + prepared.find_matches(text).unwrap(), + direct.find_matches(text).unwrap() + ); + + let mut missing = artifacts; + missing.literals.slots.clear(); + assert!( + PreparedSearch::new_with_artifacts(config, &missing).is_err(), + "missing literal artifacts should fail" + ); +} + +#[test] +fn prepared_search_artifacts_roundtrip_bytes() { + let config = PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bID\d{3}\b"))], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Acme Corp"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("identifier", 1.0)], + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![false], + }), + ..empty_config(PreparedSearchSlices::default()) + }; + let artifacts = PreparedSearch::prepare_artifacts(config.clone()).unwrap(); + let bytes = artifacts.to_bytes().unwrap(); + let decoded = PreparedSearchArtifacts::from_bytes(&bytes).unwrap(); + + assert_eq!(decoded, artifacts); + + let direct = PreparedSearch::new(config.clone()).unwrap(); + let prepared = PreparedSearch::new_with_artifacts(config, &decoded).unwrap(); + assert_eq!( + prepared.find_matches("Acme Corp signed ID123").unwrap(), + direct.find_matches("Acme Corp signed ID123").unwrap() + ); +} + +#[test] +fn prepared_search_artifacts_reject_invalid_bytes() { + let error = PreparedSearchArtifacts::from_bytes(b"not-valid").unwrap_err(); + + assert!( + matches!(error, Error::InvalidStaticData { .. 
}), + "invalid prepared-search artifacts should fail at the format boundary" + ); +} + #[test] fn prepared_search_emits_static_detector_entities() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -111,6 +290,8 @@ fn prepared_search_emits_static_detector_entities() { score: 1.0, source_detail: Some(SourceDetail::CustomRegex), requires_validation: false, + validator_id: None, + validator_input: None, min_byte_length: None, }], deny_list_data: None, @@ -121,6 +302,11 @@ fn prepared_search_emits_static_detector_entities() { country_data: Some(CountryMatchData { labels: vec![String::from("country")], }), + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + date_data: None, + monetary_data: None, }) .unwrap(); @@ -138,6 +324,412 @@ fn prepared_search_emits_static_detector_entities() { assert_eq!(result.country_entities[0].source, DetectionSource::Country); } +#[test] +fn prepared_search_drops_person_spans_ending_in_trailing_noun() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\bCOBRA Reimbursement Period\b", + ))], + regex_options: SearchOptions { + regex: RegexSearchOptions { whole_words: false }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("person", 0.9)], + deny_list_data: Some(DenyListMatchData { + labels: Vec::new().into(), + custom_labels: Vec::new().into(), + originals: Vec::new(), + sources: Vec::new().into(), + filters: Some(DenyListFilterData { + person_trailing_nouns: BTreeSet::from([String::from("period")]), + ..DenyListFilterData::default() + }), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Payments continue during the COBRA Reimbursement Period.", + &OperatorConfig::default(), + ) + .unwrap(); + + 
assert!(result.resolved_entities.is_empty()); +} + +#[test] +fn prepared_search_extracts_dates_from_anchored_data() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + date_data: Some(DateData { + month_names_by_language: BTreeMap::from([ + ( + String::from("en"), + vec![ + String::from("January"), + String::from("March"), + String::from("December"), + ], + ), + ( + String::from("cs"), + vec![String::from("ledna"), String::from("únor")], + ), + ]), + year_words_by_language: BTreeMap::from([( + String::from("cs"), + vec![String::from("roce")], + )]), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities( + "Signed 7 January 2025, renewed March 9, 2026, effective 2026. únor 3., filed 1.ledna 2026 and signed 1. ledna 2026. Ends December 31, \n\n2025. Výpis v roce 2026.", + ) + .unwrap(); + let entities = result + .anchored_entities + .iter() + .map(|entity| (entity.text.as_str(), entity.label.as_str(), entity.source)) + .collect::>(); + + assert_eq!( + entities, + [ + ("7 January 2025", "date", DetectionSource::Regex), + ("March 9, 2026", "date", DetectionSource::Regex), + ("2026. únor 3.", "date", DetectionSource::Regex), + ("ledna 2026", "date", DetectionSource::Regex), + ("1. 
ledna 2026", "date", DetectionSource::Regex), + ("December 31, \n\n2025", "date", DetectionSource::Regex), + ("2026", "date", DetectionSource::Trigger), + ], + ); +} + +#[test] +fn prepared_search_extracts_written_date_of_birth_trigger() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("geboren am"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("geboren am"), + label: String::from("date of birth"), + strategy: TriggerStrategy::NWords { count: 3 }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Herr Müller, geboren am 21. März 1968, ist Geschäftsführer.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "date of birth" + && entity.text == "21. 
März 1968") + ); +} + +#[test] +fn prepared_search_extracts_year_after_duplicate_year_word_noise() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: ["rok", "an", "roce"] + .into_iter() + .map(|pattern| SearchPattern::LiteralWithOptions { + pattern: String::from(pattern), + case_insensitive: Some(true), + whole_words: Some(false), + }) + .collect(), + regex_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 3 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: ["rok", "an", "roce"] + .into_iter() + .map(|trigger| TriggerRule { + trigger: String::from(trigger), + label: String::from("date"), + strategy: TriggerStrategy::NWords { count: 1 }, + validations: vec![TriggerValidation::MatchesPattern { + pattern: String::from(r"^(?:19|20)\d{2}\.?$"), + flags: None, + }], + include_trigger: false, + }) + .collect(), + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let text = "účetní uzávěrku za roky 2019, 2020, 2021, 2022, 2023 a 2024, výpis z valné hromady konané v\u{00a0}roce 2026 a to nejpozději"; + let result = prepared.detect_static_entities(text).unwrap(); + + assert!( + result + .trigger_entities + .iter() + .any(|entity| entity.label == "date" && entity.text == "2026") + ); +} + +#[test] +fn prepared_search_trigger_caps_by_characters_not_bytes() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("ve výši"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: 
Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("ve výši"), + label: String::from("monetary amount"), + strategy: TriggerStrategy::ToNextComma { + stop_words: Vec::new(), + max_length: None, + }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let expected = "0,2 % z Ceny Plnění dle příslušné Dílčí smlouvy za každý i započatý kalendářní den prodlení"; + let result = prepared + .detect_static_entities(&format!("Smluvní pokuta ve výši {expected}.")) + .unwrap(); + + assert!( + result.trigger_entities.iter().any(|entity| entity.label + == "monetary amount" + && entity.text == expected) + ); +} + +#[test] +fn prepared_search_rejects_lowercase_acronym_trigger_collisions() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("dni"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("DNI"), + label: String::from("national identification number"), + strategy: TriggerStrategy::CompanyIdValue, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let lower = prepared + .detect_static_entities("Cena je stanovena ke dni 6.11.2025.") + .unwrap(); + assert!(lower.trigger_entities.is_empty()); + + let upper = prepared + .detect_static_entities("Documento DNI 12345678Z.") + .unwrap(); + assert!( + upper + .trigger_entities + .iter() + .any(|entity| entity.text == 
"12345678Z" + && entity.label == "national identification number") + ); +} + +#[test] +fn prepared_search_trims_party_position_before_triggered_address() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("sídlo"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("sídlo"), + label: String::from("address"), + strategy: TriggerStrategy::Address { + max_chars: Some(120), + }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: vec![String::from("prodávajícího")], + legal_form_suffixes: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities( + "Místem předání je sídlo prodávajícího Na Květnici 1657/16, 140 00 Praha 4.", + ) + .unwrap(); + + assert!( + result + .trigger_entities + .iter() + .any(|entity| entity.label == "address" + && entity.text == "Na Květnici 1657/16, 140 00 Praha 4") + ); +} + +#[test] +fn prepared_search_extracts_money_from_anchored_data() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + monetary_data: Some(MonetaryData { + currencies: CurrencyData { + codes: vec![String::from("USD"), String::from("EUR")], + symbols: vec![String::from("$")], + local_names: vec![String::from("Kč"), String::from("korun českých")], + }, + amount_words: AmountWordsData { + written_amount_patterns: vec![], + magnitude_suffixes: vec![MagnitudeSuffixData { + words: vec![String::from("million")], + abbreviations_case_insensitive: vec![], + abbreviations_case_sensitive: vec![], + }], + share_quantity_terms: vec![], + }, + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + 
let result = prepared + .detect_static_entities( + "Fees are USD 1,250,000.00, $450,000, 25 million EUR and 275 000 Kč.", + ) + .unwrap(); + let entities = result + .anchored_entities + .iter() + .map(|entity| entity.text.as_str()) + .collect::>(); + + assert_eq!( + entities, + [ + "USD 1,250,000.00", + "$450,000", + "25 million EUR", + "275 000 Kč", + ], + ); +} + +#[test] +fn prepared_search_extends_money_to_written_amount_parenthetical() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + monetary_data: Some(MonetaryData { + currencies: CurrencyData { + codes: vec![], + symbols: vec![], + local_names: vec![String::from("Kč")], + }, + amount_words: AmountWordsData { + written_amount_patterns: vec![WrittenAmountPatternData { + keywords: vec![String::from("slovy")], + }], + magnitude_suffixes: vec![], + share_quantity_terms: vec![], + }, + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities( + "Smluvní pokuta je 50.000,- Kč (slovy: padesát tisíc korun českých).", + ) + .unwrap(); + let entities = result + .anchored_entities + .iter() + .map(|entity| entity.text.as_str()) + .collect::>(); + + assert_eq!( + entities, + ["50.000,- Kč (slovy: padesát tisíc korun českých)"], + ); +} + #[test] fn prepared_search_redacts_static_entities_end_to_end() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -185,6 +777,11 @@ fn prepared_search_redacts_static_entities_end_to_end() { country_data: Some(CountryMatchData { labels: vec![String::from("country")], }), + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + date_data: None, + monetary_data: None, }) .unwrap(); @@ -240,6 +837,11 @@ fn prepared_search_reports_static_redaction_diagnostics() { is_fuzzy: vec![false], }), country_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + date_data: None, + monetary_data: None, }) .unwrap(); @@ -298,14 +900,19 @@ fn 
prepared_search_redacts_custom_deny_list_entities() { regex_meta: vec![], custom_regex_meta: vec![], deny_list_data: Some(DenyListMatchData { - labels: vec![vec![String::from("matter")]], - custom_labels: vec![vec![String::from("matter")]], + labels: vec![vec![String::from("matter")]].into(), + custom_labels: vec![vec![String::from("matter")]].into(), originals: vec![String::from("Secret Code")], - sources: vec![vec![String::from("custom-deny-list")]], + sources: vec![vec![String::from("custom-deny-list")]].into(), filters: None, }), gazetteer_data: None, country_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + date_data: None, + monetary_data: None, }) .unwrap(); @@ -324,52 +931,315 @@ fn prepared_search_redacts_custom_deny_list_entities() { #[test] fn prepared_search_rejects_unsupported_static_slices() { let unsupported = PatternSlice { start: 0, end: 1 }; - let cases = [ - ( - "legal_forms", - PreparedSearchSlices { - legal_forms: unsupported, - ..PreparedSearchSlices::default() - }, - ), - ( - "triggers", - PreparedSearchSlices { - triggers: unsupported, - ..PreparedSearchSlices::default() - }, - ), - ( - "deny_list", - PreparedSearchSlices { - deny_list: unsupported, - ..PreparedSearchSlices::default() - }, - ), - ( - "street_types", - PreparedSearchSlices { - street_types: unsupported, - ..PreparedSearchSlices::default() - }, - ), - ]; - - for (slice, slices) in cases { - let error = PreparedSearch::new(empty_config(slices)) - .err() - .expect("unsupported slice should be rejected"); + let error = PreparedSearch::new(empty_config(PreparedSearchSlices { + deny_list: unsupported, + ..PreparedSearchSlices::default() + })) + .err() + .expect("unsupported slice should be rejected"); - assert_eq!(error, Error::UnsupportedStaticSlice { slice }); - } + assert_eq!(error, Error::UnsupportedStaticSlice { slice: "deny_list" }); } #[test] -fn prepared_search_redacts_curated_deny_list_entities() { - let prepared = 
PreparedSearch::new(PreparedSearchConfig { - literal_patterns: vec![SearchPattern::LiteralWithOptions { - pattern: String::from("Prague"), - case_insensitive: Some(true), +fn prepared_search_requires_address_seed_data_for_street_types() { + let error = PreparedSearch::new(empty_config(PreparedSearchSlices { + street_types: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + })) + .err() + .expect("street types should require address seed data"); + + assert_eq!( + error, + Error::MissingStaticData { + field: "address_seed_data" + } + ); +} + +#[test] +fn prepared_search_expands_address_seeds_from_street_type_slice() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("Boston"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Street"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + street_types: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Boston")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Send notices to 100 Main Street, Boston, MA 02101-1234.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == 
"address" + && entity.text == "100 Main Street, Boston, MA 02101-1234") + ); +} + +#[test] +fn prepared_search_expands_address_seeds_from_city_and_postal_code() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Brno"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Brno")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Sídlo společnosti je Kamínky 302/16, Brno 634 00.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "address" + && entity.text == "Kamínky 302/16, Brno 634 00") + ); +} + +#[test] +fn prepared_search_expands_compound_german_street_addresses() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Wiesbaden"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + 
labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Düsseldorf")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "wohnhaft Schadowstraße 11, 40212 Düsseldorf.", + &OperatorConfig::default(), + ) + .unwrap(); + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "address" + && entity.text == "Schadowstraße 11, 40212 Düsseldorf") + ); +} + +#[test] +fn prepared_search_expands_plain_postal_city_addresses() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("geboren am"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Düsseldorf"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("geboren am"), + label: String::from("date of birth"), + strategy: TriggerStrategy::NWords { count: 3 }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + }), + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: 
vec![String::from("Wiesbaden")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "(2) Frau Karoline M. Brentano,\n geboren am 09. Juli 1982,\n wohnhaft Bismarckring 18, 65183 Wiesbaden,\n Steuer-ID: 78 123 456 789", + &OperatorConfig::default(), + ) + .unwrap(); + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "address" + && entity.text == "Bismarckring 18, 65183 Wiesbaden") + ); +} + +#[test] +fn prepared_search_stops_address_before_notice_copy_instruction() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("Wilmington"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Street"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + street_types: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Wilmington")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData { + boundary_words: vec![String::from("with a copy")], + br_cep_cue_words: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "1209 Orange 
Street, Wilmington, DE 19801; with a copy to general counsel.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "address" + && entity.text == "1209 Orange Street, Wilmington, DE 19801") + ); + assert!( + result + .resolved_entities + .iter() + .all(|entity| !entity.text.contains("with a copy")) + ); +} + +#[test] +fn prepared_search_splits_address_seed_clusters_at_paragraph_breaks() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Brno"), + case_insensitive: Some(true), whole_words: Some(true), }], literal_options: SearchOptions { @@ -379,11 +1249,175 @@ fn prepared_search_redacts_curated_deny_list_entities() { }, ..SearchOptions::default() }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, deny_list_data: Some(DenyListMatchData { - labels: vec![vec![String::from("address")]], - custom_labels: vec![vec![]], + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Brno")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Kamínky 5, Brno 634 00\n\nIČ: 48511229\n\nKamínky 302/16, Brno 634 00.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "address" + && entity.text == "Kamínky 302/16, Brno 634 00") + ); +} + +#[test] +fn prepared_search_stops_address_seed_expansion_at_legal_prose() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: 
String::from("Liberec"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Liberec")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData { + boundary_words: vec![String::from("pokud")], + br_cep_cue_words: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Fakturu zašlete na Náspu 5, 460 01 Liberec, pokud nebude dohodnuto jinak. Přílohou bude seznam.", + &OperatorConfig::default(), + ) + .unwrap(); + + let addresses = result + .resolved_entities + .iter() + .filter(|entity| entity.label == "address") + .map(|entity| entity.text.as_str()) + .collect::>(); + assert!(addresses.contains(&"Náspu 5, 460 01 Liberec")); + assert!(!addresses.iter().any(|text| text.contains("pokud"))); + assert!(!addresses.iter().any(|text| text.contains("Přílohou"))); +} + +#[test] +fn prepared_search_does_not_cluster_address_seed_inside_register_span() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"Handelsregister des Amtsgerichts Düsseldorf unter HRB \d+", + ))], + regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], + regex_options: SearchOptions { + regex: RegexSearchOptions { whole_words: false }, + ..SearchOptions::default() + }, + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Düsseldorf"), + case_insensitive: Some(true), 
+ whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Düsseldorf")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData { + boundary_words: vec![String::from("eingetragen")], + br_cep_cue_words: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Sitz: Königsallee 27, 40212 Düsseldorf,\n eingetragen im Handelsregister des Amtsgerichts Düsseldorf unter HRB 78219.", + &OperatorConfig::default(), + ) + .unwrap(); + + let addresses = result + .resolved_entities + .iter() + .filter(|entity| entity.label == "address") + .map(|entity| entity.text.as_str()) + .collect::>(); + assert!(addresses.contains(&"Königsallee 27, 40212 Düsseldorf")); + assert!(!addresses.iter().any(|text| text.contains("Sitz:"))); + assert!( + !addresses + .iter() + .any(|text| text.contains("Handelsregister")) + ); +} + +#[test] +fn prepared_search_redacts_curated_deny_list_entities() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Prague"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + 
custom_labels: vec![vec![]].into(), originals: vec![String::from("Prague")], - sources: vec![vec![String::from("city")]], + sources: vec![vec![String::from("city")]].into(), filters: Some(DenyListFilterData::default()), }), ..empty_config(PreparedSearchSlices { @@ -404,10 +1438,10 @@ fn prepared_search_redacts_curated_deny_list_entities() { fn prepared_search_rejects_curated_deny_list_without_filters() { let error = PreparedSearch::new(PreparedSearchConfig { deny_list_data: Some(DenyListMatchData { - labels: vec![vec![String::from("address")]], - custom_labels: vec![vec![]], + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), originals: vec![String::from("Prague")], - sources: vec![vec![String::from("city")]], + sources: vec![vec![String::from("city")]].into(), filters: None, }), ..empty_config(PreparedSearchSlices { @@ -430,10 +1464,10 @@ fn prepared_search_rejects_curated_deny_list_without_filters() { fn prepared_search_rejects_truncated_deny_list_data() { let error = PreparedSearch::new(PreparedSearchConfig { deny_list_data: Some(DenyListMatchData { - labels: vec![vec![String::from("matter")]], - custom_labels: vec![], + labels: vec![vec![String::from("matter")]].into(), + custom_labels: vec![].into(), originals: vec![String::from("Secret Code")], - sources: vec![vec![String::from("custom-deny-list")]], + sources: vec![vec![String::from("custom-deny-list")]].into(), filters: None, }), ..empty_config(PreparedSearchSlices { @@ -453,3 +1487,65 @@ fn prepared_search_rejects_truncated_deny_list_data() { } ); } + +#[test] +fn prepared_search_detects_non_english_legal_form_entities() { + let prepared = legal_form_prepared_search(vec!["a.s.", "a. s."]); + + let result = prepared + .detect_static_entities("Smlouvu podepsaly Pražské služby, a.s. 
dnes.") + .unwrap(); + + assert_eq!(result.legal_form_entities.len(), 1); + assert_eq!(result.legal_form_entities[0].text, "Pražské služby, a.s."); + assert_eq!( + result.legal_form_entities[0].source, + DetectionSource::LegalForm + ); +} + +#[test] +fn prepared_search_keeps_indented_line_wrapped_legal_form_suffix() { + let prepared = legal_form_prepared_search(vec!["Co.", "LLC"]); + + let result = prepared + .detect_static_entities( + "The underwriter is Goldman Sachs & Co.\n LLC, joint book-runner.", + ) + .unwrap(); + + assert_eq!(result.legal_form_entities.len(), 1); + assert_eq!( + result.legal_form_entities[0].text, + "Goldman Sachs & Co.\n LLC" + ); +} + +#[test] +fn prepared_search_splits_embedded_legal_form_lists() { + let prepared = legal_form_prepared_search(vec!["LLC", "Inc."]); + + let result = prepared + .detect_static_entities( + "The parties include Acme LLC, Beta Inc. and others.", + ) + .unwrap(); + let texts = result + .legal_form_entities + .iter() + .map(|entity| entity.text.as_str()) + .collect::>(); + + assert_eq!(texts, vec!["Acme LLC", "Beta Inc."]); +} + +#[test] +fn prepared_search_rejects_dotted_citation_legal_form_substrings() { + let prepared = legal_form_prepared_search(vec!["S.C."]); + + let result = prepared + .detect_static_entities("See 18 U.S.C. 
Section 1833(b) for civil immunity.") + .unwrap(); + + assert!(result.legal_form_entities.is_empty()); +} diff --git a/crates/anonymize-core/tests/processors.rs b/crates/anonymize-core/tests/processors.rs index 560a2e1c..58701793 100644 --- a/crates/anonymize-core/tests/processors.rs +++ b/crates/anonymize-core/tests/processors.rs @@ -3,7 +3,7 @@ use stella_anonymize_core::{ CountryMatchData, DenyListFilterData, DenyListMatchData, DetectionSource, Error, GazetteerMatchData, PatternSlice, PipelineEntity, RegexMatchMeta, - SearchMatch, SourceDetail, process_country_matches, + SearchMatch, SigningPlaceGuardData, SourceDetail, process_country_matches, process_deny_list_matches, process_gazetteer_matches, process_regex_matches, }; @@ -33,6 +33,8 @@ fn regex_processor_filters_slice_and_short_matches_by_meta() { score: 0.8, source_detail: None, requires_validation: false, + validator_id: None, + validator_input: None, min_byte_length: Some(7), }, ]; @@ -70,6 +72,8 @@ fn regex_processor_rejects_unported_validators() { score: 0.9, source_detail: None, requires_validation: true, + validator_id: None, + validator_input: None, min_byte_length: None, }]; @@ -87,6 +91,71 @@ fn regex_processor_rejects_unported_validators() { ); } +#[test] +fn regex_processor_applies_native_validator_ids() { + let matches = vec![ + SearchMatch::Regex { + pattern: 3, + start: 4, + end: 14, + }, + SearchMatch::Regex { + pattern: 3, + start: 19, + end: 29, + }, + ]; + let meta = vec![RegexMatchMeta { + label: String::from("tax identification number"), + score: 0.9, + source_detail: None, + requires_validation: true, + validator_id: Some(String::from("us.ein")), + validator_input: None, + min_byte_length: None, + }]; + + let entities = process_regex_matches( + &matches, + PatternSlice { start: 3, end: 4 }, + "EIN 87-2451993 bad 00-2451993", + &meta, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "87-2451993"); +} + +#[test] +fn 
regex_processor_applies_validator_input_kind() { + let matches = vec![SearchMatch::Regex { + pattern: 0, + start: 0, + end: 24, + }]; + let meta = vec![RegexMatchMeta { + label: String::from("national identification number"), + score: 0.95, + source_detail: None, + requires_validation: true, + validator_id: Some(String::from("gb.nhs")), + validator_input: Some(String::from("digits-only")), + min_byte_length: None, + }]; + + let entities = process_regex_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + "NHS number: 401 023 2137", + &meta, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "NHS number: 401 023 2137"); +} + #[test] fn regex_processor_preserves_custom_regex_source_detail() { let matches = vec![SearchMatch::Regex { @@ -99,6 +168,8 @@ fn regex_processor_preserves_custom_regex_source_detail() { score: 0.7, source_detail: Some(SourceDetail::CustomRegex), requires_validation: false, + validator_id: None, + validator_input: None, min_byte_length: None, }]; @@ -121,10 +192,10 @@ fn deny_list_processor_emits_custom_labels() { end: 11, }]; let data = DenyListMatchData { - labels: vec![vec![String::from("matter")]], - custom_labels: vec![vec![String::from("matter")]], + labels: vec![vec![String::from("matter")]].into(), + custom_labels: vec![vec![String::from("matter")]].into(), originals: vec![String::from("Secret Code")], - sources: vec![vec![String::from("custom-deny-list")]], + sources: vec![vec![String::from("custom-deny-list")]].into(), filters: None, }; @@ -160,10 +231,10 @@ fn deny_list_processor_rejects_embedded_custom_word_matches() { }, ]; let data = DenyListMatchData { - labels: vec![vec![String::from("matter")]], - custom_labels: vec![vec![String::from("matter")]], + labels: vec![vec![String::from("matter")]].into(), + custom_labels: vec![vec![String::from("matter")]].into(), originals: vec![String::from("Secret")], - sources: vec![vec![String::from("custom-deny-list")]], + sources: 
vec![vec![String::from("custom-deny-list")]].into(), filters: None, }; @@ -187,10 +258,10 @@ fn deny_list_processor_emits_curated_non_person_labels() { end: 6, }]; let data = DenyListMatchData { - labels: vec![vec![String::from("address")]], - custom_labels: vec![vec![]], + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), originals: vec![String::from("Prague")], - sources: vec![vec![String::from("city")]], + sources: vec![vec![String::from("city")]].into(), filters: Some(DenyListFilterData::default()), }; @@ -207,6 +278,120 @@ fn deny_list_processor_emits_curated_non_person_labels() { assert_eq!(entities[0].source_detail, None); } +#[test] +fn deny_list_processor_suppresses_signing_place_address() { + let text = "Podepsano V Brně dne 1. ledna 2026."; + let start = u32::try_from(text.find("Brně").unwrap()).unwrap(); + let end = start.saturating_add(u32::try_from("Brně".len()).unwrap()); + let matches = vec![SearchMatch::Literal { + pattern: 0, + start, + end, + }]; + let data = DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Brně")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData { + signing_place_guards: vec![SigningPlaceGuardData { + prefix_phrases: [String::from("v"), String::from("ve")].into(), + suffix_phrases: [String::from("dne")].into(), + }], + ..DenyListFilterData::default() + }), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + text, + &data, + ) + .unwrap(); + + assert!(entities.is_empty()); +} + +#[test] +fn deny_list_processor_keeps_real_address_city() { + let text = "Sidlo: Ulice 12, Brně 602 00."; + let start = u32::try_from(text.find("Brně").unwrap()).unwrap(); + let end = start.saturating_add(u32::try_from("Brně".len()).unwrap()); + let matches = vec![SearchMatch::Literal { + pattern: 0, + start, + end, + }]; + let data = 
DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Brně")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData { + address_stopwords: [String::from("brně")].into(), + signing_place_guards: vec![SigningPlaceGuardData { + prefix_phrases: [String::from("v"), String::from("ve")].into(), + suffix_phrases: [String::from("dne")].into(), + }], + ..DenyListFilterData::default() + }), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + text, + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Brně"); +} + +#[test] +fn deny_list_processor_keeps_address_when_signing_guards_do_not_pair() { + let text = "Company is incorporated in Delaware."; + let start = u32::try_from(text.find("Delaware").unwrap()).unwrap(); + let end = start.saturating_add(u32::try_from("Delaware".len()).unwrap()); + let matches = vec![SearchMatch::Literal { + pattern: 0, + start, + end, + }]; + let data = DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Delaware")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData { + signing_place_guards: vec![ + SigningPlaceGuardData { + prefix_phrases: [String::new()].into(), + suffix_phrases: [String::from("den")].into(), + }, + SigningPlaceGuardData { + prefix_phrases: [String::from("signed in")].into(), + suffix_phrases: [String::new()].into(), + }, + ], + ..DenyListFilterData::default() + }), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + text, + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Delaware"); +} + #[test] fn deny_list_processor_rejects_curated_sources_without_filters() { let matches = vec![SearchMatch::Literal { 
@@ -215,10 +400,10 @@ fn deny_list_processor_rejects_curated_sources_without_filters() { end: 6, }]; let data = DenyListMatchData { - labels: vec![vec![String::from("address")]], - custom_labels: vec![vec![]], + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), originals: vec![String::from("Prague")], - sources: vec![vec![String::from("city")]], + sources: vec![vec![String::from("city")]].into(), filters: None, }; diff --git a/crates/anonymize-core/tests/resolution.rs b/crates/anonymize-core/tests/resolution.rs index 47a3f28e..85b1e885 100644 --- a/crates/anonymize-core/tests/resolution.rs +++ b/crates/anonymize-core/tests/resolution.rs @@ -95,6 +95,79 @@ fn same_priority_uses_score_then_length() { assert_eq!(longer.first().expect("result").end, 10); } +#[test] +fn structured_regex_span_beats_inner_trigger_fragment() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Regex, 0.95, 0, 22, "registration number"), + entity(DetectionSource::Trigger, 0.95, 7, 8, "registration number"), + ]); + + assert_eq!(result.len(), 1); + let kept = result.first().expect("result"); + assert_eq!(kept.source, DetectionSource::Regex); + assert_eq!(kept.start, 0); + assert_eq!(kept.end, 22); +} + +#[test] +fn structured_regex_span_beats_trigger_fragment_with_trailing_punctuation() { + let regex_text = "oddíl C, vložka 240118"; + let trigger_start = byte_len("oddíl C, vložka "); + let trigger_text = "240118,"; + let result = merge_and_dedup(&[ + PipelineEntity::detected( + 0, + byte_len(regex_text), + "registration number", + regex_text, + 0.95, + DetectionSource::Regex, + ), + PipelineEntity::detected( + trigger_start, + trigger_start + byte_len(trigger_text), + "registration number", + trigger_text, + 0.95, + DetectionSource::Trigger, + ), + ]); + + assert_eq!(result.len(), 1); + let kept = result.first().expect("result"); + assert_eq!(kept.source, DetectionSource::Regex); + assert_eq!(kept.start, 0); + assert_eq!(kept.end, 
byte_len(regex_text)); +} + +#[test] +fn person_regex_span_beats_inner_name_fragment() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Regex, 0.9, 0, 21, "person"), + entity(DetectionSource::Trigger, 0.95, 5, 21, "person"), + ]); + + assert_eq!(result.len(), 1); + let kept = result.first().expect("result"); + assert_eq!(kept.source, DetectionSource::Regex); + assert_eq!(kept.start, 0); + assert_eq!(kept.end, 21); +} + +#[test] +fn address_trigger_still_beats_inner_address_regex() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Trigger, 0.95, 0, 30, "address"), + entity(DetectionSource::Regex, 0.9, 10, 20, "address"), + ]); + + assert_eq!(result.len(), 1); + let kept = result.first().expect("result"); + assert_eq!(kept.source, DetectionSource::Trigger); + assert_eq!(kept.start, 0); + assert_eq!(kept.end, 30); +} + #[test] fn identical_spans_with_different_labels_are_kept() { let result = merge_and_dedup(&[ @@ -204,6 +277,47 @@ fn sanitize_keeps_known_period_suffixes_from_data() { ); } +#[test] +fn sanitize_preserves_single_non_breaking_space() { + let result = sanitize_entities(&[ + text_entity( + "Městským soudem v\u{00a0}Praze", + "organization", + DetectionSource::Trigger, + ), + text_entity("Acme\n Corp", "organization", DetectionSource::Trigger), + ]); + + assert_eq!( + result + .iter() + .map(|entry| entry.text.as_str()) + .collect::>(), + vec!["Městským soudem v\u{00a0}Praze", "Acme Corp"] + ); +} + +#[test] +fn sanitize_keeps_legal_form_periods_from_legal_form_source() { + let result = sanitize_entities(&[ + text_entity("Acme INC.", "organization", DetectionSource::LegalForm), + text_entity("Eagles z.s.", "organization", DetectionSource::LegalForm), + text_entity( + "Národní agentura s. p.", + "organization", + DetectionSource::LegalForm, + ), + ]); + + assert_eq!( + result + .iter() + .map(|entry| entry.text.as_str()) + .collect::>(), + vec!["Acme INC.", "Eagles z.s.", "Národní agentura s. 
p."] + ); +} + #[test] fn sanitize_drops_empty_entities() { let result = sanitize_entities(&[text_entity( diff --git a/crates/anonymize-core/tests/search.rs b/crates/anonymize-core/tests/search.rs index ddfacc00..defb4edf 100644 --- a/crates/anonymize-core/tests/search.rs +++ b/crates/anonymize-core/tests/search.rs @@ -1,8 +1,8 @@ #![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] use stella_anonymize_core::{ - FuzzySearchOptions, LiteralSearchOptions, RegexSearchOptions, SearchIndex, - SearchMatch, SearchOptions, SearchPattern, + Error, FuzzySearchOptions, LiteralSearchOptions, RegexSearchOptions, + SearchIndex, SearchIndexArtifacts, SearchMatch, SearchOptions, SearchPattern, }; #[test] @@ -223,3 +223,96 @@ fn search_index_reports_match_presence_across_engines() { assert!(index.is_match("Case 2026").unwrap()); assert!(!index.is_match("No hit").unwrap()); } + +#[test] +fn search_index_prepared_artifacts_match_direct_index() { + let patterns = vec![ + SearchPattern::Literal(String::from("Alice")), + SearchPattern::Regex(String::from(r"\b[A-Z]{2}\d{4}\b")), + SearchPattern::Fuzzy { + pattern: String::from("Muller"), + distance: Some(1), + }, + ]; + let options = SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: true, + }, + regex: RegexSearchOptions { whole_words: false }, + fuzzy: FuzzySearchOptions { + case_insensitive: true, + whole_words: true, + normalize_diacritics: false, + }, + }; + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + assert!( + !artifacts.slots.is_empty(), + "prepared search index should record text-search slot artifacts" + ); + let direct = SearchIndex::new(patterns.clone(), options).unwrap(); + let prepared = + SearchIndex::new_with_artifacts(patterns, options, &artifacts).unwrap(); + let haystack = "Alice signed AB1234. 
Later, Muler countersigned."; + + assert_eq!( + prepared.find_iter(haystack).unwrap(), + direct.find_iter(haystack).unwrap() + ); + assert_eq!(prepared.is_match(haystack), direct.is_match(haystack)); +} + +#[test] +fn search_index_prepared_artifacts_roundtrip_bytes() { + let patterns = vec![ + SearchPattern::Literal(String::from("Alice")), + SearchPattern::Literal(String::from("Bob")), + ]; + let options = SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: true, + }, + ..SearchOptions::default() + }; + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + let bytes = artifacts.to_bytes().unwrap(); + let decoded = SearchIndexArtifacts::from_bytes(&bytes).unwrap(); + + assert_eq!(decoded, artifacts); + + let direct = SearchIndex::new(patterns.clone(), options).unwrap(); + let prepared = + SearchIndex::new_with_artifacts(patterns, options, &decoded).unwrap(); + assert_eq!( + prepared.find_iter("Alice and Bob").unwrap(), + direct.find_iter("Alice and Bob").unwrap() + ); +} + +#[test] +fn search_index_prepared_artifacts_reject_invalid_bytes() { + let error = SearchIndexArtifacts::from_bytes(b"not-valid").unwrap_err(); + + assert!( + matches!(error, Error::InvalidStaticData { .. 
}), + "invalid artifact bytes should fail at the format boundary" + ); +} + +#[test] +fn search_index_prepared_artifacts_reject_wrong_slot_count() { + let patterns = vec![SearchPattern::Literal(String::from("Alice"))]; + let options = SearchOptions::default(); + let mut artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + artifacts.slots.clear(); + + assert!( + SearchIndex::new_with_artifacts(patterns, options, &artifacts).is_err(), + "missing prepared slot artifacts should fail" + ); +} diff --git a/crates/anonymize-napi/Cargo.toml b/crates/anonymize-napi/Cargo.toml index 0964fdf7..27d5cd8c 100644 --- a/crates/anonymize-napi/Cargo.toml +++ b/crates/anonymize-napi/Cargo.toml @@ -11,6 +11,7 @@ repository.workspace = true crate-type = ["cdylib"] [dependencies] +blake3 = "1" napi = { version = "3", default-features = false, features = [ "napi9", "serde-json", diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index 51736589..b2efe6c8 100644 --- a/crates/anonymize-napi/src/lib.rs +++ b/crates/anonymize-napi/src/lib.rs @@ -1,19 +1,67 @@ -use std::collections::BTreeMap; +use std::{ + collections::{BTreeMap, VecDeque}, + sync::{Arc, LazyLock, Mutex}, + time::Instant, +}; use napi::bindgen_prelude::*; use napi_derive::napi; use stella_anonymize_adapter_contract::{ - BindingCountryMatchData, BindingDenyListFilterData, BindingDenyListMatchData, - BindingGazetteerMatchData, BindingOperatorConfig, BindingOperatorEntry, - BindingPatternSlice, BindingPreparedSearchConfig, - BindingPreparedSearchSlices, BindingRedactionResult, BindingRegexMatchMeta, - BindingSearchOptions, BindingSearchPattern, BindingStaticRedactionResult, - ContractError, operator_config_from_binding, - prepared_search_config_from_binding, + BindingOperatorConfig, BindingOperatorEntry, BindingPreparedSearchConfig, + BindingRedactionResult, BindingStaticRedactionResult, ContractError, + operator_config_from_binding, prepared_search_config_from_binding, 
+ prepared_search_package_digest, prepared_search_package_from_bytes, + prepared_search_package_to_bytes, + prepared_search_package_to_compressed_bytes, static_redaction_diagnostic_result_to_binding, static_redaction_diagnostics_to_binding, static_redaction_result_to_binding, }; -use stella_anonymize_core::{PreparedSearch, StaticRedactionDiagnostics}; +use stella_anonymize_core::{ + DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, PreparedSearch, + PreparedSearchArtifacts, StaticRedactionDiagnostics, +}; + +const PREPARED_SEARCH_CACHE_LIMIT: usize = 8; + +static PREPARED_SEARCH_CACHE: LazyLock> = + LazyLock::new(|| Mutex::new(PreparedSearchCache::new())); + +struct PreparedSearchCache { + entries: BTreeMap<[u8; 32], Arc>, + order: VecDeque<[u8; 32]>, +} + +impl PreparedSearchCache { + const fn new() -> Self { + Self { + entries: BTreeMap::new(), + order: VecDeque::new(), + } + } + + fn get(&mut self, key: &[u8; 32]) -> Option> { + let entry = self.entries.get(key).cloned()?; + self.retain_order_without(key); + self.order.push_back(*key); + Some(entry) + } + + fn insert(&mut self, key: [u8; 32], value: Arc) { + self.entries.insert(key, value); + self.retain_order_without(&key); + self.order.push_back(key); + + while self.order.len() > PREPARED_SEARCH_CACHE_LIMIT { + if let Some(evicted) = self.order.pop_front() { + self.entries.remove(&evicted); + } + } + } + + fn retain_order_without(&mut self, key: &[u8; 32]) { + self.order.retain(|entry| entry != key); + } +} #[napi(object)] pub struct JsSearchPattern { @@ -62,6 +110,8 @@ pub struct JsRegexMatchMeta { pub score: f64, pub source_detail: Option, pub requires_validation: Option, + pub validator_id: Option, + pub validator_input: Option, pub min_byte_length: Option, } @@ -90,13 +140,22 @@ pub struct JsDenyListFilterData { pub stopwords: Vec, pub allow_list: Vec, pub person_stopwords: Vec, + pub person_trailing_nouns: Vec, pub address_stopwords: Vec, + pub address_jurisdiction_prefixes: Vec, pub street_types: 
Vec, pub first_names: Vec, pub generic_roles: Vec, pub sentence_starters: Vec, pub trailing_address_word_exclusions: Vec, pub defined_term_cues: Vec, + pub signing_place_guards: Vec, +} + +#[napi(object)] +pub struct JsSigningPlaceGuardData { + pub prefix_phrases: Vec, + pub suffix_phrases: Vec, } #[napi(object)] @@ -233,23 +292,237 @@ pub fn redact_static_entities_diagnostics_json( serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) } +#[napi(js_name = "prepareStaticSearchArtifactsBytes")] +#[allow(clippy::needless_pass_by_value)] +pub fn prepare_static_search_artifacts_bytes( + config_json: BufferSlice<'_>, +) -> Result { + let config = + serde_json::from_slice::(config_json.as_ref()) + .map_err(|error| to_napi_serde_error(&error))?; + let config = prepared_search_config_from_binding(config) + .map_err(|error| to_napi_contract_error(&error))?; + PreparedSearch::prepare_artifacts(config) + .and_then(|artifacts| artifacts.to_bytes()) + .map(Buffer::from) + .map_err(|error| to_napi_core_error(&error)) +} + +#[napi(js_name = "prepareStaticSearchPackageBytes")] +#[allow(clippy::needless_pass_by_value)] +pub fn prepare_static_search_package_bytes( + config_json: BufferSlice<'_>, +) -> Result { + prepare_static_search_package_bytes_with(config_json.as_ref(), false) +} + +#[napi(js_name = "prepareStaticSearchCompressedPackageBytes")] +#[allow(clippy::needless_pass_by_value)] +pub fn prepare_static_search_compressed_package_bytes( + config_json: BufferSlice<'_>, +) -> Result { + prepare_static_search_package_bytes_with(config_json.as_ref(), true) +} + +fn prepare_static_search_package_bytes_with( + config_json: &[u8], + compressed: bool, +) -> Result { + let binding_config = + serde_json::from_slice::(config_json) + .map_err(|error| to_napi_serde_error(&error))?; + let core_config = prepared_search_config_from_binding(binding_config.clone()) + .map_err(|error| to_napi_contract_error(&error))?; + let artifacts = 
PreparedSearch::prepare_artifacts(core_config) + .and_then(|artifacts| artifacts.to_bytes()) + .map_err(|error| to_napi_core_error(&error))?; + let package = if compressed { + prepared_search_package_to_compressed_bytes(&binding_config, &artifacts) + } else { + prepared_search_package_to_bytes(&binding_config, &artifacts) + }; + package + .map(Buffer::from) + .map_err(|error| to_napi_contract_error(&error)) +} + #[napi] pub struct NativePreparedSearch { - inner: PreparedSearch, + inner: Arc, prepare_diagnostics: StaticRedactionDiagnostics, } #[napi] impl NativePreparedSearch { #[napi(constructor)] - pub fn new(config: JsPreparedSearchConfig) -> Result { - let config = prepared_search_config_from_binding(to_binding_config(config)) + pub fn new(config_json: String) -> Result { + let config_bytes = config_json.into_bytes(); + Self::from_config_bytes(&config_bytes, None) + } + + #[napi(factory)] + #[allow(clippy::needless_pass_by_value)] + pub fn from_config_json_bytes(config_json: BufferSlice<'_>) -> Result { + Self::from_config_bytes(config_json.as_ref(), None) + } + + #[napi(factory)] + #[allow(clippy::needless_pass_by_value)] + pub fn from_config_json_and_artifact_bytes( + config_json: BufferSlice<'_>, + artifact_bytes: BufferSlice<'_>, + ) -> Result { + Self::from_config_bytes(config_json.as_ref(), Some(artifact_bytes.as_ref())) + } + + #[napi(factory)] + #[allow(clippy::needless_pass_by_value)] + pub fn from_prepared_package_bytes( + package_bytes: BufferSlice<'_>, + ) -> Result { + Self::from_package_bytes(package_bytes.as_ref()) + } + + fn from_config_bytes( + config_bytes: &[u8], + artifact_bytes: Option<&[u8]>, + ) -> Result { + let input_bytes_len = config_bytes + .len() + .saturating_add(artifact_bytes.map_or(0, <[u8]>::len)); + let cache_key = prepared_search_cache_key(config_bytes, artifact_bytes); + let cache_start = Instant::now(); + if let Some(inner) = prepared_search_cache_get(&cache_key) { + return Ok(Self { + inner, + prepare_diagnostics: 
StaticRedactionDiagnostics { + events: vec![stage_event( + DiagnosticStage::PrepareCacheHit, + Some(1), + Some(elapsed_us(cache_start)), + Some(input_bytes_len), + )], + }, + }); + } + + let cache_elapsed = elapsed_us(cache_start); + let parse_start = Instant::now(); + let config = + serde_json::from_slice::(config_bytes) + .map_err(|error| to_napi_serde_error(&error))?; + let parse_elapsed = elapsed_us(parse_start); + Self::from_binding_config( + config, + artifact_bytes, + input_bytes_len, + cache_key, + cache_elapsed, + parse_elapsed, + ) + } + + fn from_package_bytes(package_bytes: &[u8]) -> Result { + let input_bytes_len = package_bytes.len(); + let cache_key = prepared_search_package_cache_key(package_bytes)?; + let cache_start = Instant::now(); + if let Some(inner) = prepared_search_cache_get(&cache_key) { + return Ok(Self { + inner, + prepare_diagnostics: StaticRedactionDiagnostics { + events: vec![stage_event( + DiagnosticStage::PrepareCacheHit, + Some(1), + Some(elapsed_us(cache_start)), + Some(input_bytes_len), + )], + }, + }); + } + + let cache_elapsed = elapsed_us(cache_start); + let parse_start = Instant::now(); + let package = prepared_search_package_from_bytes(package_bytes) .map_err(|error| to_napi_contract_error(&error))?; - let result = PreparedSearch::new_with_diagnostics(config) + let parse_elapsed = elapsed_us(parse_start); + Self::from_binding_config( + package.config, + Some(&package.artifacts), + input_bytes_len, + cache_key, + cache_elapsed, + parse_elapsed, + ) + } + + fn from_binding_config( + config: BindingPreparedSearchConfig, + artifact_bytes: Option<&[u8]>, + input_bytes_len: usize, + cache_key: [u8; 32], + cache_elapsed: u64, + parse_elapsed: u64, + ) -> Result { + let convert_start = Instant::now(); + let config = prepared_search_config_from_binding(config) + .map_err(|error| to_napi_contract_error(&error))?; + let pattern_count = config + .regex_patterns + .len() + .saturating_add(config.custom_regex_patterns.len()) + 
.saturating_add(config.literal_patterns.len()); + let convert_elapsed = elapsed_us(convert_start); + let artifact_decode_start = Instant::now(); + let artifacts = artifact_bytes + .map(PreparedSearchArtifacts::from_bytes) + .transpose() .map_err(|error| to_napi_core_error(&error))?; + let artifact_decode_elapsed = + artifact_bytes.map(|_| elapsed_us(artifact_decode_start)); + let result = if let Some(artifacts) = artifacts.as_ref() { + PreparedSearch::new_with_artifacts_diagnostics(config, artifacts) + } else { + PreparedSearch::new_with_diagnostics(config) + } + .map_err(|error| to_napi_core_error(&error))?; + let inner = Arc::new(result.prepared); + let mut diagnostics = StaticRedactionDiagnostics { + events: vec![ + stage_event( + DiagnosticStage::PrepareCacheMiss, + Some(0), + Some(cache_elapsed), + Some(input_bytes_len), + ), + stage_event( + DiagnosticStage::PrepareBindingParse, + None, + Some(parse_elapsed), + Some(input_bytes_len), + ), + stage_event( + DiagnosticStage::PrepareBindingConvert, + Some(pattern_count), + Some(convert_elapsed), + None, + ), + ], + }; + if let (Some(elapsed), Some(bytes)) = + (artifact_decode_elapsed, artifact_bytes.map(<[u8]>::len)) + { + diagnostics.events.push(stage_event( + DiagnosticStage::PrepareArtifactsDecode, + None, + Some(elapsed), + Some(bytes), + )); + } + diagnostics.extend(result.diagnostics); + prepared_search_cache_insert(cache_key, Arc::clone(&inner)); Ok(Self { - inner: result.prepared, - prepare_diagnostics: result.diagnostics, + inner, + prepare_diagnostics: diagnostics, }) } @@ -300,125 +573,57 @@ impl NativePreparedSearch { } } -fn to_binding_config( - config: JsPreparedSearchConfig, -) -> BindingPreparedSearchConfig { - BindingPreparedSearchConfig { - regex_patterns: to_binding_patterns(config.regex_patterns), - custom_regex_patterns: to_binding_patterns(config.custom_regex_patterns), - literal_patterns: to_binding_patterns(config.literal_patterns), - regex_options: 
config.regex_options.as_ref().map(to_binding_options), - custom_regex_options: config - .custom_regex_options - .as_ref() - .map(to_binding_options), - literal_options: config.literal_options.as_ref().map(to_binding_options), - slices: to_binding_slices(&config.slices), - regex_meta: to_binding_regex_meta(config.regex_meta), - custom_regex_meta: to_binding_regex_meta(config.custom_regex_meta), - deny_list_data: config.deny_list_data.map(|data| { - BindingDenyListMatchData { - labels: data.labels, - custom_labels: data.custom_labels, - originals: data.originals, - sources: data.sources, - filters: data.filters.map(to_binding_deny_list_filters), - } - }), - gazetteer_data: config.gazetteer_data.map(|data| { - BindingGazetteerMatchData { - labels: data.labels, - is_fuzzy: data.is_fuzzy, - } - }), - country_data: config.country_data.map(|data| BindingCountryMatchData { - labels: data.labels, - }), - } -} - -fn to_binding_deny_list_filters( - filters: JsDenyListFilterData, -) -> BindingDenyListFilterData { - BindingDenyListFilterData { - stopwords: filters.stopwords, - allow_list: filters.allow_list, - person_stopwords: filters.person_stopwords, - address_stopwords: filters.address_stopwords, - street_types: filters.street_types, - first_names: filters.first_names, - generic_roles: filters.generic_roles, - sentence_starters: filters.sentence_starters, - trailing_address_word_exclusions: filters.trailing_address_word_exclusions, - defined_term_cues: filters.defined_term_cues, - } -} - -fn to_binding_patterns( - patterns: Vec, -) -> Vec { - patterns - .into_iter() - .map(|pattern| BindingSearchPattern { - kind: pattern.kind, - pattern: pattern.pattern, - distance: pattern.distance, - case_insensitive: pattern.case_insensitive, - whole_words: pattern.whole_words, - lazy: pattern.lazy, - prefilter_any: pattern.prefilter_any, - prefilter_case_insensitive: pattern.prefilter_case_insensitive, - prefilter_regex: pattern.prefilter_regex, - }) - .collect() -} - -const fn 
to_binding_options(options: &JsSearchOptions) -> BindingSearchOptions { - BindingSearchOptions { - literal_case_insensitive: options.literal_case_insensitive, - literal_whole_words: options.literal_whole_words, - regex_whole_words: options.regex_whole_words, - fuzzy_case_insensitive: options.fuzzy_case_insensitive, - fuzzy_whole_words: options.fuzzy_whole_words, - fuzzy_normalize_diacritics: options.fuzzy_normalize_diacritics, - } +fn prepared_search_cache_get(key: &[u8; 32]) -> Option> { + with_prepared_search_cache(|cache| cache.get(key)) } -fn to_binding_slices( - slices: &JsPreparedSearchSlices, -) -> BindingPreparedSearchSlices { - BindingPreparedSearchSlices { - regex: slices.regex.as_ref().map(to_binding_slice), - custom_regex: slices.custom_regex.as_ref().map(to_binding_slice), - legal_forms: slices.legal_forms.as_ref().map(to_binding_slice), - triggers: slices.triggers.as_ref().map(to_binding_slice), - deny_list: slices.deny_list.as_ref().map(to_binding_slice), - street_types: slices.street_types.as_ref().map(to_binding_slice), - gazetteer: slices.gazetteer.as_ref().map(to_binding_slice), - countries: slices.countries.as_ref().map(to_binding_slice), - } +fn prepared_search_cache_insert(key: [u8; 32], value: Arc) { + with_prepared_search_cache(|cache| cache.insert(key, value)); } -const fn to_binding_slice(slice: &JsPatternSlice) -> BindingPatternSlice { - BindingPatternSlice { - start: slice.start, - end: slice.end, +fn prepared_search_cache_key( + config_bytes: &[u8], + artifact_bytes: Option<&[u8]>, +) -> [u8; 32] { + let mut hasher = blake3::Hasher::new(); + hasher.update(b"config"); + hasher.update(config_bytes); + match artifact_bytes { + Some(bytes) => { + hasher.update(b"artifacts"); + hasher.update(bytes); + } + None => { + hasher.update(b"no-artifacts"); + } } -} - -fn to_binding_regex_meta( - meta: Vec, -) -> Vec { - meta - .into_iter() - .map(|entry| BindingRegexMatchMeta { - label: entry.label, - score: entry.score, - source_detail: 
entry.source_detail, - requires_validation: entry.requires_validation, - min_byte_length: entry.min_byte_length, - }) - .collect() + *hasher.finalize().as_bytes() +} + +fn prepared_search_package_cache_key(package_bytes: &[u8]) -> Result<[u8; 32]> { + let digest = prepared_search_package_digest(package_bytes) + .map_err(|error| to_napi_contract_error(&error))?; + let mut hasher = blake3::Hasher::new(); + hasher.update(b"prepared-package"); + hasher.update(&digest); + let len = u64::try_from(package_bytes.len()).map_err(|_| { + Error::from_reason(format!( + "Prepared package byte length exceeds u64 range: {}", + package_bytes.len() + )) + })?; + hasher.update(&len.to_le_bytes()); + Ok(*hasher.finalize().as_bytes()) +} + +fn with_prepared_search_cache( + action: impl FnOnce(&mut PreparedSearchCache) -> T, +) -> T { + let mut cache = match PREPARED_SEARCH_CACHE.lock() { + Ok(cache) => cache, + Err(poisoned) => poisoned.into_inner(), + }; + action(&mut cache) } fn to_binding_operator_config( @@ -486,6 +691,37 @@ fn to_js_operator_entries( .collect() } +const fn stage_event( + stage: DiagnosticStage, + count: Option, + elapsed_us: Option, + input_bytes: Option, +) -> DiagnosticEvent { + DiagnosticEvent { + stage, + kind: DiagnosticEventKind::StageSummary, + count, + engine: None, + pattern: None, + source: None, + source_detail: None, + label: None, + start: None, + end: None, + text: None, + score: None, + span_valid: None, + elapsed_us, + input_bytes, + reason: None, + } +} + +fn elapsed_us(start: Instant) -> u64 { + let micros = start.elapsed().as_micros(); + u64::try_from(micros).unwrap_or(u64::MAX) +} + fn to_napi_core_error(error: &stella_anonymize_core::Error) -> Error { Error::from_reason(error.to_string()) } diff --git a/crates/anonymize-py/src/lib.rs b/crates/anonymize-py/src/lib.rs index 92c6e374..e0c8ad81 100644 --- a/crates/anonymize-py/src/lib.rs +++ b/crates/anonymize-py/src/lib.rs @@ -1,15 +1,19 @@ use pyo3::exceptions::PyValueError; use 
pyo3::prelude::*; +use pyo3::types::PyBytes; use stella_anonymize_adapter_contract::{ BindingOperatorConfig, BindingOperatorEntry, BindingPipelineEntity, BindingPreparedSearchConfig, BindingRedactionEntry, BindingRedactionResult, BindingStaticRedactionResult, ContractError, operator_config_from_binding, - prepared_search_config_from_binding, + prepared_search_config_from_binding, prepared_search_package_from_bytes, + prepared_search_package_to_bytes, + prepared_search_package_to_compressed_bytes, static_redaction_diagnostic_result_to_binding, static_redaction_diagnostics_to_binding, static_redaction_result_to_binding, }; use stella_anonymize_core::{ - PreparedSearch as CorePreparedSearch, StaticRedactionDiagnostics, + PreparedSearch as CorePreparedSearch, PreparedSearchArtifacts, + StaticRedactionDiagnostics, }; #[pyclass(name = "RedactionEntry", get_all, skip_from_py_object)] @@ -64,12 +68,43 @@ pub struct PyPreparedSearch { impl PyPreparedSearch { #[new] fn new(config_json: &str) -> PyResult { - let config = parse_prepared_search_config(config_json)?; - let result = CorePreparedSearch::new_with_diagnostics( - prepared_search_config_from_binding(config) - .map_err(|error| to_py_contract_error(&error))?, - ) - .map_err(|error| to_py_core_error(&error))?; + let config = parse_core_prepared_search_config(config_json)?; + let result = CorePreparedSearch::new_with_diagnostics(config) + .map_err(|error| to_py_core_error(&error))?; + Ok(Self { + inner: result.prepared, + prepare_diagnostics: result.diagnostics, + }) + } + + #[staticmethod] + fn from_config_json_and_artifact_bytes( + config_json: &str, + artifact_bytes: &[u8], + ) -> PyResult { + let config = parse_core_prepared_search_config(config_json)?; + let artifacts = PreparedSearchArtifacts::from_bytes(artifact_bytes) + .map_err(|error| to_py_core_error(&error))?; + let result = + CorePreparedSearch::new_with_artifacts_diagnostics(config, &artifacts) + .map_err(|error| to_py_core_error(&error))?; + Ok(Self { + 
inner: result.prepared, + prepare_diagnostics: result.diagnostics, + }) + } + + #[staticmethod] + fn from_prepared_package_bytes(package_bytes: &[u8]) -> PyResult { + let package = prepared_search_package_from_bytes(package_bytes) + .map_err(|error| to_py_contract_error(&error))?; + let config = prepared_search_config_from_binding(package.config) + .map_err(|error| to_py_contract_error(&error))?; + let artifacts = PreparedSearchArtifacts::from_bytes(&package.artifacts) + .map_err(|error| to_py_core_error(&error))?; + let result = + CorePreparedSearch::new_with_artifacts_diagnostics(config, &artifacts) + .map_err(|error| to_py_core_error(&error))?; Ok(Self { inner: result.prepared, prepare_diagnostics: result.diagnostics, @@ -145,6 +180,54 @@ fn redact_static_entities_json( prepared.redact_static_entities_json(full_text, operators_json) } +#[pyfunction] +fn prepare_static_search_artifacts_bytes<'py>( + py: Python<'py>, + config_json: &str, +) -> PyResult> { + let config = parse_core_prepared_search_config(config_json)?; + let bytes = CorePreparedSearch::prepare_artifacts(config) + .and_then(|artifacts| artifacts.to_bytes()) + .map_err(|error| to_py_core_error(&error))?; + Ok(PyBytes::new(py, &bytes)) +} + +#[pyfunction] +fn prepare_static_search_package_bytes<'py>( + py: Python<'py>, + config_json: &str, +) -> PyResult> { + prepare_static_search_package_bytes_with(py, config_json, false) +} + +#[pyfunction] +fn prepare_static_search_compressed_package_bytes<'py>( + py: Python<'py>, + config_json: &str, +) -> PyResult> { + prepare_static_search_package_bytes_with(py, config_json, true) +} + +fn prepare_static_search_package_bytes_with<'py>( + py: Python<'py>, + config_json: &str, + compressed: bool, +) -> PyResult> { + let binding_config = parse_prepared_search_config(config_json)?; + let core_config = prepared_search_config_from_binding(binding_config.clone()) + .map_err(|error| to_py_contract_error(&error))?; + let artifacts = 
CorePreparedSearch::prepare_artifacts(core_config) + .and_then(|artifacts| artifacts.to_bytes()) + .map_err(|error| to_py_core_error(&error))?; + let package = if compressed { + prepared_search_package_to_compressed_bytes(&binding_config, &artifacts) + } else { + prepared_search_package_to_bytes(&binding_config, &artifacts) + }; + let bytes = package.map_err(|error| to_py_contract_error(&error))?; + Ok(PyBytes::new(py, &bytes)) +} + #[pyfunction] fn redact_static_entities_diagnostics_json( config_json: &str, @@ -166,6 +249,15 @@ fn parse_prepared_search_config( serde_json::from_str(config_json).map_err(|error| to_py_serde_error(&error)) } +fn parse_core_prepared_search_config( + config_json: &str, +) -> PyResult { + prepared_search_config_from_binding(parse_prepared_search_config( + config_json, + )?) + .map_err(|error| to_py_contract_error(&error)) +} + fn parse_operator_config( operators_json: Option<&str>, ) -> PyResult> { @@ -315,6 +407,18 @@ fn stella_anonymize_core_py(module: &Bound<'_, PyModule>) -> PyResult<()> { module.add_class::()?; module .add_function(wrap_pyfunction!(redact_static_entities_json, module)?)?; + module.add_function(wrap_pyfunction!( + prepare_static_search_artifacts_bytes, + module + )?)?; + module.add_function(wrap_pyfunction!( + prepare_static_search_package_bytes, + module + )?)?; + module.add_function(wrap_pyfunction!( + prepare_static_search_compressed_package_bytes, + module + )?)?; module.add_function(wrap_pyfunction!( redact_static_entities_diagnostics_json, module diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 5601bd2d..30566c7d 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -39,6 +39,36 @@ const WARM_ITERATIONS = positiveIntegerEnv( "ANONYMIZE_MIGRATION_WARM_ITERATIONS", 2, ); +const CACHED_PREPARE_ITERATIONS = positiveIntegerEnv( + 
"ANONYMIZE_MIGRATION_CACHED_PREPARE_ITERATIONS", + 3, +); +const PROFILE_REGEX_LABELS = + process.env.ANONYMIZE_MIGRATION_PROFILE_REGEX_LABELS === "1"; +const PROFILE_SCOPED_PREPARE = + process.env.ANONYMIZE_MIGRATION_PROFILE_SCOPED_PREPARE === "1"; +const NATIVE_PREPARED_PACKAGE = + process.env.ANONYMIZE_MIGRATION_NATIVE_PREPARED_PACKAGE === "1"; +const NATIVE_COMPRESSED_PACKAGE = + process.env.ANONYMIZE_MIGRATION_NATIVE_COMPRESSED_PACKAGE === "1"; +const NATIVE_PREPARED_ARTIFACTS = + !NATIVE_PREPARED_PACKAGE && + process.env.ANONYMIZE_MIGRATION_NATIVE_PREPARED_ARTIFACTS === "1"; +const FIXTURE_LANGUAGE_FILTER = stringListEnv( + "ANONYMIZE_MIGRATION_FIXTURE_LANGUAGES", +); +const CONTENT_LANGUAGE = + process.env.ANONYMIZE_MIGRATION_CONTENT_LANGUAGE?.trim() ?? ""; +const NATIVE_CONFIG_PATH = + process.env.ANONYMIZE_MIGRATION_NATIVE_CONFIG_PATH?.trim() ?? ""; +const WRITE_NATIVE_CONFIG_PATH = + process.env.ANONYMIZE_MIGRATION_WRITE_NATIVE_CONFIG_PATH?.trim() ?? ""; +const NATIVE_PACKAGE_PATH = + process.env.ANONYMIZE_MIGRATION_NATIVE_PACKAGE_PATH?.trim() ?? ""; +const WRITE_NATIVE_PACKAGE_PATH = + process.env.ANONYMIZE_MIGRATION_WRITE_NATIVE_PACKAGE_PATH?.trim() ?? ""; +const USER_DATA_SCENARIO = + process.env.ANONYMIZE_MIGRATION_USER_DATA_SCENARIO?.trim() ?? "none"; if (process.env.ANONYMIZE_MIGRATION_WORKER === "1") { await runWorker(); @@ -47,7 +77,11 @@ if (process.env.ANONYMIZE_MIGRATION_WORKER === "1") { } async function runCoordinator() { - const fixtures = discoverFixtures(FIXTURES_DIR); + const fixtures = discoverFixtures(FIXTURES_DIR).filter((fixture) => + FIXTURE_LANGUAGE_FILTER.length === 0 + ? 
true + : FIXTURE_LANGUAGE_FILTER.includes(fixtureLanguage(fixture)), + ); if (fixtures.length === 0) { throw new Error(`No contract fixtures found in ${FIXTURES_DIR}`); } @@ -128,6 +162,9 @@ function runVariant({ ANONYMIZE_MIGRATION_FIXTURES: JSON.stringify(fixtures), ANONYMIZE_MIGRATION_RESULT_PATH: resultPath, ANONYMIZE_MIGRATION_WARM_ITERATIONS: String(WARM_ITERATIONS), + ANONYMIZE_MIGRATION_CACHED_PREPARE_ITERATIONS: String( + CACHED_PREPARE_ITERATIONS, + ), }, encoding: "utf8", maxBuffer: 64 * 1024 * 1024, @@ -155,51 +192,130 @@ async function runWorker() { const resultPath = requiredEnv("ANONYMIZE_MIGRATION_RESULT_PATH"); const fixtures = JSON.parse(requiredEnv("ANONYMIZE_MIGRATION_FIXTURES")); validateRuntime(runtime); - - const importStart = Bun.nanoseconds(); - const [indexModule, configModule, dictionaryModule] = await Promise.all([ - importSource(sourceRoot, "packages/anonymize/src/index.ts", variant), - importSource( - sourceRoot, - "packages/anonymize/src/__test__/contract-config.ts", - variant, - ), - importSource( - sourceRoot, - "packages/anonymize/src/__test__/load-dictionaries.ts", - variant, - ), - ]); - const importMs = elapsedMs(importStart); - - const dictionaryStart = Bun.nanoseconds(); - const dictionaries = await dictionaryModule.loadTestDictionaries(); - const dictionaryMs = elapsedMs(dictionaryStart); - - const config = { - ...configModule.contractTestConfig(`migration-fixtures-${variant}`), - dictionaries, - }; - const context = indexModule.createPipelineContext(); - - const prepareStart = Bun.nanoseconds(); - const search = - runtime === "native-static" - ? 
await prepareNativeStaticSearch({ + const usePrebuiltNativePackage = + runtime === "native-static" && NATIVE_PACKAGE_PATH.length > 0; + const usePrebuiltNativeConfig = + !usePrebuiltNativePackage && + runtime === "native-static" && + NATIVE_CONFIG_PATH.length > 0; + + let indexModule = null; + let config = null; + let context = null; + let search = null; + let nativeConfigBytes = null; + let nativePackageBuffer = null; + let importMs = 0; + let dictionaryMs = 0; + let prepareMs = 0; + let nativeConfigReadMs = 0; + let nativeConfigParseMs = 0; + let nativePackageReadMs = 0; + let nativePackageCompressed = NATIVE_COMPRESSED_PACKAGE; + + if (usePrebuiltNativePackage) { + const packageReadStart = Bun.nanoseconds(); + nativePackageBuffer = readFileSync(NATIVE_PACKAGE_PATH); + nativePackageReadMs = elapsedMs(packageReadStart); + nativePackageCompressed = isCompressedNativePackage(nativePackageBuffer); + } else if (usePrebuiltNativeConfig) { + const configReadStart = Bun.nanoseconds(); + nativeConfigBytes = readFileSync(NATIVE_CONFIG_PATH); + nativeConfigReadMs = elapsedMs(configReadStart); + const configParseStart = Bun.nanoseconds(); + search = { + nativeStaticConfig: JSON.parse(nativeConfigBytes.toString("utf8")), + }; + nativeConfigParseMs = elapsedMs(configParseStart); + } else { + const importStart = Bun.nanoseconds(); + const [loadedIndexModule, configModule, dictionaryModule] = + await Promise.all([ + importSource(sourceRoot, "packages/anonymize/src/index.ts", variant), + importSource( sourceRoot, + "packages/anonymize/src/__test__/contract-config.ts", variant, - config, - context, - }) - : await indexModule.preparePipelineSearch({ config, context }); - const prepareMs = elapsedMs(prepareStart); - const nativeRewrite = describeNativeRewrite(config, search, runtime); - - const runtimeRunner = - runtime === "native-static" - ? 
createNativeStaticRunner(search.nativeStaticConfig) - : null; + ), + importSource( + sourceRoot, + "packages/anonymize/src/__test__/load-dictionaries.ts", + variant, + ), + ]); + indexModule = loadedIndexModule; + importMs = elapsedMs(importStart); + + const scope = contentLanguageScope(); + const dictionaryStart = Bun.nanoseconds(); + const dictionaries = await dictionaryModule.loadTestDictionaries(scope); + dictionaryMs = elapsedMs(dictionaryStart); + + config = { + ...configModule.contractTestConfig(`migration-fixtures-${variant}`), + ...scope, + dictionaries, + }; + config = applyUserDataScenario(config); + context = indexModule.createPipelineContext(); + + const prepareStart = Bun.nanoseconds(); + search = + runtime === "native-static" + ? await prepareNativeStaticSearch({ + sourceRoot, + variant, + config, + context, + }) + : await indexModule.preparePipelineSearch({ config, context }); + prepareMs = elapsedMs(prepareStart); + if ( + runtime === "native-static" && + WRITE_NATIVE_CONFIG_PATH.length > 0 && + search.nativeStaticConfig + ) { + writeFileSync( + WRITE_NATIVE_CONFIG_PATH, + JSON.stringify(search.nativeStaticConfig), + ); + } + } + const nativeRewrite = usePrebuiltNativePackage + ? describeNativeRewriteFromNativePackage(runtime) + : usePrebuiltNativeConfig && search.nativeStaticConfig + ? describeNativeRewriteFromNativeConfig( + search.nativeStaticConfig, + runtime, + ) + : describeNativeRewrite(config, search, runtime); + + let runtimeRunner = null; + if (runtime === "native-static" && nativePackageBuffer !== null) { + runtimeRunner = + createNativeStaticRunnerFromPackageBytes(nativePackageBuffer); + } else if (runtime === "native-static" && nativeConfigBytes === null) { + runtimeRunner = createNativeStaticRunner(search.nativeStaticConfig); + } else if (runtime === "native-static") { + runtimeRunner = createNativeStaticRunnerFromJsonBytes(nativeConfigBytes); + } const nativePrepareMs = runtimeRunner?.prepareMs ?? 
0; + const nativeStringifyMs = runtimeRunner?.stringifyMs ?? 0; + const nativeArtifactPrepareMs = runtimeRunner?.artifactPrepareMs ?? 0; + const nativeArtifactBytes = runtimeRunner?.artifactBytes ?? 0; + const nativePackagePrepareMs = runtimeRunner?.packagePrepareMs ?? 0; + const nativePackageBytes = runtimeRunner?.packageBytes ?? 0; + const nativeCachedPrepareMsByIteration = + runtimeRunner?.cachedPrepareMsByIteration ?? []; + const nativeCachedPrepareAvgMs = + nativeCachedPrepareMsByIteration.length === 0 + ? 0 + : roundMs( + nativeCachedPrepareMsByIteration.reduce( + (sum, value) => sum + value, + 0, + ) / nativeCachedPrepareMsByIteration.length, + ); const coldRun = runtimeRunner === null @@ -233,6 +349,23 @@ async function runWorker() { runtimeRunner === null ? null : collectNativeDiagnostics({ runner: runtimeRunner, fixtures }); + if (nativeDiagnostics !== null && PROFILE_REGEX_LABELS) { + nativeDiagnostics.regexPrepareByLabel = profileNativeRegexPrepare( + search.nativeStaticConfig, + ); + } + if ( + nativeDiagnostics !== null && + PROFILE_SCOPED_PREPARE && + !usePrebuiltNativeConfig + ) { + nativeDiagnostics.scopedPrepare = await profileScopedNativePrepare({ + sourceRoot, + variant, + baseConfig: config, + fixtures, + }); + } const snapshots = Object.fromEntries( coldRun.fixtures.map((fixture) => [fixture.fixture, fixture.snapshot]), ); @@ -250,13 +383,35 @@ async function runWorker() { importMs, dictionaryMs, prepareMs, + nativeConfigReadMs, + nativeConfigParseMs, + nativePackageReadMs, + nativeStringifyMs, + nativeArtifactPrepareMs, + nativeArtifactBytes, + nativePackageCompressed, + nativePackagePrepareMs, + nativePackageBytes, nativePrepareMs, + nativeCachedPrepareMsByIteration, + nativeCachedPrepareAvgMs, coldRunMs: coldRun.ms, coldPipelineMs: roundMs( - dictionaryMs + prepareMs + nativePrepareMs + coldRun.ms, + dictionaryMs + + prepareMs + + nativeConfigReadMs + + nativeStringifyMs + + nativePrepareMs + + coldRun.ms, ), coldTotalMs: roundMs( - 
importMs + dictionaryMs + prepareMs + nativePrepareMs + coldRun.ms, + importMs + + dictionaryMs + + prepareMs + + nativeConfigReadMs + + nativeStringifyMs + + nativePrepareMs + + coldRun.ms, ), warmRunMsByIteration: warmRuns.map((run) => run.ms), warmRunMs, @@ -379,6 +534,17 @@ function collectNativeDiagnostics({ runner, fixtures }) { diagnosticStageSummaries(runner.prepareDiagnostics.events), ), }, + cachedPrepare: + runner.cachedPrepareDiagnostics === null + ? null + : { + stages: diagnosticStageSummaries( + runner.cachedPrepareDiagnostics.events, + ), + topStages: topDiagnosticStages( + diagnosticStageSummaries(runner.cachedPrepareDiagnostics.events), + ), + }, run: summarizeFixtureDiagnostics(fixtureDiagnostics), }; } @@ -488,6 +654,8 @@ function toSnapshot(indexModule, fullText, entities, context) { entities: sorted.map(({ start, end, label, text, source }) => ({ start, end, + byteStart: utf16OffsetToUtf8ByteOffset(fullText, start), + byteEnd: utf16OffsetToUtf8ByteOffset(fullText, end), label, text, source, @@ -515,6 +683,8 @@ function toNativeSnapshot(result) { entities: entities.map(({ start, end, label, text, source }) => ({ start, end, + byteStart: start, + byteEnd: end, label, text, source, @@ -533,7 +703,7 @@ function compareSnapshots(baseline, candidate) { for (const fixture of [...fixtureNames].sort()) { const expected = baseline.snapshots[fixture]; const actual = candidate.snapshots[fixture]; - if (JSON.stringify(expected) === JSON.stringify(actual)) { + if (snapshotsAreEquivalent(expected, actual)) { continue; } mismatches.push(describeMismatch(fixture, expected, actual)); @@ -544,6 +714,7 @@ function compareSnapshots(baseline, candidate) { baseline: baseline.variant, candidate: candidate.variant, equal: mismatches.length === 0, + mismatchSummary: mismatchSummary(mismatches), fixtureCount: fixtureNames.size, mismatches, timingComparison: timingComparison(baseline, candidate), @@ -554,6 +725,54 @@ function compareSnapshots(baseline, candidate) { }; } 
+function mismatchSummary(mismatches) { + const byCategory = {}; + let materialMismatchCount = 0; + let redactionMismatchCount = 0; + let sourceOnlyMismatchCount = 0; + + for (const mismatch of mismatches) { + const category = mismatch.category ?? mismatch.kind; + byCategory[category] = (byCategory[category] ?? 0) + 1; + if (mismatch.sourceAgnosticEqual !== true) { + materialMismatchCount += 1; + } + if (mismatch.redactedTextEqual === false) { + redactionMismatchCount += 1; + } + if ( + mismatch.redactedTextEqual && + mismatch.sourceOnlyCount > 0 && + Object.keys(mismatch.candidateExtraByLabel ?? {}).length === 0 && + Object.keys(mismatch.candidateMissingByLabel ?? {}).length === 0 + ) { + sourceOnlyMismatchCount += 1; + } + } + + return { + strictMismatchCount: mismatches.length, + materialMismatchCount, + redactionMismatchCount, + sourceOnlyMismatchCount, + byCategory, + }; +} + +function snapshotsAreEquivalent(expected, actual) { + if (expected === undefined || actual === undefined) { + return false; + } + if (JSON.stringify(expected) === JSON.stringify(actual)) { + return true; + } + return ( + expected.redactedText === actual.redactedText && + JSON.stringify(byteNormalizedSnapshot(expected)) === + JSON.stringify(byteNormalizedSnapshot(actual)) + ); +} + function describeMismatch(fixture, expected, actual) { if (expected === undefined || actual === undefined) { return { @@ -566,10 +785,23 @@ function describeMismatch(fixture, expected, actual) { expected.entities, actual.entities, ); + const expectedByteSnapshot = byteNormalizedSnapshot(expected); + const actualByteSnapshot = byteNormalizedSnapshot(actual); + const byteNormalizedEqual = + JSON.stringify(expectedByteSnapshot) === JSON.stringify(actualByteSnapshot); + const sourceAgnosticEqual = + JSON.stringify(sourceAgnosticSnapshot(expectedByteSnapshot)) === + JSON.stringify(sourceAgnosticSnapshot(actualByteSnapshot)); + const firstByteEntityDiff = firstDifferentIndex( + expectedByteSnapshot.entities, + 
actualByteSnapshot.entities, + ); + const category = mismatchCategory(expected, actual); return { fixture, kind: "snapshot-mismatch", + category: category.kind, entityCount: { baseline: expected.entityCount, candidate: actual.entityCount, @@ -579,6 +811,15 @@ function describeMismatch(fixture, expected, actual) { candidate: actual.counts, }, redactedTextEqual: expected.redactedText === actual.redactedText, + byteNormalizedEqual, + sourceAgnosticEqual, + sourceOnlyCount: category.sourceOnlyCount, + candidateExtraByLabel: category.candidateExtraByLabel, + candidateMissingByLabel: category.candidateMissingByLabel, + candidateExtra: category.candidateExtra, + candidateMissing: category.candidateMissing, + firstCandidateExtra: category.firstCandidateExtra, + firstCandidateMissing: category.firstCandidateMissing, firstEntityDiff: firstEntityDiff === -1 ? null @@ -587,6 +828,230 @@ function describeMismatch(fixture, expected, actual) { baseline: expected.entities.at(firstEntityDiff) ?? null, candidate: actual.entities.at(firstEntityDiff) ?? null, }, + firstByteEntityDiff: + firstByteEntityDiff === -1 + ? null + : { + index: firstByteEntityDiff, + baseline: + expectedByteSnapshot.entities.at(firstByteEntityDiff) ?? null, + candidate: + actualByteSnapshot.entities.at(firstByteEntityDiff) ?? 
null, + }, + }; +} + +function mismatchCategory(expected, actual) { + const expectedByteEntities = byteNormalizedSnapshot(expected).entities; + const actualByteEntities = byteNormalizedSnapshot(actual).entities; + const redactedTextEqual = expected.redactedText === actual.redactedText; + const entitySetEqual = + JSON.stringify(expectedByteEntities) === JSON.stringify(actualByteEntities); + if (redactedTextEqual && entitySetEqual) { + return emptyMismatchCategory("metadata-only"); + } + + const expectedSpanLabel = countEntitiesByKey( + expectedByteEntities, + entitySpanLabelKey, + ); + const actualSpanLabel = countEntitiesByKey( + actualByteEntities, + entitySpanLabelKey, + ); + if (mapsEqual(expectedSpanLabel, actualSpanLabel)) { + return { + ...sourceDriftCategory(expectedByteEntities, actualByteEntities), + kind: redactedTextEqual ? "text-or-source" : "span-label-only", + }; + } + + const expectedContent = countEntitiesByKey( + expectedByteEntities, + entityContentKey, + ); + const actualContent = countEntitiesByKey( + actualByteEntities, + entityContentKey, + ); + if (mapsEqual(expectedContent, actualContent)) { + return { + ...sourceDriftCategory(expectedByteEntities, actualByteEntities), + kind: redactedTextEqual ? "source-only" : "source-or-order", + }; + } + + const delta = entityDelta(expectedByteEntities, actualByteEntities); + return { + kind: delta.missing.length === 0 ? 
"candidate-extra" : "coverage-drift", + sourceOnlyCount: sourceDriftCategory( + expectedByteEntities, + actualByteEntities, + ).sourceOnlyCount, + candidateExtraByLabel: countByLabel(delta.extra), + candidateMissingByLabel: countByLabel(delta.missing), + candidateExtra: delta.extra.map(entitySummary), + candidateMissing: delta.missing.map(entitySummary), + firstCandidateExtra: entitySummary(delta.extra.at(0)), + firstCandidateMissing: entitySummary(delta.missing.at(0)), + }; +} + +function emptyMismatchCategory(kind) { + return { + kind, + sourceOnlyCount: 0, + candidateExtraByLabel: {}, + candidateMissingByLabel: {}, + candidateExtra: [], + candidateMissing: [], + firstCandidateExtra: null, + firstCandidateMissing: null, + }; +} + +function sourceDriftCategory(expectedEntities, actualEntities) { + const expectedByContent = groupEntitiesByKey( + expectedEntities, + entityContentKey, + ); + const actualByContent = groupEntitiesByKey(actualEntities, entityContentKey); + let sourceOnlyCount = 0; + for (const [key, expectedGroup] of expectedByContent) { + const actualGroup = actualByContent.get(key) ?? 
[]; + const expectedSources = expectedGroup.map((entity) => entity.source).sort(); + const actualSources = actualGroup.map((entity) => entity.source).sort(); + if (JSON.stringify(expectedSources) !== JSON.stringify(actualSources)) { + sourceOnlyCount += Math.max(expectedGroup.length, actualGroup.length); + } + } + return { + ...emptyMismatchCategory("source-only"), + sourceOnlyCount, + }; +} + +function entityDelta(expectedEntities, actualEntities) { + const expectedCounts = countEntitiesByKey(expectedEntities, entityContentKey); + const actualCounts = countEntitiesByKey(actualEntities, entityContentKey); + + return { + missing: takeEntityDelta(expectedEntities, expectedCounts, actualCounts), + extra: takeEntityDelta(actualEntities, actualCounts, expectedCounts), + }; +} + +function takeEntityDelta(entities, ownCounts, otherCounts) { + const remaining = new Map(); + for (const [key, ownCount] of ownCounts) { + const diff = ownCount - (otherCounts.get(key) ?? 0); + if (diff > 0) { + remaining.set(key, diff); + } + } + + const delta = []; + for (const entity of entities) { + const key = entityContentKey(entity); + const count = remaining.get(key) ?? 0; + if (count <= 0) { + continue; + } + delta.push(entity); + remaining.set(key, count - 1); + } + return delta; +} + +function entityContentKey(entity) { + return [entity.start, entity.end, entity.label, entity.text].join("\u0000"); +} + +function entitySpanLabelKey(entity) { + return [entity.start, entity.end, entity.label].join("\u0000"); +} + +function countEntitiesByKey(entities, keyFn) { + const counts = new Map(); + for (const entity of entities) { + const key = keyFn(entity); + counts.set(key, (counts.get(key) ?? 0) + 1); + } + return counts; +} + +function groupEntitiesByKey(entities, keyFn) { + const groups = new Map(); + for (const entity of entities) { + const key = keyFn(entity); + const group = groups.get(key) ?? 
[]; + group.push(entity); + groups.set(key, group); + } + return groups; +} + +function mapsEqual(left, right) { + if (left.size !== right.size) { + return false; + } + for (const [key, value] of left) { + if (right.get(key) !== value) { + return false; + } + } + return true; +} + +function countByLabel(entities) { + const counts = {}; + for (const entity of entities) { + counts[entity.label] = (counts[entity.label] ?? 0) + 1; + } + return counts; +} + +function entitySummary(entity) { + if (!entity) { + return null; + } + return { + start: entity.start, + end: entity.end, + label: entity.label, + source: entity.source, + }; +} + +function byteNormalizedSnapshot(snapshot) { + const entities = snapshot.entities + .map(({ byteStart, byteEnd, label, text, source }) => ({ + start: byteStart, + end: byteEnd, + label, + text, + source, + })) + .toSorted( + (left, right) => + left.start - right.start || + left.end - right.end || + left.label.localeCompare(right.label) || + left.text.localeCompare(right.text), + ); + + return { + entityCount: snapshot.entityCount, + counts: snapshot.counts, + entities, + redactedText: snapshot.redactedText, + }; +} + +function sourceAgnosticSnapshot(snapshot) { + return { + ...snapshot, + entities: snapshot.entities.map(({ source: _source, ...entity }) => entity), }; } @@ -630,37 +1095,56 @@ function printVariantSummary(result) { } function describeNativeRewrite(config, search, runtime) { - const sliceLengths = Object.fromEntries( + const tsSliceLengths = Object.fromEntries( Object.entries(search.slices).map(([name, slice]) => [ name, sliceLength(slice), ]), ); - const regexValidationSlots = countRegexValidationSlots(search.regexMeta); - const denyListSourceCounts = countDenyListSources(search.denyListData); const nativeStaticConfig = search.nativeStaticConfig; - const unsupportedSearchSlots = [ - unsupportedSlot("regex", regexValidationSlots, "regex validators"), - unsupportedSlot("triggers", sliceLengths.triggers, "trigger extraction"), 
- unsupportedSlot("streetTypes", sliceLengths.streetTypes, "address seeds"), - ].filter((slot) => slot.count > 0); - const supportedSearchSlots = nativeStaticConfig + const sliceLengths = nativeStaticConfig + ? nativeSliceLengths(nativeStaticConfig, tsSliceLengths) + : tsSliceLengths; + const regexValidationSlots = countUnsupportedRegexValidationSlots( + search.regexMeta, + nativeStaticConfig, + ); + const denyListSourceCounts = countDenyListSources(search.denyListData); + const nativeSupported = nativeStaticConfig ? nativeStaticConfig.regex_patterns.length + nativeStaticConfig.custom_regex_patterns.length + nativeStaticConfig.literal_patterns.length - : Math.max(0, sliceLengths.regex - regexValidationSlots) + + : null; + const unsupportedSearchSlots = [ + unsupportedSlot("regex", regexValidationSlots, "regex validators"), + unsupportedSlot( + "triggers", + nativeStaticConfig ? 0 : sliceLengths.triggers, + "trigger extraction", + ), + unsupportedSlot( + "streetTypes", + nativeStaticConfig ? 0 : tsSliceLengths.streetTypes, + "address seeds", + ), + ].filter((slot) => slot.count > 0); + const supportedSearchSlots = + nativeSupported ?? + Math.max(0, sliceLengths.regex - regexValidationSlots) + sliceLengths.customRegex + denyListSourceCounts.customOnly + denyListSourceCounts.curated + sliceLengths.gazetteer + sliceLengths.countries; - const totalSearchSlots = Object.values(sliceLengths).reduce( - (sum, length) => sum + length, - 0, - ); + const totalSearchSlots = nativeSupported + ? 
supportedSearchSlots + + unsupportedSearchSlots.reduce((sum, slot) => sum + slot.count, 0) + : Object.values(sliceLengths).reduce((sum, length) => sum + length, 0); const unsupportedPipelineStages = describeUnsupportedPipelineStages( config, search, + runtime, + nativeStaticConfig, ); return { @@ -683,12 +1167,81 @@ function describeNativeRewrite(config, search, runtime) { }; } -function describeUnsupportedPipelineStages(config, search) { +function describeNativeRewriteFromNativeConfig(nativeStaticConfig, runtime) { + const supportedSearchSlots = + nativeStaticConfig.regex_patterns.length + + nativeStaticConfig.custom_regex_patterns.length + + nativeStaticConfig.literal_patterns.length; + const sliceLengths = nativeSliceLengths(nativeStaticConfig, {}); + + return { + measuredInPipeline: runtime === "native-static", + pipelineRuntime: runtime, + fullPipelineNativeEligible: false, + searchSlotCoverage: { + supported: supportedSearchSlots, + total: supportedSearchSlots, + ratio: 1, + }, + sliceLengths, + unsupportedSearchSlots: [], + unsupportedPipelineStages: ["prebuilt-config-summary-only"], + }; +} + +function describeNativeRewriteFromNativePackage(runtime) { + return { + measuredInPipeline: runtime === "native-static", + pipelineRuntime: runtime, + fullPipelineNativeEligible: false, + searchSlotCoverage: { + supported: 0, + total: 0, + ratio: 1, + }, + sliceLengths: { + regex: 0, + customRegex: 0, + legalForms: 0, + triggers: 0, + denyList: 0, + streetTypes: 0, + gazetteer: 0, + countries: 0, + }, + unsupportedSearchSlots: [], + unsupportedPipelineStages: ["prebuilt-package-summary-only"], + }; +} + +function nativeSliceLengths(nativeStaticConfig, fallbackSliceLengths) { + const slices = nativeStaticConfig.slices ?? 
{}; + return { + regex: sliceLength(slices.regex), + customRegex: sliceLength(slices.custom_regex), + legalForms: sliceLength(slices.legal_forms), + triggers: sliceLength(slices.triggers), + denyList: sliceLength(slices.deny_list), + streetTypes: nativeStaticConfig + ? sliceLength(slices.street_types) + : fallbackSliceLengths.streetTypes, + gazetteer: sliceLength(slices.gazetteer), + countries: sliceLength(slices.countries), + }; +} + +function describeUnsupportedPipelineStages( + config, + search, + runtime, + nativeStaticConfig, +) { const stages = []; - if (config.enableLegalForms) { + const nativeRuntime = runtime === "native-static" && nativeStaticConfig; + if (config.enableLegalForms && !nativeRuntime) { stages.push("legal-forms-v2"); } - if (config.enableTriggerPhrases) { + if (config.enableTriggerPhrases && !nativeRuntime) { stages.push("triggers"); } if (config.enableNameCorpus) { @@ -708,19 +1261,38 @@ function describeUnsupportedPipelineStages(config, search) { if (config.enableCoreference) { stages.push("coreference"); } - if (sliceLength(search.slices.streetTypes) > 0) { + if (!nativeRuntime && sliceLength(search.slices.streetTypes) > 0) { stages.push("address-seeds"); } - stages.push("signatures", "false-positive-filters", "final-extensions"); + if (!nativeRuntime) { + stages.push("signatures"); + } + stages.push("false-positive-filters", "final-extensions"); return stages; } -function countRegexValidationSlots(regexMeta) { - return regexMeta.reduce( - (count, meta) => count + (meta.requiresValidation === true ? 1 : 0), - 0, +function countUnsupportedRegexValidationSlots(regexMeta, nativeStaticConfig) { + const nativeValidatorIds = new Set( + (nativeStaticConfig?.regex_meta ?? 
[]) + .map((meta) => meta.validator_id) + .filter((validatorId) => typeof validatorId === "string"), ); + let count = 0; + for (const meta of regexMeta) { + if (!regexMetaRequiresValidation(meta)) { + continue; + } + if (meta.validatorId && nativeValidatorIds.has(meta.validatorId)) { + continue; + } + count += 1; + } + return count; +} + +function regexMetaRequiresValidation(meta) { + return meta?.validator !== undefined || meta?.requiresValidation === true; } function countDenyListSources(denyListData) { @@ -807,6 +1379,10 @@ function firstDifferentIndex(left, right) { return -1; } +function utf16OffsetToUtf8ByteOffset(text, offset) { + return Buffer.byteLength(text.slice(0, offset), "utf8"); +} + function discoverFixtures(fixturesDir) { const paths = []; for (const language of readdirSync(fixturesDir)) { @@ -873,132 +1449,526 @@ function createNativeStaticRunner(nativeStaticConfig) { throw new Error("Native static runtime requires nativeStaticConfig"); } + const stringifyStart = Bun.nanoseconds(); + const configJson = JSON.stringify(nativeStaticConfig); + const stringifyMs = elapsedMs(stringifyStart); + return createNativeStaticRunnerFromJson(configJson, stringifyMs); +} + +function createNativeStaticRunnerFromJson(configJson, stringifyMs = 0) { const native = loadNativeAdapter(); + const configBytes = Buffer.from(configJson); + const packageStart = Bun.nanoseconds(); + const packageBytes = NATIVE_PREPARED_PACKAGE + ? prepareNativePackageBytes(native, configBytes) + : null; + writeNativePackageIfRequested(packageBytes); + const packagePrepareMs = packageBytes === null ? 0 : elapsedMs(packageStart); + const artifactStart = Bun.nanoseconds(); + const artifactBytes = NATIVE_PREPARED_ARTIFACTS + ? native.prepareStaticSearchArtifactsBytes(configBytes) + : null; + const artifactPrepareMs = + artifactBytes === null ? 
0 : elapsedMs(artifactStart); + const prepare = () => { + if (packageBytes !== null) { + return native.NativePreparedSearch.fromPreparedPackageBytes(packageBytes); + } + if (artifactBytes !== null) { + return native.NativePreparedSearch.fromConfigJsonAndArtifactBytes( + configBytes, + artifactBytes, + ); + } + return new native.NativePreparedSearch(configJson); + }; const prepareStart = Bun.nanoseconds(); - const prepared = new native.NativePreparedSearch( - toNapiConfig(nativeStaticConfig), - ); + const prepared = prepare(); const prepareMs = elapsedMs(prepareStart); const prepareDiagnostics = JSON.parse(prepared.prepareDiagnosticsJson()); + const cachedPrepareMsByIteration = []; + let cachedPrepareDiagnostics = null; + for (let index = 0; index < CACHED_PREPARE_ITERATIONS; index += 1) { + const cachedPrepareStart = Bun.nanoseconds(); + const cachedPrepared = prepare(); + cachedPrepareMsByIteration.push(elapsedMs(cachedPrepareStart)); + cachedPrepareDiagnostics = JSON.parse( + cachedPrepared.prepareDiagnosticsJson(), + ); + } return { prepared, prepareDiagnostics, + cachedPrepareDiagnostics, + cachedPrepareMsByIteration, + configBytes: Buffer.byteLength(configJson, "utf8"), + artifactBytes: artifactBytes?.byteLength ?? 0, + artifactPrepareMs, + packageBytes: packageBytes?.byteLength ?? 0, + packagePrepareMs, + stringifyMs, prepareMs, }; } -function loadNativeAdapter() { - const tempDir = mkdtempSync(join(tmpdir(), "stella-anonymize-fixture-napi-")); - const napiPath = join(tempDir, "stella_anonymize_napi.node"); - copyFileSync(nativeLibraryPath("stella_anonymize_napi"), napiPath); - const loaded = createRequire(import.meta.url)(napiPath); - const NativePreparedSearch = Reflect.get( - Object(loaded), - "NativePreparedSearch", +function createNativeStaticRunnerFromJsonBytes(configBytes) { + const native = loadNativeAdapter(); + const packageStart = Bun.nanoseconds(); + const packageBytes = NATIVE_PREPARED_PACKAGE + ? 
prepareNativePackageBytes(native, configBytes) + : null; + writeNativePackageIfRequested(packageBytes); + const packagePrepareMs = packageBytes === null ? 0 : elapsedMs(packageStart); + const artifactStart = Bun.nanoseconds(); + const artifactBytes = NATIVE_PREPARED_ARTIFACTS + ? native.prepareStaticSearchArtifactsBytes(configBytes) + : null; + const artifactPrepareMs = + artifactBytes === null ? 0 : elapsedMs(artifactStart); + const prepare = (bytes) => { + if (packageBytes !== null) { + return native.NativePreparedSearch.fromPreparedPackageBytes(packageBytes); + } + if (artifactBytes !== null) { + return native.NativePreparedSearch.fromConfigJsonAndArtifactBytes( + bytes, + artifactBytes, + ); + } + const factory = Reflect.get( + native.NativePreparedSearch, + "fromConfigJsonBytes", + ); + if (typeof factory === "function") { + return factory.call(native.NativePreparedSearch, bytes); + } + return new native.NativePreparedSearch(bytes.toString("utf8")); + }; + const prepareStart = Bun.nanoseconds(); + const prepared = prepare(configBytes); + const prepareMs = elapsedMs(prepareStart); + const prepareDiagnostics = JSON.parse(prepared.prepareDiagnosticsJson()); + const cachedPrepareMsByIteration = []; + let cachedPrepareDiagnostics = null; + for (let index = 0; index < CACHED_PREPARE_ITERATIONS; index += 1) { + const cachedPrepareStart = Bun.nanoseconds(); + const cachedPrepared = prepare(configBytes); + cachedPrepareMsByIteration.push(elapsedMs(cachedPrepareStart)); + cachedPrepareDiagnostics = JSON.parse( + cachedPrepared.prepareDiagnosticsJson(), + ); + } + return { + prepared, + prepareDiagnostics, + cachedPrepareDiagnostics, + cachedPrepareMsByIteration, + configBytes: configBytes.byteLength, + artifactBytes: artifactBytes?.byteLength ?? 0, + artifactPrepareMs, + packageBytes: packageBytes?.byteLength ?? 
0, + packagePrepareMs, + stringifyMs: 0, + prepareMs, + }; +} + +function createNativeStaticRunnerFromPackageBytes(packageBytes) { + const native = loadNativeAdapter(); + const prepare = () => + native.NativePreparedSearch.fromPreparedPackageBytes(packageBytes); + const prepareStart = Bun.nanoseconds(); + const prepared = prepare(); + const prepareMs = elapsedMs(prepareStart); + const prepareDiagnostics = JSON.parse(prepared.prepareDiagnosticsJson()); + const cachedPrepareMsByIteration = []; + let cachedPrepareDiagnostics = null; + for (let index = 0; index < CACHED_PREPARE_ITERATIONS; index += 1) { + const cachedPrepareStart = Bun.nanoseconds(); + const cachedPrepared = prepare(); + cachedPrepareMsByIteration.push(elapsedMs(cachedPrepareStart)); + cachedPrepareDiagnostics = JSON.parse( + cachedPrepared.prepareDiagnosticsJson(), + ); + } + return { + prepared, + prepareDiagnostics, + cachedPrepareDiagnostics, + cachedPrepareMsByIteration, + configBytes: 0, + artifactBytes: 0, + artifactPrepareMs: 0, + packageBytes: packageBytes.byteLength, + packagePrepareMs: 0, + stringifyMs: 0, + prepareMs, + }; +} + +function writeNativePackageIfRequested(packageBytes) { + if (packageBytes !== null && WRITE_NATIVE_PACKAGE_PATH.length > 0) { + writeFileSync(WRITE_NATIVE_PACKAGE_PATH, packageBytes); + } +} + +function profileNativeRegexPrepare(nativeStaticConfig) { + if (!nativeStaticConfig) { + return null; + } + + const native = loadNativeAdapter(); + const regexCount = sliceLength(nativeStaticConfig.slices?.regex); + const regexMeta = nativeStaticConfig.regex_meta ?? 
[]; + const labels = [...new Set(regexMeta.map((meta) => meta.label))].sort( + (left, right) => left.localeCompare(right), ); - if (typeof NativePreparedSearch !== "function") { - throw new TypeError("Native anonymize adapter exports are incomplete"); + const labelCounts = Object.fromEntries( + labels.map((label) => [ + label, + regexMeta.filter((meta) => meta.label === label).length, + ]), + ); + + return { + regexCount, + labelCounts, + only: labels.map((label) => + measureNativeConfigPrepare( + native.NativePreparedSearch, + `only:${label}`, + nativeRegexOnlyConfig(nativeStaticConfig, new Set([label])), + ), + ), + without: labels.map((label) => + measureNativeConfigPrepare( + native.NativePreparedSearch, + `without:${label}`, + nativeConfigWithRegexLabels( + nativeStaticConfig, + new Set([label]), + false, + ), + ), + ), + withoutHotGroups: measureNativeConfigPrepare( + native.NativePreparedSearch, + "without:date+monetary amount", + nativeConfigWithRegexLabels( + nativeStaticConfig, + new Set(["date", "monetary amount"]), + false, + ), + ), + }; +} + +async function profileScopedNativePrepare({ + sourceRoot, + variant, + baseConfig, + fixtures, +}) { + const module = await importSource( + sourceRoot, + "packages/anonymize/src/build-unified-search.ts", + variant, + ); + const buildNativeStaticSearchBundle = Reflect.get( + Object(module), + "buildNativeStaticSearchBundle", + ); + if (typeof buildNativeStaticSearchBundle !== "function") { + throw new TypeError("Native static search bundle builder is unavailable"); + } + const contextModule = await importSource( + sourceRoot, + "packages/anonymize/src/context.ts", + `${variant}:scoped-prepare`, + ); + const createPipelineContext = Reflect.get( + Object(contextModule), + "createPipelineContext", + ); + if (typeof createPipelineContext !== "function") { + throw new TypeError("Pipeline context factory is unavailable"); + } + + const native = loadNativeAdapter(); + const languages = [ + ...new 
Set(fixtures.map((fixture) => fixtureLanguage(fixture))), + ].sort((left, right) => left.localeCompare(right)); + + const scopes = []; + for (const language of languages) { + const scopedConfig = applyFixtureLanguageScope(baseConfig, language); + const buildStart = Bun.nanoseconds(); + const bundle = await buildNativeStaticSearchBundle( + scopedConfig, + [], + createPipelineContext(), + ); + const buildMs = elapsedMs(buildStart); + const prepare = measureNativeConfigPrepare( + native.NativePreparedSearch, + language, + bundle.nativeStaticConfig, + ); + scopes.push({ + language, + scope: fixtureLanguageScope(language), + buildMs, + ...prepare, + }); } - return { NativePreparedSearch }; + + return scopes; } -function toNapiConfig(config) { +function nativeConfigWithRegexLabels(config, labels, keepMatching) { + const regexMeta = config.regex_meta ?? []; + const regexPatterns = []; + const nextMeta = []; + for (const [index, meta] of regexMeta.entries()) { + const matches = labels.has(meta.label); + if (matches !== keepMatching) { + continue; + } + regexPatterns.push(config.regex_patterns[index]); + nextMeta.push(meta); + } + + const oldRegexCount = regexMeta.length; + const tail = config.regex_patterns.slice(oldRegexCount); + const nextRegexCount = regexPatterns.length; + const legalFormCount = sliceLength(config.slices?.legal_forms); + const triggerCount = sliceLength(config.slices?.triggers); + return { - regexPatterns: config.regex_patterns.map(toNapiPattern), - customRegexPatterns: config.custom_regex_patterns.map(toNapiPattern), - literalPatterns: config.literal_patterns.map(toNapiPattern), - regexOptions: toNapiOptions(config.regex_options), - customRegexOptions: toNapiOptions(config.custom_regex_options), - literalOptions: toNapiOptions(config.literal_options), + ...config, + regex_patterns: [...regexPatterns, ...tail], + regex_meta: nextMeta, slices: { - regex: config.slices.regex, - customRegex: config.slices.custom_regex, - legalForms: 
config.slices.legal_forms, - triggers: config.slices.triggers, - denyList: config.slices.deny_list, - streetTypes: config.slices.street_types, - gazetteer: config.slices.gazetteer, - countries: config.slices.countries, + ...config.slices, + regex: { start: 0, end: nextRegexCount }, + legal_forms: { + start: nextRegexCount, + end: nextRegexCount + legalFormCount, + }, + triggers: { + start: nextRegexCount + legalFormCount, + end: nextRegexCount + legalFormCount + triggerCount, + }, + }, + }; +} + +function nativeRegexOnlyConfig(config, labels) { + const regexMeta = config.regex_meta ?? []; + const regexPatterns = []; + const nextMeta = []; + for (const [index, meta] of regexMeta.entries()) { + if (!labels.has(meta.label)) { + continue; + } + regexPatterns.push(config.regex_patterns[index]); + nextMeta.push(meta); + } + + return { + ...config, + regex_patterns: regexPatterns, + regex_meta: nextMeta, + literal_patterns: [], + literal_patterns_from_deny_list_data: false, + deny_list_data: undefined, + gazetteer_data: undefined, + country_data: undefined, + trigger_data: undefined, + legal_form_data: undefined, + slices: { + regex: { start: 0, end: regexPatterns.length }, + custom_regex: { start: 0, end: 0 }, + legal_forms: { + start: regexPatterns.length, + end: regexPatterns.length, + }, + triggers: { + start: regexPatterns.length, + end: regexPatterns.length, + }, + deny_list: { start: 0, end: 0 }, + street_types: { start: 0, end: 0 }, + gazetteer: { start: 0, end: 0 }, + countries: { start: 0, end: 0 }, }, - regexMeta: config.regex_meta.map(toNapiRegexMeta), - customRegexMeta: config.custom_regex_meta.map(toNapiRegexMeta), - denyListData: - config.deny_list_data === undefined - ? undefined - : { - labels: config.deny_list_data.labels, - customLabels: config.deny_list_data.custom_labels, - originals: config.deny_list_data.originals, - sources: config.deny_list_data.sources, - filters: - config.deny_list_data.filters === undefined - ? 
undefined - : toNapiDenyListFilters(config.deny_list_data.filters), - }, - gazetteerData: - config.gazetteer_data === undefined - ? undefined - : { - labels: config.gazetteer_data.labels, - isFuzzy: config.gazetteer_data.is_fuzzy, - }, - countryData: config.country_data, }; } -function toNapiPattern(pattern) { +function measureNativeConfigPrepare(NativePreparedSearch, name, config) { + const stringifyStart = Bun.nanoseconds(); + const configJson = JSON.stringify(config); + const stringifyMs = elapsedMs(stringifyStart); + const prepareStart = Bun.nanoseconds(); + const prepared = new NativePreparedSearch(configJson); + const prepareMs = elapsedMs(prepareStart); + const diagnostics = JSON.parse(prepared.prepareDiagnosticsJson()); + const stages = diagnosticStageSummaries(diagnostics.events); + return { - kind: pattern.kind, - pattern: pattern.pattern, - distance: pattern.distance, - caseInsensitive: pattern.case_insensitive, - wholeWords: pattern.whole_words, - lazy: pattern.lazy, - prefilterAny: pattern.prefilter_any, - prefilterCaseInsensitive: pattern.prefilter_case_insensitive, - prefilterRegex: pattern.prefilter_regex, + name, + configBytes: Buffer.byteLength(configJson, "utf8"), + sliceLengths: nativeSliceLengths(config, {}), + stringifyMs, + prepareMs, + topStages: topDiagnosticStages(stages).slice(0, 8), }; } -function toNapiOptions(options) { - if (options === undefined) { - return undefined; +function fixtureLanguage(fixturePath) { + return relative(FIXTURES_DIR, fixturePath).split(/[\\/]/)[0] ?? 
"und"; +} + +function applyFixtureLanguageScope(config, language) { + return { + ...config, + ...fixtureLanguageScope(language), + }; +} + +function fixtureLanguageScope(language) { + switch (language) { + case "cs": + return { + denyListCountries: ["CZ", "SK"], + nameCorpusLanguages: ["cs", "sk"], + }; + case "de": + return { + denyListCountries: ["DE", "AT", "CH"], + nameCorpusLanguages: ["de"], + }; + case "en": + return { + denyListCountries: ["US", "GB", "CA", "AU", "IE"], + nameCorpusLanguages: ["en"], + }; + default: + return {}; + } +} + +function contentLanguageScope() { + if (CONTENT_LANGUAGE.length === 0) { + return {}; } + return { - literalCaseInsensitive: options.literal_case_insensitive, - literalWholeWords: options.literal_whole_words, - regexWholeWords: options.regex_whole_words, - fuzzyCaseInsensitive: options.fuzzy_case_insensitive, - fuzzyWholeWords: options.fuzzy_whole_words, - fuzzyNormalizeDiacritics: options.fuzzy_normalize_diacritics, + language: CONTENT_LANGUAGE, + ...fixtureLanguageScope(CONTENT_LANGUAGE), }; } -function toNapiRegexMeta(meta) { +function applyUserDataScenario(config) { + switch (USER_DATA_SCENARIO) { + case "none": + return config; + case "sample": + return withUserDataOverlay(config, { + denyListCount: 50, + regexCount: 5, + }); + case "heavy": + return withUserDataOverlay(config, { + denyListCount: 5_000, + regexCount: 50, + }); + default: + throw new Error( + `ANONYMIZE_MIGRATION_USER_DATA_SCENARIO must be none, sample, or heavy; got ${USER_DATA_SCENARIO}`, + ); + } +} + +function withUserDataOverlay(config, { denyListCount, regexCount }) { return { - label: meta.label, - score: meta.score, - sourceDetail: meta.source_detail, - requiresValidation: meta.requires_validation, - minByteLength: meta.min_byte_length, + ...config, + customDenyList: [ + ...(config.customDenyList ?? []), + ...generatedCustomDenyList(denyListCount), + ], + customRegexes: [ + ...(config.customRegexes ?? 
[]), + ...generatedCustomRegexes(regexCount), + ], }; } -function toNapiDenyListFilters(filters) { +function generatedCustomDenyList(count) { + return Array.from({ length: count }, (_, index) => ({ + value: `CustomerPrivateTerm${index.toString().padStart(5, "0")}`, + label: index % 2 === 0 ? "organization" : "person", + variants: [`Customer Private Term ${index.toString().padStart(5, "0")}`], + })); +} + +function generatedCustomRegexes(count) { + return Array.from({ length: count }, (_, index) => ({ + pattern: `USR-${index.toString().padStart(4, "0")}-[A-Z]{2}\\d{4}`, + label: + index % 2 === 0 ? "registration number" : "tax identification number", + score: 0.92, + })); +} + +function loadNativeAdapter() { + const tempDir = mkdtempSync(join(tmpdir(), "stella-anonymize-fixture-napi-")); + const napiPath = join(tempDir, "stella_anonymize_napi.node"); + copyFileSync(nativeLibraryPath("stella_anonymize_napi"), napiPath); + const loaded = createRequire(import.meta.url)(napiPath); + const NativePreparedSearch = Reflect.get( + Object(loaded), + "NativePreparedSearch", + ); + const prepareStaticSearchArtifactsBytes = Reflect.get( + Object(loaded), + "prepareStaticSearchArtifactsBytes", + ); + const prepareStaticSearchPackageBytes = Reflect.get( + Object(loaded), + "prepareStaticSearchPackageBytes", + ); + const prepareStaticSearchCompressedPackageBytes = Reflect.get( + Object(loaded), + "prepareStaticSearchCompressedPackageBytes", + ); + if ( + typeof NativePreparedSearch !== "function" || + typeof prepareStaticSearchArtifactsBytes !== "function" || + typeof prepareStaticSearchPackageBytes !== "function" || + typeof prepareStaticSearchCompressedPackageBytes !== "function" + ) { + throw new TypeError("Native anonymize adapter exports are incomplete"); + } return { - stopwords: filters.stopwords, - allowList: filters.allow_list, - personStopwords: filters.person_stopwords, - addressStopwords: filters.address_stopwords, - streetTypes: filters.street_types, - firstNames: 
filters.first_names, - genericRoles: filters.generic_roles, - sentenceStarters: filters.sentence_starters, - trailingAddressWordExclusions: filters.trailing_address_word_exclusions, - definedTermCues: filters.defined_term_cues, + NativePreparedSearch, + prepareStaticSearchArtifactsBytes, + prepareStaticSearchPackageBytes, + prepareStaticSearchCompressedPackageBytes, }; } +function prepareNativePackageBytes(native, configBytes) { + if (NATIVE_COMPRESSED_PACKAGE) { + return native.prepareStaticSearchCompressedPackageBytes(configBytes); + } + return native.prepareStaticSearchPackageBytes(configBytes); +} + +function isCompressedNativePackage(packageBytes) { + return packageBytes.subarray(0, 8).toString("ascii") === "ANONPKZ1"; +} + function nativeLibraryPath(name) { if (process.platform === "darwin") { return join(ROOT_DIR, "target", "release", `lib${name}.dylib`); @@ -1062,6 +2032,17 @@ function positiveIntegerEnv(name, fallback) { return value; } +function stringListEnv(name) { + const raw = process.env[name]?.trim(); + if (!raw) { + return []; + } + return raw + .split(",") + .map((part) => part.trim()) + .filter((part) => part.length > 0); +} + function requiredEnv(name) { const value = process.env[name]; if (value === undefined || value === "") { diff --git a/packages/anonymize/scripts/native-adapter-perf.mjs b/packages/anonymize/scripts/native-adapter-perf.mjs index ff941104..38b0ccbd 100644 --- a/packages/anonymize/scripts/native-adapter-perf.mjs +++ b/packages/anonymize/scripts/native-adapter-perf.mjs @@ -157,9 +157,7 @@ const rustSummary = JSON.parse(rustOutput); printSummary("rust-core", rustSummary, cases.length, ITERATIONS); const tsPrepareStart = Bun.nanoseconds(); -const prepared = new native.NativePreparedSearch( - toNapiConfig(JSON.parse(configJson)), -); +const prepared = new native.NativePreparedSearch(configJson); const tsPrepareMs = elapsedMs(tsPrepareStart); const tsStart = Bun.nanoseconds(); for (let iteration = 0; iteration < ITERATIONS; 
iteration += 1) { @@ -212,103 +210,6 @@ function buildCases() { return fixtureCases; } -function toNapiConfig(config) { - return { - regexPatterns: config.regex_patterns.map(toNapiPattern), - customRegexPatterns: config.custom_regex_patterns.map(toNapiPattern), - literalPatterns: config.literal_patterns.map(toNapiPattern), - regexOptions: toNapiOptions(config.regex_options), - customRegexOptions: toNapiOptions(config.custom_regex_options), - literalOptions: toNapiOptions(config.literal_options), - slices: { - regex: config.slices.regex, - customRegex: config.slices.custom_regex, - legalForms: config.slices.legal_forms, - triggers: config.slices.triggers, - denyList: config.slices.deny_list, - streetTypes: config.slices.street_types, - gazetteer: config.slices.gazetteer, - countries: config.slices.countries, - }, - regexMeta: config.regex_meta.map(toNapiRegexMeta), - customRegexMeta: config.custom_regex_meta.map(toNapiRegexMeta), - denyListData: - config.deny_list_data === undefined - ? undefined - : { - labels: config.deny_list_data.labels, - customLabels: config.deny_list_data.custom_labels, - originals: config.deny_list_data.originals, - sources: config.deny_list_data.sources, - filters: - config.deny_list_data.filters === undefined - ? undefined - : toNapiDenyListFilters(config.deny_list_data.filters), - }, - gazetteerData: - config.gazetteer_data === undefined - ? 
undefined - : { - labels: config.gazetteer_data.labels, - isFuzzy: config.gazetteer_data.is_fuzzy, - }, - countryData: config.country_data, - }; -} - -function toNapiPattern(pattern) { - return { - kind: pattern.kind, - pattern: pattern.pattern, - distance: pattern.distance, - caseInsensitive: pattern.case_insensitive, - wholeWords: pattern.whole_words, - lazy: pattern.lazy, - prefilterAny: pattern.prefilter_any, - prefilterCaseInsensitive: pattern.prefilter_case_insensitive, - prefilterRegex: pattern.prefilter_regex, - }; -} - -function toNapiOptions(options) { - if (options === undefined) { - return undefined; - } - return { - literalCaseInsensitive: options.literal_case_insensitive, - literalWholeWords: options.literal_whole_words, - regexWholeWords: options.regex_whole_words, - fuzzyCaseInsensitive: options.fuzzy_case_insensitive, - fuzzyWholeWords: options.fuzzy_whole_words, - fuzzyNormalizeDiacritics: options.fuzzy_normalize_diacritics, - }; -} - -function toNapiRegexMeta(meta) { - return { - label: meta.label, - score: meta.score, - sourceDetail: meta.source_detail, - requiresValidation: meta.requires_validation, - minByteLength: meta.min_byte_length, - }; -} - -function toNapiDenyListFilters(filters) { - return { - stopwords: filters.stopwords, - allowList: filters.allow_list, - personStopwords: filters.person_stopwords, - addressStopwords: filters.address_stopwords, - streetTypes: filters.street_types, - firstNames: filters.first_names, - genericRoles: filters.generic_roles, - sentenceStarters: filters.sentence_starters, - trailingAddressWordExclusions: filters.trailing_address_word_exclusions, - definedTermCues: filters.defined_term_cues, - }; -} - function nativeLibraryPath(name) { if (process.platform === "darwin") { return join(ROOT_DIR, "target", "release", `lib${name}.dylib`); diff --git a/packages/anonymize/src/__test__/countries.test.ts b/packages/anonymize/src/__test__/countries.test.ts index 187cddd1..fc9b01cb 100644 --- 
a/packages/anonymize/src/__test__/countries.test.ts +++ b/packages/anonymize/src/__test__/countries.test.ts @@ -193,6 +193,21 @@ describe("country detector", () => { expect(found).not.toContain("America"); }); + test("ambiguous short territory surfaces do not block city-state addresses", async () => { + const cityState = "Any arbitration shall take place in Norfolk, Virginia."; + const cityStateEntities = await detect(cityState); + expect(countries(cityStateEntities)).not.toContain("Norfolk"); + expect( + cityStateEntities.some( + (entity) => + entity.label === "address" && entity.text === "Norfolk, Virginia", + ), + ).toBe(true); + + const fullCountry = "The court is located on Norfolk Island."; + expect(countries(await detect(fullCountry))).toContain("Norfolk Island"); + }); + test("country token contained in a person span loses to the person", async () => { // "Chad", "Georgia", "Jordan" are first names AND // countries. When a longer person span contains the diff --git a/packages/anonymize/src/__test__/fixtures/contracts/cs/nakit-legal-services-framework.snapshot.json b/packages/anonymize/src/__test__/fixtures/contracts/cs/nakit-legal-services-framework.snapshot.json index 938969d7..4b2e3172 100644 --- a/packages/anonymize/src/__test__/fixtures/contracts/cs/nakit-legal-services-framework.snapshot.json +++ b/packages/anonymize/src/__test__/fixtures/contracts/cs/nakit-legal-services-framework.snapshot.json @@ -111,9 +111,9 @@ }, { "start": 12008, - "end": 12077, + "end": 12029, "label": "address", - "text": "Kodaňská 46, Praha 10, nebude-li v konkrétním případě dohodnuto jinak", + "text": "Kodaňská 46, Praha 10", "source": "regex" }, { @@ -229,5 +229,5 @@ "source": "trigger" } ], - "redactedText": "RÁMCOVÁ DOHODA NA POSKYTOVÁNÍ PRÁVNÍCH SLUŽEB\n\nČíslo 2026/051 NAKIT\n\n\n\nSmluvní strany\n\n\n\n[ORGANIZATION_1]\n\nse sídlem \t[ADDRESS_1]\n\nIČO: \t[REGISTRATION_NUMBER_1] \n\nDIČ: \t [TAX_IDENTIFICATION_NUMBER_1]\n\nzastoupen: \txxx\n\nzapsán v obchodním 
rejstříku vedeném [ORGANIZATION_2], [REGISTRATION_NUMBER_2]\n\nbankovní spojení \txxx\n\n\tč. ú. xxx\n\n(dále jen „Objednatel“)\n\n\n\na\n\n[PERSON_1]\n\nse sídlem [ADDRESS_2]\n\nIČO: [REGISTRATION_NUMBER_3]\n\nDIČ: [TAX_IDENTIFICATION_NUMBER_2] \n\nbankovní spojení xxx\n\n č. ú. xxx\n\n \n\n (dále jen „Poskytovatel“)\n\n\n\n(Objednatel a Poskytovatel budou v této rámcové dohodě na poskytování právních služeb označováni jednotlivě jako „Smluvní strana“ a společně jako „Smluvní strany“ a tato rámcová dohoda jako „Smlouva“),\n\n\n\nuzavírají v souladu s ustanovením § 1746 odst. 2 zákona č. 89/2012 Sb., občanský zákoník, v platném znění (dále jen „Občanský zákoník“) a v souladu s ustanovením § 29 písm. k) bod 1. a 2. zákona č. 134/2016 Sb., o zadávání veřejných zakázek, ve znění pozdějších předpisů (dále jen „Zákon o zadávání veřejných zakázek“), jakož i v souladu se zákonem č. 85/1996 Sb., o advokacii, ve znění pozdějších předpisů (dále jen „Zákon o advokacii“) tuto Smlouvu. \n\n\n\n\n\n\n\n\n\nÚčel a předmět Smlouvy\n\nÚčelem této Smlouvy je stanovení podmínek a právního rámce pro uzavírání Dílčích smluv (jak je tento pojem definován níže v odst. 1.8 Smlouvy) mezi Objednatelem a Poskytovatelem na poskytování právních služeb, a to na základě písemných Objednávek Objednatele.\n\nPředmětem této Smlouvy je stanovení práv a povinností Smluvních stran pro postup při uzavírání Dílčích smluv a následném poskytování právních služeb Poskytovatelem Objednateli, přičemž poskytováním právních služeb se pro účely této Smlouvy rozumí poskytování právních služeb ve smyslu § 29 odst. 1 písm. k) bod 1. a 2. Zákona o zadávání veřejných zakázek (dále jen „Služby“).\n\nPoskytovatel se zavazuje poskytnout Objednateli Služby na základě Dílčí smlouvy. Služby poskytované Poskytovatelem Objednateli na základě konkrétní Dílčí smlouvy budou dále nazývány jako „Plnění“. 
Dílčí smlouvy budou uzavírány níže uvedeným postupem, na základě písemné Objednávky Objednatele doručené Poskytovateli (dále jen „Objednávka“). Objednávka musí obsahovat minimálně tyto náležitosti:\n\nidentifikační údaje Poskytovatele a Objednatele;\n\nčíslo a datum vystavení Objednávky;\n\nčíslo Smlouvy;\n\nrámcové vymezení Plnění;\n\nmaximální rozsah a maximální cenu Plnění; a\n\npodpis oprávněné osoby Objednatele.\n\nObjednatel je oprávněn, avšak nikoli povinen, vystavovat dle svého uvážení Objednávky ode dne nabytí účinnosti této Smlouvy. Každá takto vystavená Objednávka se považuje za návrh na uzavření Dílčí smlouvy za podmínek stanovených touto Smlouvou. Poskytovatel je povinen písemně potvrdit Objednávku ve lhůtě dvou (2) pracovních dnů od jejího doručení Poskytovateli.\n\nPotvrzení Objednávky musí obsahovat minimálně tyto náležitosti: \n\nidentifikační údaje Objednatele a Poskytovatele; \n\nčíslo Objednávky, která je potvrzována; a \n\npodpis oprávněné osoby Poskytovatele.\n\nV případě, že Objednávka nebude splňovat uvedené minimální náležitosti, má Poskytovatel povinnost na tuto skutečnost neprodleně upozornit Objednatele. Objednatel je poté povinen vystavit novou Objednávku a Poskytovatel je povinen ji ve lhůtě dvou (2) pracovních dnů od jejího doručení písemně potvrdit. Není-li v článku 4 Smlouvy stanoveno jinak, běží lhůta pro poskytnutí Plnění dle příslušné Dílčí smlouvy od okamžiku doručení této nové Objednávky. \n\nPotvrzení Objednávky, které obsahuje dodatky, výhrady, omezení nebo jiné změny se považuje za odmítnutí Objednávky a tvoří nový návrh Poskytovatele na uzavření Dílčí smlouvy, a to i v případě takového dodatku, výhrady, omezení nebo jiné změny, které podstatně nemění podmínky Objednávky. Dílčí smlouva je v takovém případě uzavřena pouze tehdy, pokud tento nový návrh Objednatel písemně potvrdí a doručí zpět Poskytovateli. 
\n\nDoručením potvrzení Objednávky Objednateli dojde k uzavření smlouvy o poskytnutí služeb, přičemž práva a povinnosti Smluvních stran dle této smlouvy o poskytnutí služeb odpovídají v celém rozsahu právům a povinnostem Objednatele a Poskytovatele stanovených touto Smlouvou (dále jen „Dílčí smlouva“).\n\nPočet Objednávek vystavených Objednatelem není omezený. Současně platí, že Objednatel není povinen Objednávku vystavit.\n\nPoskytovatel se zavazuje poskytnout Objednateli Plnění za podmínek uvedených v této Smlouvě a v Dílčí smlouvě ve sjednaném rozsahu, jakosti a čase. \n\nObjednatel se zavazuje zaplatit za Plnění poskytnuté v souladu s touto Smlouvou a Dílčí smlouvou Cenu dle článku 2 této Smlouvy.\n\nObjednatel při uzavírání této Smlouvy negarantuje žádný minimální objem plnění, který bude zadán v průběhu její platnosti. Objednatel uzpůsobuje rozsah poptávaného plnění svým aktuálním potřebám, které jsou v čase proměnlivé. Poskytovatel se přes výše uvedené zavazuje být připraven poskytnout plnění v rozsahu poptávaném Objednatelem dle podmínek této Smlouvy. \n\nSmluvní strany sjednávají, že k poskytnutí konkrétního Plnění (resp. jeho relevantní části) na základě Dílčí smlouvy je Poskytovatel povinen na základě, v rozsahu a v souladu s požadavky a/nebo pokyny Objednatele, které budou činěny prostřednictvím e-mailové komunikace kontaktní osobou Objednatele uvedenou v čl. 13 odst. 13.11 písm. a) Smlouvy nebo jí pověřenou osobou. V e-mailu podle přechozí věty Objednatel uvede specifikaci konkrétního požadavku (včetně případného požadavku na výstup) a/nebo pokynu. Hovoří-li se v této Smlouvě o Plnění, rozumí se jím i jeho relevantní část, poskytnutá Objednateli na základě konkrétního požadavku a/nebo pokynu dle tohoto odstavce Smlouvy.\n\nKaždá Dílčí smlouva nabývá platnosti dnem uzavření. Dílčí smlouva nabývá účinnosti dnem uzavření, nevztahuje-li se na ni povinnost zveřejnění v registru smluv podle zákona č. 
340/2015 Sb., o zvláštních podmínkách účinnosti některých smluv, uveřejňování těchto smluv a o registru smluv (zákon o registru smluv) ve znění pozdějších předpisů (dále jen „Zákon o registru smluv“). Vztahuje-li se na příslušnou Dílčí smlouvu povinnost jejího zveřejnění v registru smluv, nabývá Dílčí smlouva účinnosti dnem zveřejnění v registru smluv, přičemž zveřejnění Dílčí smlouvy v registru smluv zajistí Objednatel. V Dílčí smlouvě může být výslovně uvedeno pozdější datum nabytí účinnosti než dnem jejího uzavření/zveřejnění v registru smluv (dle relevance).\n\nCena\n\nCena za poskytnutí Plnění Poskytovatelem odpovídá součinu skutečného časového rozsahu poskytnutého Plnění a hodinové sazby dle Přílohy č. 1 této Smlouvy na základě konkrétní Dílčí smlouvy (dále jen „Cena“). Nejnižší časová jednotka odpracovaného času, za kterou náleží Poskytovateli odměna za poskytnuté Plnění, je jedna (1) hodina.\n\nSkutečný časový rozsah Plnění je limitován odhadovaným maximálním časovým rozsahem Plnění uvedeným v Dílčí smlouvě. Skutečný časový rozsah Plnění bude Poskytovatelem Objednateli dokladován v rámci akceptační procedury dle článku 6 Smlouvy, jejíž průběh bude stvrzen Smluvními stranami podpisem Akceptačního protokolu, jehož vzor tvoří Přílohu č. 2 této Smlouvy a je její nedílnou součástí. \n\nObjednatel si vyhrazuje právo uznat v rámci fakturace pouze takový časový rozsah Plnění, který byl na poskytování Plnění účelně vynaložen. \n\nCena každé jednotlivé složky Plnění zahrnuje veškeré náklady Poskytovatele spojené s plněním Smlouvy, Dílčí smlouvy a poskytnutím Plnění Objednateli, vyjma pravomocně přiznané odměny za zastupování v soudním řízení, která připadá Poskytovateli. Tato Cena je cenou konečnou.\n\nCelková cena Plnění poskytnutého na základě této Smlouvy a Dílčích smluv nesmí převýšit částku [MONETARY_AMOUNT_1] bez DPH. 
DPH bude připočítána k ceně v souladu s platnými právními předpisy ke dni uskutečnění zdanitelného plnění.\n\nPlatební podmínky\n\nDaňové doklady za poskytování Plnění budou Poskytovatelem vystavovány vždy k poslednímu dni příslušného kalendářního měsíce, ve kterém bylo Plnění poskytováno, a bude v nich vyúčtováno Plnění poskytnuté Objednateli bez jakýchkoli vad v příslušném kalendářním měsíci. Za den uskutečnění zdanitelného plnění se považuje den podpisu Akceptačního protokolu Objednatelem.\n\nDaňový doklad (faktura) musí obsahovat náležitosti řádného daňového dokladu podle příslušných právních předpisů, zejména podle § 29 zákona č. 235/2004 Sb., o dani z přidané hodnoty, ve znění pozdějších předpisů (dále jen „Zákon o DPH“), dle zákona č. 563/1991 Sb., o účetnictví, ve znění pozdějších předpisů, dle § 435 Občanského zákoníku a níže uvedené údaje:\n\nčíslo Smlouvy a Dílčí smlouvy (Objednávky),\n\nplatební podmínky v souladu se Smlouvou a Dílčí smlouvou,\n\nmísto a datum předání a převzetí Plnění,\n\npopis fakturovaného Plnění, rozsah, jednotkovou a celkovou cenu,\n\npřílohou je kopie Akceptačního protokolu s výrokem „Akceptováno“, odsouhlaseného a potvrzeného Objednatelem.\n\nSplatnost daňového dokladu (faktury) vystaveného Poskytovatelem je třicet (30) kalendářních dní ode dne jeho doručení Objednateli. \n\nPoskytovatel zašle daňový doklad spolu s veškerými požadovanými dokumenty Objednateli nejpozději do pěti (5) kalendářních dnů od podpisu Akceptačního protokolu, jedním z následujících způsobů: \n\nbuď v elektronické podobě na adresu:\n\nxxx\n\nnebo doporučeným dopisem na následující adresu: \n\n[ORGANIZATION_1]\n\n[ADDRESS_1].\n\nV případě, že faktura nebude obsahovat stanovené náležitosti, přílohy nebo nebude vystavena v souladu s touto Smlouvou, je Objednatel oprávněn vrátit ji ve lhůtě splatnosti Poskytovateli k doplnění či opravě, aniž se tím dostane do prodlení. 
Lhůta splatnosti v délce třicet (30) kalendářních dní počíná běžet znovu ode dne doručení náležitě doplněné či opravené faktury Objednateli.\n\nPlatba bude provedena v české měně formou bankovního převodu na účet Poskytovatele uvedený v záhlaví této Smlouvy. Cena se považuje za uhrazenou dnem odepsání fakturované částky z účtu Objednatele ve prospěch účtu Poskytovatele.\n\nObjednatel neposkytuje Poskytovateli jakékoliv zálohy na cenu za Služby / Plnění.\n\nSmluvní strany se dohodly, že pokud bude v okamžiku uskutečnění zdanitelného plnění správcem daně zveřejněna způsobem umožňujícím dálkový přístup skutečnost, že poskytovatel zdanitelného plnění (Poskytovatel) je nespolehlivým plátcem ve smyslu ust. § 106a Zákona o DPH nebo že úplata za toto plnění má být poskytnuta zcela nebo zčásti bezhotovostním převodem na jiný účet než účet Poskytovatele, který je správcem daně zveřejněn způsobem umožňujícím dálkový přístup ve smyslu ust. § 96 Zákona o DPH, je příjemce zdanitelného plnění (Objednatel) oprávněn část ceny odpovídající dani z přidané hodnoty zaplatit přímo na bankovní účet správce daně ve smyslu ust. § 109a Zákona o DPH. Na bankovní účet Poskytovatele bude v tomto případě uhrazena část ceny odpovídající výši základu daně z přidané hodnoty. Úhrada ceny plnění (základu daně) provedená Objednatelem v souladu s ustanovením tohoto odstavce bude považována za řádnou úhradu ceny plnění poskytnutého dle Smlouvy.\n\nDoba, místo a podmínky plnění\n\nPoskytovatel je povinen poskytnout Objednateli Plnění a předat Objednateli výstup/y nejdéle do pěti (5) kalendářních dnů ode dne doručení požadavku a/nebo pokynu ve smyslu čl. 1 odst. 1.13 Smlouvy, nedohodnou-li se Smluvní strany písemně (např. e-mailem) na jiném termínu poskytnutí Plnění, nebo nevyplývá-li jiný čas poskytnutí Plnění z platných právních předpisů nebo z požadavku či výzvy příslušného orgánu. \n\nPoskytovatel se zavazuje poskytovat Služby dle této Smlouvy na celém území [COUNTRY_1]. 
Místem předání veškerých výstupů dle této Smlouvy je [ADDRESS_3]. \n\nDalší práva a povinnosti Smluvních stran\n\nPoskytovatel je povinen postupovat při poskytování Služeb / Plnění s odbornou péčí podle svých nejlepších odborných znalostí a schopností, v souladu s právním řádem [COUNTRY_1] a se Smlouvou, přičemž je při své činnosti povinen sledovat a chránit zájmy a dobré jméno Objednatele a postupovat v souladu s jeho aktuálními potřebami a pokyny. V případě nevhodných pokynů Objednatele je Poskytovatel povinen na nevhodnost těchto pokynů Objednatele písemně upozornit, v opačném případě nese Poskytovatel zejména odpovědnost za vady a za škodu, které v důsledku nevhodných pokynů Objednatele Poskytovateli a/nebo třetím osobám vznikly.\n\nPoskytovatel je dále povinen bezodkladně oznámit Objednateli všechny okolnosti, o kterých se při poskytování Služeb / Plnění dozví, a které by mohly mít vliv na změnu pokynů Objednatele nebo na poskytování Služeb / Plnění dle této Smlouvy a Dílčí smlouvy. \n\nPoskytovatel je povinen informovat Objednatele na jeho žádost o průběhu plnění předmětu Smlouvy, resp. Dílčí smlouvy, a akceptovat jeho doplňující pokyny a připomínky k poskytovanému Plnění. V případě, že Objednatel zjistí v průběhu plnění předmětu Smlouvy, resp. Dílčí smlouvy, nedostatky, Poskytovatel je povinen na písemnou výzvu Objednatele tyto nedostatky odstranit bez nároku na navýšení ceny poskytovaného Plnění bezodkladně, nejdéle však do pěti (5) pracovních dnů ode dne obdržení výzvy.\n\nObjednatel poskytne Poskytovateli k plnění požadovaného Plnění:\n\nveškerou jemu dostupnou dokumentaci;\n\nb)\tpravdivé a včasné informace potřebné k řádnému poskytování Plnění;\n\nc)\tveškerou součinnost nezbytnou pro řádné poskytování Plnění.\n\nPoskytovatel je povinen řádně pečovat o věci a dokumenty, které od Objednatele k poskytnutí požadovaného Plnění obdrží. 
\n\nSmluvní strany se zavazují vzájemně se písemně informovat o případných změnách sídla, právní formy, změně bankovního spojení, zrušení registrace k DPH, a dalších významných skutečností rozhodných pro plnění ze Smlouvy, resp. Dílčí smlouvy, a to bezodkladně po uskutečnění takovéto změny. \n\nPoskytovatel je povinen neprodleně informovat Objednatele o kybernetických bezpečnostních incidentech (dále jen „KBI“) na straně Poskytovatele souvisejících s plněním dle Smlouvy a/nebo Dílčích smluv, které by mohly mít dopad na kybernetickou bezpečnost u Objednatele. KBI je definován v § 2 odst. 2 písm. f) zákona č. 264/2025 Sb., o kybernetické bezpečnosti. \n\nPoskytovatel poskytne Objednateli součinnost při zvládání KBI v souvislosti s poskytovaným plněním dle Smlouvy a/nebo Dílčích smluv, a bude se v této souvislosti řídit pokyny Objednatele.\n\nPoskytovatel prohlašuje, že si je vědom předpisů týkajících se mezinárodních sankcí, zejm. pak čl. 5 k nařízení Rady EU č. 833/2014 o omezujících opatřeních vzhledem k činnostem Ruska destabilizujícím situaci na Ukrajině, ve znění pozdějších předpisů a nařízení Rady EU č. 269/2014 o omezujících opatřeních vzhledem k činnostem narušujícím nebo ohrožujícím územní celistvost, svrchovanost a nezávislost Ukrajiny, ve znění pozdějších předpisů, vč. prováděcího nařízení Rady EU 2022/581 ze dne [DATE_1], ve znění pozdějších předpisů (dále jen „Předpisy o mezinárodních sankcích“). Poskytovatel prohlašuje, že u něho, jakož ani u okruhu sledovaných subjektů dle právních Předpisů o mezinárodních sankcích vztahujícího se k plnění Smlouvy a/nebo Dílčích smluv není dána překážka uzavření či plnění Smlouvy a/nebo Dílčích smluv. Dále výslovně Poskytovatel zvláště prohlašuje, že nezpřístupní žádné finanční prostředky ani hospodářské zdroje sankcionovaným subjektům ve smyslu tohoto odstavce Smlouvy. Pro vyloučení pochybností se stanoví, že: (i) prohlášení musí být v platnosti po celou dobu plnění Smlouvy, resp. 
Dílčích smluv, a (ii) jsou-li do tohoto prohlášení zahrnuti poddodavatelé či jiné třetí osoby, je Poskytovatel povinen zjistit skutečnosti vztahující se k těmto třetím osobám s řádnou péčí, přinejmenším ověřením informace u třetích osob a prověřením veřejných rejstříků a evidencí. Poskytovatel je povinen zajistit smluvně dodržování příslušných povinností a omezovat rizika vyplývajících z okolností vedoucích k mezinárodním sankcím, a zavazuje se zajistit, aby jeho prohlášení dle tohoto odstavce Smlouvy zůstala pravdivá a v platnosti po celou dobu účinnosti Smlouvy a/nebo Dílčích smluv. V případě, že Poskytovatel zjistí, že pravdivost jeho prohlášení je, byť jen ohrožena, je povinen o tom Objednatele bezodkladně písemně vyrozumět.\n\nSmluvní strany se dohodly, že pokud to bude potřebné ke splnění požadavků v oblasti kybernetické bezpečnosti stanovených obecně závaznými právními předpisy, zejména v návaznosti na nový zákon č. 264/2025 Sb., o kybernetické bezpečnosti, který nabyl účinnosti dne [DATE_2], a související prováděcí právní předpisy, uzavřou bez zbytečného odkladu po výzvě Objednatele písemný dodatek k této Smlouvě a/nebo Dílčí smlouvě zohledňující takové požadavky, a to formou úpravy či doplnění ustanovení týkajících se zajištění bezpečnostních požadavků v souladu s novou právní úpravou a implementovaným systémem řízení bezpečnosti informací na straně Objednatele a/nebo koncového zákazníka Objednatele. Náklady na bezpečnost informací v důsledku změny legislativy v oblasti bezpečnosti informací nese Poskytovatel.\n\nSchválení poskytnutého Plnění a převzetí výstupů\n\nPoskytovatel splní svou povinnost řádně poskytnout Plnění dnem, kdy je příslušná činnost řádně vykonána a její výstup v Objednatelem požadované formě (dále jen „výstup“) řádně předán Objednateli. 
Poskytovatel je povinen vypracovat písemnou zprávu, která bude obsahovat zejména údaje o Objednateli a Poskytovateli, číslo této Smlouvy a Dílčí smlouvy, obsah a rozsah poskytnutého Plnění, závěr z poskytnutého Plnění, popř. doporučení Poskytovatele pro další postup Objednatele. Výstup bude Objednateli Poskytovatelem předán v českém jazyce v dohodnutých termínech buď v listinné podobě vytištěné v jednom (1) originálu nebo v elektronické podobě ve formátu požadovaném Objednatelem. \n\nSplnění povinnosti Poskytovatele podle odstavce 6.1 Smlouvy Smluvní strany osvědčí sepsáním protokolu o schválení poskytnutého Plnění a předání a převzetí výstupu, obsahujícího soupis poskytnutého Plnění, včetně rozpisu hodin odpracovaných Poskytovatelem při plnění jednotlivých úkolů, a označení veškerých předávaných výstupů, který bude vyhotoven ve dvou (2) vyhotoveních s platností originálu a bude opatřen podpisem oprávněných osob obou Smluvních stran (dále jen „Akceptační protokol“), přičemž každá ze Smluvních stran obdrží po jednom (1) vyhotovení. Takto vyhotovený Akceptační protokol předá Poskytovatel Objednateli vždy do pěti (5) kalendářních dnů od skončení příslušného měsíce, za který se Akceptační protokol vyhotovuje. \n\nObjednatel je oprávněn odmítnout převzetí výstupu, a tedy podepsat Akceptační protokol s výrokem „Neakceptováno“, pokud Plnění nebylo poskytnuto řádně v souladu s touto Smlouvou a Dílčí smlouvou a/nebo ve sjednané kvalitě a/nebo pokud výstup neobsahoval veškeré údaje požadované Objednatelem a/nebo Objednatel nesouhlasí s počtem hodin poskytnutého Plnění, které budou Objednateli účtovány, přičemž v takovém případě Objednatel důvody odmítnutí převzetí výstupu písemně Poskytovateli sdělí, a to nejpozději do pěti (5) pracovních dnů od předání Akceptačního protokolu. Na následné předání výstupu se použijí výše uvedená ustanovení tohoto článku Smlouvy. 
Pokud Objednatel uplatní písemný nárok na odstranění vad výstupu, zavazuje se Poskytovatel tyto vady odstranit bez zbytečného odkladu, nejpozději však do pěti (5) pracovních dnů, nestanoví-li Objednatel jinak. \n\nV případě zjevných vad poskytnutého Plnění nebo jeho výstupů není Objednatel povinen Plnění schválit a výstupy převzít a do odstranění těchto vad není povinen podepsat Akceptační protokol s výrokem „Akceptováno“ a zaplatit fakturovanou cenu Plnění. \n\nPovinnost mlčenlivosti a zpracování osobních údajů\n\nSmluvní strany sjednávají, že za důvěrné informace považují takové informace, které získají od druhé Smluvní strany, a o kterých vzhledem k povaze takových informací mohly předpokládat, že na zachování jejich důvěrnosti má druhá Smluvní strana oprávněný zájem, nebo které nejsou v obchodních kruzích běžně dostupné, a o kterých vzhledem k povaze takových informací mohly předpokládat, že na zachování jejich důvěrnosti má druhá Smluvní strana oprávněný zájem, zejména pak informace, údaje a skutečnosti o jakýchkoliv obchodních, finančních, technických, právních a jiných skutečnostech, které by s ohledem na dané podmínky mohly být považovány za důvěrné, poskytnuté či jakkoliv zpřístupněné jednou ze Smluvních stran či jejími zástupci druhé Smluvní straně či jejím zástupcům, ať v ústní, písemné, grafické, elektronické či jiné formě, které se Smluvní strany dozvěděly v souvislosti se Smlouvou a/nebo Dílčí smlouvou, a to bez ohledu zda jsou nebo nejsou označené za důvěrné informace (dále jen „Důvěrné informace“). \n\nS těmito Důvěrnými informacemi budou nakládat jako s vlastním obchodním tajemstvím, aniž by bylo nutné takové informace jako Důvěrné vždy jednotlivě označovat. Výše uvedené nevylučuje možnost v jednotlivých případech při zvýšeném zájmu toto označení pro jednotlivé informace použít. 
Smluvní strany berou zároveň na vědomí, že některé z Důvěrných informací jsou také předmětem obchodního tajemství druhé Smluvní strany, chráněným dle příslušných ustanovení Občanského zákoníku. \n\nKaždá ze Smluvních stran se zavazuje vynaložit maximální úsilí, které lze spravedlivě požadovat, aby důvěrnost Důvěrných informací druhé Smluvní strany byla důsledně dodržována jejími pracovníky i osobami, které případně, v souladu s dohodou uzavřenou s druhou Smluvní stranou, k plnění účelu spolupráce použije. Použije-li některá ze Smluvních stran k plnění třetí osoby, je oprávněna zpřístupnit jí Důvěrné informace získané od druhé Smluvní strany pouze s jejím souhlasem a v rozsahu nezbytně nutném pro jí poskytované plnění, a je rovněž povinna zavázat třetí osobu povinností mlčenlivosti v rozsahu dle Smlouvy. Za porušení povinností třetí osobou odpovídá Smluvní strana, která jí Důvěrné informace zpřístupnila.\n\nSmluvní strany se dále zavazují:\n\nzachovat mlčenlivost o Důvěrných informací, a to až do doby, kdy se informace této povahy stanou obecně známými za předpokladu, že se tak nestane porušením povinnosti mlčenlivosti;\n\npoužít informace uvedené povahy pouze pro činnosti související s přípravou a plněním Smlouvy a/nebo Dílčí smlouvy, dále tyto informace nerozšiřovat ani nereprodukovat, nezpracovávat je v systémech umělé inteligence (systémech AI), nezpřístupnit je jiným osobám ani je nevyužít pro sebe či pro jinou osobu;\n\nomezit počet svých pracovníků pro styk s těmito Důvěrnými informacemi a přijmout účinná opatření pro zamezení jejich úniku, případně zabezpečit, aby i tyto osoby považovaly uvedené informace za Důvěrné a zachovávaly o nich mlčenlivost.\n\nPovinnost plnit ustanovení dle čl. 7 odst. 
7.1 až 7.4 této Smlouvy se nevztahuje na informace, které:\n\nje Smluvní strana povinna zveřejnit na základě zákonem stanovené povinnosti;\n\nmohou být zveřejněny bez porušení Smlouvy;\n\nbyly písemným souhlasem obou Smluvních stran zproštěny těchto omezení;\n\njsou známé nebo byly zveřejněny jinak, než následkem zanedbání povinnosti jedné ze Smluvních stran;\n\npříjemce je zná dříve, než je sdělí Smluvní strana;\n\njsou vyžádány soudem, státním zastupitelstvím nebo příslušným správním orgánem na základě zákona;\n\nSmluvní strana je sdělí osobě vázané zákonnou povinností mlčenlivosti (např. advokátovi nebo daňovému poradci) za účelem uplatňování svých práv;\n\nje Objednatel povinen sdělit svému zakladateli.\n\nV případě, že se kterákoliv Smluvní strana hodnověrným způsobem dozví, popřípadě bude mít důvodné podezření, že došlo k prozrazení či zpřístupnění Důvěrné informace neoprávněné osobě, je povinna neprodleně tuto skutečnost druhé Smluvní straně oznámit.\n\nPokud Smluvní strana, která poruší svůj závazek vyplývající z tohoto článku Smlouvy, takto způsobí druhé Smluvní straně škodu nebo ona či jiná třetí osoba získá na základě takové skutečnosti majetkový prospěch, má druhá Smluvní strana vůči porušující Smluvní straně nárok na náhradu veškeré jí vzniklé škody a na zaplacení částky odpovídající majetkovému prospěchu získanému v souvislosti s touto skutečností porušující Smluvní stranou či jinou třetí osobou. 
Nárok na náhradu případné škody není sjednáním ani zaplacením kterékoliv smluvní pokuty dle Smlouvy a/nebo Dílčí smlouvy dotčen.\n\nPovinnost ochrany Důvěrných informací trvá bez ohledu na ukončení platnosti a účinnosti Smlouvy a/nebo Dílčích smluv.\n\nPokud řádné poskytování Služeb vyžaduje zpracování osobních údajů zaměstnanců Objednatele, budou tyto osobní údaje zaměstnanců Objednatele v postavení kontaktních osob zpracovávány Poskytovatelem v rozsahu:\n\njméno, příjmení a titul,\n\ne-mailová adresa,\n\ntelefonní číslo.\n\nZpracováním osobních údajů ve smyslu tohoto odstavce Smlouvy se rozumí zejména jejich shromažďování, ukládání na nosiče informací, používání, třídění nebo kombinování, blokování a likvidace s využitím manuálních a automatizovaných prostředků v rozsahu nezbytném pro zajištění řádného poskytování Služeb.\n\nOsobní údaje budou zpracovány po dobu poskytování Služeb. Ukončením této Smlouvy / Dílčí smlouvy nezanikají povinnosti Poskytovatele týkající se bezpečnosti a ochrany osobních údajů až do okamžiku jejich úplné likvidace či předání jinému zpracovateli.\n\nSmluvní strany se dohodly, že Poskytovatel nemá nárok na náhradu nákladů spojených se zpracováním osobních údajů či s plněním povinností vyplývajících z příslušné právní úpravy.\n\nObjednatel je povinen přijmout vhodná opatření na to, aby poskytl subjektům údajů stručným, transparentním, srozumitelným a snadno přístupným způsobem za použití jasných a jednoduchých jazykových prostředků veškeré informace a učinil veškerá sdělení požadovaná Nařízením Evropského parlamentu a Rady (EU) č. 
2016/679 ze dne [DATE_3], obecného nařízení o ochraně osobních údajů (dále jen „Nařízení“) ve spojení se zákonem o zpracování osobních údajů.\n\n\n\nPoskytovatel je při plnění této povinnosti povinen:\n\nzpracovávat osobní údaje pouze na základě doložených pokynů Objednatele;\n\nzohledňovat povahu zpracování osobních údajů a být Objednateli nápomocen pro splnění Objednatelovy povinnosti reagovat na žádosti o výkon práv subjektu údajů, jakož i pro splnění dalších povinností ve smyslu Nařízení;\n\nzajistit, že jeho zaměstnanci budou zpracovávat osobní údaje pouze za podmínek a v rozsahu Poskytovatelem stanoveném;\n\nPoskytovatel je při plnění této povinnosti oprávněn v rozsahu nezbytném pro plnění předmětu Smlouvy / Dílčí smlouvy zapojit do zpracování i další případné zpracovatele, k čemuž mu Objednatel tímto uděluje povolení. \n\nSmluvní strany jsou při zpracování povinny:\n\nzavést technická, organizační, personální a jiná vhodná opatření ve smyslu Nařízení, aby zajistily a byly schopny kdykoliv doložit, že zpracování osobních údajů je prováděno v souladu s Nařízením a zákonem o zpracování osobních údajů tak, aby nemohlo dojít k neoprávněnému nebo nahodilému přístupu k osobním údajům a k datovým nosičům, které tyto údaje obsahují, k jejich změně, zničení či ztrátě, neoprávněným přenosům, k jejich jinému neoprávněnému zpracování, jakož i k jinému zneužití, a tato opatření podle potřeby průběžné revidovat a aktualizovat;\n\nvést a průběžné revidovat a aktualizovat záznamy o zpracování osobních údajů ve smyslu Nařízení;\n\nřádně a včas ohlašovat případná porušení zabezpečení osobních údajů Úřadu pro ochranu osobních údajů a spolupracovat s tímto úřadem v nezbytném rozsahu;\n\nnavzájem se informovat o všech okolnostech významných pro plnění dle tohoto článku Smlouvy;\n\nzachovávat mlčenlivost o osobních údajích a o bezpečnostních opatřeních, jejichž zveřejnění by ohrozilo zabezpečení osobních údajů, a to i po skončení této Smlouvy / Dílčí smlouvy;\n\npostupovat v 
souladu s dalšími požadavky Nařízení a zákona o zpracování osobních údajů, zejména dodržovat obecné zásady zpracování osobních údajů, plnit své informační povinnosti, nepředávat osobní údaje třetím osobám bez potřebného oprávnění, respektovat práva subjektů údajů a poskytovat v této souvislosti nezbytnou součinnost.\n\nOchrana autorských práv\n\nPodpisem Smlouvy Poskytovatel poskytuje Objednateli na dobu trvání majetkových práv autorských nevypověditelnou, převoditelnou, výhradní a územně neomezenou licenci k vytváření kopií, užívání a zpřístupnění dalším osobám všech výstupů a dále jakýchkoliv dokumentů, stanovisek, listin či návrhů vztahujících se k předmětu plnění Smlouvy / Dílčí smlouvy nebo vytvořených v souvislosti s ní Poskytovatelem či jeho poddodavateli, jež podle obecně závazných právních předpisů představují autorská díla nebo práva pořizovatele k jím pořízené databázi, včetně práva upravovat a )měnit takováto autorská díla nebo databáze.\n\nObjednatel není ve svých právech k užití výstupů a jakýchkoliv dokumentů, stanovisek, listin či návrhů vztahujících se k předmětu plnění Smlouvy / Dílčí smlouvy nebo vytvořených v souvislosti s ní Poskytovatelem či jeho poddodavateli, nijak omezen. Objednatel je oprávněn bez souhlasu Poskytovatele výstupy a jakékoliv dokumenty, stanoviska, listiny či návrhy vztahující se k předmětu plnění Smlouvy / Dílčí smlouvy nebo vytvořené v souvislosti s ní Poskytovatelem či jeho poddodavateli, nebo jejich části upravovat či doplňovat.\n\nPoskytovatel není oprávněn výstupy a jakékoliv dokumenty, stanoviska, listiny či návrhy vztahující se k plnění předmětu Smlouvy, resp. Dílčí smlouvy, nebo jejich části, jakkoliv rozšiřovat bez předchozího písemného souhlasu Objednatele. Přenechání výstupů a jakýchkoliv dokumentů, stanovisek, listin či návrhů vztahujících se k plnění předmětu Smlouvy, resp. 
Dílčí smlouvy, nebo jejich částí Poskytovatelem třetí osobě bez předchozího písemného souhlasu Objednatele se považuje za podstatné porušení Smlouvy.\n\nPoskytovatel odpovídá za to, že plnění předmětu Smlouvy / Dílčí smlouvy, nezasahuje a nebude zasahovat do práv jiných osob, zejména práv z průmyslového nebo jiného duševního vlastnictví, a to pro jakékoliv využití plnění v [COUNTRY_2] i v zahraničí.\n\nSmluvní strany tímto sjednávají, že veškerá finanční vyrovnání za poskytnutí licence dle tohoto článku 8 Smlouvy jsou zahrnuta v Ceně dle článku 2 Smlouvy.\n\nPoddodavatelé\n\nPoskytuje-li Poskytovatel Objednateli část plnění předmětu Smlouvy, resp. Dílčí smlouvy, prostřednictvím poddodavatele písemně schváleným ze strany Objednatele, je za veškerá taková plnění poddodavatele odpovědný Poskytovatel sám, jako kdyby tato plnění byla poskytována Poskytovatelem. \n\nPoskytovatel je povinen zajistit, aby všichni poddodavatelé měli platná příslušná oprávnění, odbornou kvalifikaci a dostatek odborných zkušeností, jež jsou nezbytné pro poskytování příslušných částí Služeb dle jejich smluv s Poskytovatelem. Žádná poddodavatelská smlouva nezakládá smluvní vztahy mezi Objednatelem a poddodavatelem. \n\nPoskytovatel je dále povinen smluvně zajistit, že i jeho poddodavatelé, kteří se budou podílet na plnění dle Smlouvy, resp. Dílčích smluv, se zaváží dodržovat v plném rozsahu ujednání mezi Poskytovatelem a Objednatelem a nebudou v rozporu s požadavky Objednatele uvedenými ve Smlouvě a/nebo Dílčích smlouvách.\n\nPokud Objednatel shledá, že plnění předmětu Smlouvy, resp. Dílčí smlouvy, uskutečněné poddodavatelem nedosahuje potřebných kvalit, je vadné nebo nevykazuje jiné náležitosti požadované Objednatelem, nebo že sám poddodavatel není subjektem kompetentním pro provádění plnění z této Smlouvy, resp. Dílčí smlouvy, je oprávněn požadovat, aby Poskytovatel neprodleně svěřil takto identifikovanou část plnění jinému poddodavateli, nebo se ujal této části plnění sám. 
\n\nZajištění plnění, která Poskytovatel svěří poddodavateli, není poddodavatel oprávněn zadat třetím osobám. Poskytovatel je povinen na tuto skutečnost poddodavatele upozornit před uzavřením poddodavatelské smlouvy a odpovídá za její dodržování. \n\nUzavření jakékoliv poddodavatelské smlouvy nebo uskutečnění jakéhokoliv smluvního plnění poddodavatelem bez předchozího písemného souhlasu Objednatele, případně jakákoliv změna v osobě poddodavatele bez předchozího písemného souhlasu Objednatele, budou považovány za podstatné porušení Smlouvy.\n\nCompliance ujednání\n\nSmluvní strany se zavazují dodržovat právní předpisy a chovat se tak, aby jejich jednání nemohlo vzbudit důvodné podezření ze spáchání nebo páchání trestného činu, a to ani takového, který by mohl být přičitatelný Objednateli podle zákona č. 418/2011 Sb., o trestní odpovědnosti právnických osob a řízení proti nim, ve znění pozdějších předpisů.\n\nSmluvní strany se zavazují, že učiní všechna opatření k tomu, aby se nedopustily ony a ani nikdo z jejich zaměstnanců či zástupců jakékoliv formy korupčního jednání, zejména jednání, které by mohlo být vnímáno jako přijetí úplatku, podplácení nebo nepřímé úplatkářství či jiný trestný čin spojený s korupcí dle zákona č. 40/2009 Sb., trestní zákoník, ve znění pozdějších předpisů.\n\nSmluvní strany se zavazují, že:\n\nneposkytnou, nenabídnou ani neslíbí úplatek jinému nebo pro jiného v souvislosti s obstaráváním věcí obecného zájmu anebo v souvislosti s podnikáním svým nebo jiného; \n\núplatek nepřijmou, ani si jej nedají slíbit, ať už pro sebe nebo pro jiného v souvislosti s obstaráním věcí obecného zájmu nebo v souvislosti s podnikáním svým nebo jiného. 
\n\nÚplatkem se přitom rozumí neoprávněná výhoda spočívající v přímém majetkovém obohacení nebo jiném zvýhodnění, které se dostává nebo má dostat uplácené osobě nebo s jejím souhlasem jiné osobě, a na kterou není nárok.\n\nSmluvní strany nebudou ani u svých obchodních partnerů tolerovat jakoukoliv formu korupce či uplácení.\n\nV případě, že je zahájeno trestní stíhání Poskytovatele, zavazuje se Poskytovatel o tomto bez zbytečného odkladu Objednatele písemně informovat.\n\nSankce\n\nV případě nedodržení termínu poskytnutí Plnění a/nebo předání výstupu a/nebo odstranění vad poskytnutého Plnění ve sjednané kvalitě podle této Smlouvy a Dílčí smlouvy ze strany Poskytovatele je Poskytovatel povinen uhradit Objednateli smluvní pokutu ve výši [MONETARY_AMOUNT_2]. \n\nZa každé jednotlivé porušení povinnosti mlčenlivosti podle příslušných ustanovení článku 7 Smlouvy, je Objednatel oprávněn požadovat od Poskytovatele zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_3]. \n\nZa každé jednotlivé porušení povinnosti Poskytovatele při zpracování osobních údajů podle příslušných ustanovení článku 7 Smlouvy, je Objednatel oprávněn požadovat od Poskytovatele zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_3]. \n\nZa každé jednotlivé porušení povinností Poskytovatele stanovených v čl. 5.7 až 5.9 této Smlouvy je Objednatel oprávněn požadovat po Poskytovateli zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_4] za každé jednotlivé porušení povinnosti.\n\nJestliže se jakékoli prohlášení Poskytovatele podle článku 8 Smlouvy ukáže nepravdivým nebo zavádějícím nebo Poskytovatel poruší jiné povinnosti podle článku 8 této Smlouvy, je Objednatel oprávněn požadovat od Poskytovatele zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_4] za každé jednotlivé porušení povinnosti.\n\nV případě prodlení Objednatele s úhradou řádně vystavených a doručených faktur, je Objednatel povinen uhradit Poskytovateli úrok z prodlení dle nařízení vlády č. 
351/2013 Sb., kterým se určuje výše úroků z prodlení a nákladů spojených s uplatněním pohledávky, určuje odměna likvidátora, likvidačního správce a člena orgánu právnické osoby jmenovaného soudem a upravují některé otázky Obchodního věstníku a veřejných rejstříků právnických a fyzických osob, evidence svěřenských fondů a evidence údajů o skutečných majitelích.\n\nVyúčtování smluvní pokuty / úroků z prodlení – penalizační faktura, musí být druhé Smluvní straně zasláno datovou zprávou prostřednictvím datové schránky. Smluvní pokuta a úroky z prodlení jsou splatné ve lhůtě třiceti (30) kalendářních dnů ode dne doručení penalizační faktury povinné Smluvní straně. Úhrada smluvní pokuty / úroků z prodlení se provádí bankovním převodem na účet oprávněné Smluvní strany uvedený v penalizační faktuře. Částka se považuje za zaplacenou okamžikem jejího připsání ve prospěch účtu oprávněné Smluvní strany.\n\nZaplacením smluvní pokuty není dotčen nárok Objednatele na náhradu újmy v celém rozsahu způsobené újmy, ani povinnost Poskytovatele řádně dokončit plnění předmětu Smlouvy, popř. odstranit vady.\n\nObjednatel je v případě uplatnění smluvní pokuty vůči Poskytovateli dle této Smlouvy a v případě neuhrazení smluvní pokuty ze strany Poskytovatele oprávněn využít institut započtení vzájemných pohledávek.\n\nDoba trvání Smlouvy\n\nTato Smlouva nabývá platnosti dnem jejího podpisu oběma Smluvními stranami a účinnosti dnem jejího uveřejnění v registru smluv v souladu se zákonem č. 340/2015 Sb., o registru smluv, ve znění pozdějších předpisů. Zveřejnění Smlouvy v registru smluv zajistí Objednatel. \n\nTato Smlouva se uzavírá na dobu určitou, a to na dobu 48 měsíců od nabytí účinnosti. \n\nSmlouva / Dílčí smlouva může být ukončena dohodou Smluvních stran v písemné formě, přičemž účinky zrušení Smlouvy / Dílčí smlouvy nastanou k okamžiku stanoveném v této dohodě. 
Nebude-li takovýto okamžik dohodou stanoven, pak tyto účinky nastanou ke dni uzavření takovéto dohody.\n\nSmluvní strany jsou oprávněny od Smlouvy a/nebo Dílčí smlouvy odstoupit v případě jejího podstatného porušení druhou Smluvní stranou, za podmínek uvedených v § 2001 a násl. Občanského zákoníku.\n\nZa podstatné porušení této Smlouvy a/nebo Dílčí Smlouvy Poskytovatelem, které zakládá právo Objednatele na odstoupení od této Smlouvy a/nebo Dílčí smlouvy, se považuje zejména:\n\nnedodržení právních předpisů Poskytovatelem při poskytování Služeb / Plnění;\n\nprodlení Poskytovatele s poskytováním Služeb / Plnění z důvodů spočívajících výlučně na straně Poskytovatele a/nebo předáním výstupu po dobu delší než deset (10) kalendářních dnů;\n\nporušení jakékoli povinnosti Poskytovatele dle článku 7 této Smlouvy;\n\nporušení jakékoli povinnosti Poskytovatele dle článku 8 této Smlouvy;\n\nporušení jakékoliv povinnosti Poskytovatele stanovené v čl. 5.7 až 5.9 této Smlouvy;\n\nuzavření jakékoliv poddodavatelské smlouvy nebo uskutečnění jakéhokoliv smluvního plnění poddodavatelem bez předchozího písemného souhlasu Objednatele, případně jakákoliv změna v osobě poddodavatele bez předchozího písemného souhlasu Objednatele, ve smyslu čl. 9 odst. 9.5 Smlouvy; \n\npostup Poskytovatele při poskytování Služeb / Plnění v rozporu s oprávněnými pokyny Objednatele.\n\nObjednatel je dále oprávněn od této Smlouvy a/nebo Dílčí Smlouvy odstoupit v případě, že \n\nPoskytovatel přestane splňovat požadavky na kvalifikaci uvedené ve výzvě k podání nabídky; \n\nvůči majetku Poskytovatele probíhá insolvenční řízení, v němž bylo vydáno rozhodnutí o úpadku, pokud to právní předpisy umožňují;\n\ninsolvenční návrh na Poskytovatele byl zamítnut proto, že majetek Poskytovatele nepostačuje k úhradě nákladů insolvenčního řízení;\n\nbyl Poskytovatel pravomocně odsouzen pro trestný čin. 
\n\nPoskytovatel je oprávněn odstoupit od této Smlouvy a/nebo Dílčí Smlouvy, pokud Objednatel bude přes písemné upozornění Poskytovatele déle než třicet (30) kalendářních dnů od písemného upozornění Poskytovatele v prodlení s plněním své platební povinnosti vůči Poskytovateli. \n\nÚčinky odstoupení nastávají uplynutím lhůty deseti (10) kalendářních dnů, která počíná běžet prvním dnem následujícím po doručení projevu vůle odstoupit od Smlouvy a/nebo Dílčí Smlouvy druhé Smluvní straně. Poskytovatel je v případě odstoupení od Smlouvy / Dílčí smlouvy povinen učinit již jen takové úkony, bez nichž by mohly být zájmy Objednatele vážně ohroženy.\n\nOdstoupení od Smlouvy a/nebo Dílčí Smlouvy se nedotýká zejména práva na náhradu újmy, smluvní pokuty a povinnosti mlčenlivosti, ani ujednání, které mají vzhledem ke své povaze zavazovat Smluvní strany i po ukončení Smlouvy / Dílčí smlouvy.\n\nSmluvní strany jsou oprávněny Smlouvu a/nebo Dílčí Smlouvu vypovědět, i bez uvedení důvodu, na základě písemné výpovědi. Výpovědní doba činí tři (3) měsíce a počíná běžet dnem doručení výpovědi druhé Smluvní straně.\n\nV případě jakéhokoliv skončení tohoto smluvního vztahu podle Smlouvy a/nebo Dílčí Smlouvy, je Poskytovatel vždy povinen neprodleně předat Objednateli veškeré věci a dokumenty, vztahující se k plnění této Smlouvy a/nebo Dílčí Smlouvy nebo poskytnuté za účelem plnění předmětu Smlouvy a/nebo Dílčí Smlouvy, nejpozději však do pěti (5) pracovních dnů ode dne ukončení smluvního vztahu. \n\nZávěrečná ustanovení\n\nSmluvní strany potvrzují, že si při uzavírání Smlouvy vzájemně sdělily všechny skutkové a právní okolnosti, o nichž ví nebo vědět musí, tak, aby se každá ze Smluvních stran mohla přesvědčit o možnosti uzavřít platnou Smlouvu a aby byl každé ze Smluvních stran zřejmý zájem druhé Smluvní strany Smlouvu uzavřít. \n\nSmluvní strany výslovně potvrzují, že si vzájemně sdělily veškeré okolnosti důležité pro uzavření Smlouvy. 
Smluvní strany prohlašují, že se dohodly o veškerých náležitostech Smlouvy.\n\nPoskytovatel prohlašuje a potvrzuje, že na sebe přebírá nebezpečí změny okolností ve smyslu ustanovení § 1765 odst. 2 Občanského zákoníku.\n\nSmluvní strany si ve smyslu ustanovení § 1794 odst. 2 Občanského zákoníku ujednaly, že se Poskytovatel výslovně vzdává jeho práva ve smyslu ustanovení § 1793 Občanského zákoníku a souhlasí s cenou tak, jak byla Smluvními stranami sjednána výše v této Smlouvě.\n\nSmluvní strany se zavazují vyvinout maximální úsilí k odstranění vzájemných sporů, vzniklých na základě této Smlouvy nebo v souvislosti s touto Smlouvou, a k jejich vyřešení zejména prostřednictvím jednání odpovědných pracovníků nebo jiných pověřených subjektů. Nedohodnou-li se na způsobu řešení vzájemného sporu, dohodly se Smluvní strany, že místně příslušným soudem pro řešení případných sporů bude soud příslušný dle místa sídla Objednatele.\n\nTato Smlouva může být měněna pouze vzestupně očíslovanými písemnými dodatky ke Smlouvě podepsanými oběma Smluvními stranami. \n\nDnem doručení písemností odeslaných na základě této Smlouvy nebo v souvislosti s touto Smlouvou prostřednictvím provozovatele poštovních služeb, pokud není prokázán jiný den doručení, se rozumí poslední den lhůty, ve které byla písemnost pro adresáta uložena u provozovatele poštovních služeb, a to i tehdy, jestliže se adresát o jejím uložení nedověděl. Smluvní strany tímto výslovně vylučují ust. § 573 Občanského zákoníku.\n\nPokud kterékoli ustanovení této Smlouvy nebo jeho část je nebo se stane neplatným či nevynutitelným, nebude mít tato neplatnost či nevynutitelnost vliv na platnost či vynutitelnost ostatních ustanovení této Smlouvy nebo jejích částí, pokud nevyplývá přímo z obsahu této Smlouvy, že toto ustanovení nebo jeho část nelze oddělit od dalšího obsahu. 
V takovém případě se obě Smluvní strany zavazují neúčinné a neplatné ustanovení nahradit novým ustanovením, které je svým účelem a významem co nejbližší ustanovení této Smlouvy, jež má být nahrazeno. \n\nPro případ, že tato Smlouva není uzavírána za přítomnosti všech Smluvních stran, platí, že Smlouva nebude uzavřena, pokud ji Poskytovatel podepíše s jakoukoliv změnou či odchylkou, byť nepodstatnou, nebo dodatkem, ledaže Objednatel takovou změnu či odchylku nebo dodatek následně schválí. To platí i v případě připojení obchodních podmínek Poskytovatele, které budou odporovat svým obsahem jakýmkoliv způsobem textu této Smlouvy. \n\nTato Smlouva je vyhotovena ve čtyřech (4) rovnocenných vyhotoveních, z nichž každé má platnost originálu. Každá ze Smluvních stran obdrží po dvou (2) stejnopisech. V případě, že bude Smlouva uzavírána elektronicky obdrží Poskytovatel/Objednatel elektronický dokument podepsaný v souladu s platnou právní úpravou.\n\nKontaktní údaje Smluvních stran pro doručování jsou následující:\n\nKontaktní osoba [PERSON_2]:\n\nxxx\n\ne-mail: xxx tel. xxx\n\nKontaktní osoba [PERSON_3]:\n\n xxx\n\n e-mail: xxx tel. xxx\n\n\n\n Nedílnou součástí této Smlouvy jsou následující přílohy:\n\nPříloha č. 1 – Cena\n\nPříloha č. 2 – Akceptační protokol\n\n\n\n\n\n\n\n\n\nSmluvní strany prohlašují, že tato Smlouva je projevem jejich pravé a svobodné vůle a nebyla sjednána v tísni ani za jinak jednostranně nevýhodných podmínek. Na důkaz toho připojují Smluvní strany své podpisy.\n\n\n\nV Praze dne: Dle elektronického podpisu\tV Praze dne: Dle elektronického podpisu\n\n\n\n\n\n\n\n\n\n\n\n__________________________________\n\n__________________________________\n\nxxx\n\n[PERSON_1]\n\nxxx\n\n[ORGANIZATION_1]\n\n\n\n\n\n\n\nV Praze dne: Dle elektronického podpisu\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n__________________________________\n\nxxx\n\nxxx\n\n[ORGANIZATION_1]\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPříloha č. 
1 – Cena\n\n\n\nHodinová sazba v Kč bez DPH za poskytování Služeb:\n\n\n\nHodinová sazba v Kč bez DPH\n\n2 190,- \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPříloha č. 2 – Akceptační protokol\n\n\n\nAkceptační protokol\n\n\n\n\n\nPlnění / výstup:\t……………………………………………………………………………………….\n\n\n\nPředáno dne: \n\n\n\n…………………………………………………………………………………………………………..\n\n\n\n\n\nPřevzal: \t\t\t\t\t\tPředal:\n\n\n\n…………………………………………… \t\t……………………………………………….\n\nObjednatel:\t\t\t\t\t\t[ORGANIZATION_3]:\n\nNárodní agentura pro komunikační a  \n\ninformační technologie, s. p. \n\nJméno, příjmení \t\t\t\t \n\nFunkce\t\t\t \n\n\t\n\n\n\nAkceptováno dne: datum akceptace\n\n\n\nVýsledek akceptace: \tAKCEPTOVÁNO / NEAKCEPTOVÁNO\n\n\n\nAkceptoval\n\nFunkce\n\nPodpis\n\n\n\n\n\n\n\n\n\n\n\n\n\nPřipomínky, výhrady:\n\n\n\npřipomínky a výhrady k předanému Plnění" + "redactedText": "RÁMCOVÁ DOHODA NA POSKYTOVÁNÍ PRÁVNÍCH SLUŽEB\n\nČíslo 2026/051 NAKIT\n\n\n\nSmluvní strany\n\n\n\n[ORGANIZATION_1]\n\nse sídlem \t[ADDRESS_1]\n\nIČO: \t[REGISTRATION_NUMBER_1] \n\nDIČ: \t [TAX_IDENTIFICATION_NUMBER_1]\n\nzastoupen: \txxx\n\nzapsán v obchodním rejstříku vedeném [ORGANIZATION_2], [REGISTRATION_NUMBER_2]\n\nbankovní spojení \txxx\n\n\tč. ú. xxx\n\n(dále jen „Objednatel“)\n\n\n\na\n\n[PERSON_1]\n\nse sídlem [ADDRESS_2]\n\nIČO: [REGISTRATION_NUMBER_3]\n\nDIČ: [TAX_IDENTIFICATION_NUMBER_2] \n\nbankovní spojení xxx\n\n č. ú. xxx\n\n \n\n (dále jen „Poskytovatel“)\n\n\n\n(Objednatel a Poskytovatel budou v této rámcové dohodě na poskytování právních služeb označováni jednotlivě jako „Smluvní strana“ a společně jako „Smluvní strany“ a tato rámcová dohoda jako „Smlouva“),\n\n\n\nuzavírají v souladu s ustanovením § 1746 odst. 2 zákona č. 89/2012 Sb., občanský zákoník, v platném znění (dále jen „Občanský zákoník“) a v souladu s ustanovením § 29 písm. k) bod 1. a 2. zákona č. 
134/2016 Sb., o zadávání veřejných zakázek, ve znění pozdějších předpisů (dále jen „Zákon o zadávání veřejných zakázek“), jakož i v souladu se zákonem č. 85/1996 Sb., o advokacii, ve znění pozdějších předpisů (dále jen „Zákon o advokacii“) tuto Smlouvu. \n\n\n\n\n\n\n\n\n\nÚčel a předmět Smlouvy\n\nÚčelem této Smlouvy je stanovení podmínek a právního rámce pro uzavírání Dílčích smluv (jak je tento pojem definován níže v odst. 1.8 Smlouvy) mezi Objednatelem a Poskytovatelem na poskytování právních služeb, a to na základě písemných Objednávek Objednatele.\n\nPředmětem této Smlouvy je stanovení práv a povinností Smluvních stran pro postup při uzavírání Dílčích smluv a následném poskytování právních služeb Poskytovatelem Objednateli, přičemž poskytováním právních služeb se pro účely této Smlouvy rozumí poskytování právních služeb ve smyslu § 29 odst. 1 písm. k) bod 1. a 2. Zákona o zadávání veřejných zakázek (dále jen „Služby“).\n\nPoskytovatel se zavazuje poskytnout Objednateli Služby na základě Dílčí smlouvy. Služby poskytované Poskytovatelem Objednateli na základě konkrétní Dílčí smlouvy budou dále nazývány jako „Plnění“. Dílčí smlouvy budou uzavírány níže uvedeným postupem, na základě písemné Objednávky Objednatele doručené Poskytovateli (dále jen „Objednávka“). Objednávka musí obsahovat minimálně tyto náležitosti:\n\nidentifikační údaje Poskytovatele a Objednatele;\n\nčíslo a datum vystavení Objednávky;\n\nčíslo Smlouvy;\n\nrámcové vymezení Plnění;\n\nmaximální rozsah a maximální cenu Plnění; a\n\npodpis oprávněné osoby Objednatele.\n\nObjednatel je oprávněn, avšak nikoli povinen, vystavovat dle svého uvážení Objednávky ode dne nabytí účinnosti této Smlouvy. Každá takto vystavená Objednávka se považuje za návrh na uzavření Dílčí smlouvy za podmínek stanovených touto Smlouvou. 
Poskytovatel je povinen písemně potvrdit Objednávku ve lhůtě dvou (2) pracovních dnů od jejího doručení Poskytovateli.\n\nPotvrzení Objednávky musí obsahovat minimálně tyto náležitosti: \n\nidentifikační údaje Objednatele a Poskytovatele; \n\nčíslo Objednávky, která je potvrzována; a \n\npodpis oprávněné osoby Poskytovatele.\n\nV případě, že Objednávka nebude splňovat uvedené minimální náležitosti, má Poskytovatel povinnost na tuto skutečnost neprodleně upozornit Objednatele. Objednatel je poté povinen vystavit novou Objednávku a Poskytovatel je povinen ji ve lhůtě dvou (2) pracovních dnů od jejího doručení písemně potvrdit. Není-li v článku 4 Smlouvy stanoveno jinak, běží lhůta pro poskytnutí Plnění dle příslušné Dílčí smlouvy od okamžiku doručení této nové Objednávky. \n\nPotvrzení Objednávky, které obsahuje dodatky, výhrady, omezení nebo jiné změny se považuje za odmítnutí Objednávky a tvoří nový návrh Poskytovatele na uzavření Dílčí smlouvy, a to i v případě takového dodatku, výhrady, omezení nebo jiné změny, které podstatně nemění podmínky Objednávky. Dílčí smlouva je v takovém případě uzavřena pouze tehdy, pokud tento nový návrh Objednatel písemně potvrdí a doručí zpět Poskytovateli. \n\nDoručením potvrzení Objednávky Objednateli dojde k uzavření smlouvy o poskytnutí služeb, přičemž práva a povinnosti Smluvních stran dle této smlouvy o poskytnutí služeb odpovídají v celém rozsahu právům a povinnostem Objednatele a Poskytovatele stanovených touto Smlouvou (dále jen „Dílčí smlouva“).\n\nPočet Objednávek vystavených Objednatelem není omezený. Současně platí, že Objednatel není povinen Objednávku vystavit.\n\nPoskytovatel se zavazuje poskytnout Objednateli Plnění za podmínek uvedených v této Smlouvě a v Dílčí smlouvě ve sjednaném rozsahu, jakosti a čase. 
\n\nObjednatel se zavazuje zaplatit za Plnění poskytnuté v souladu s touto Smlouvou a Dílčí smlouvou Cenu dle článku 2 této Smlouvy.\n\nObjednatel při uzavírání této Smlouvy negarantuje žádný minimální objem plnění, který bude zadán v průběhu její platnosti. Objednatel uzpůsobuje rozsah poptávaného plnění svým aktuálním potřebám, které jsou v čase proměnlivé. Poskytovatel se přes výše uvedené zavazuje být připraven poskytnout plnění v rozsahu poptávaném Objednatelem dle podmínek této Smlouvy. \n\nSmluvní strany sjednávají, že k poskytnutí konkrétního Plnění (resp. jeho relevantní části) na základě Dílčí smlouvy je Poskytovatel povinen na základě, v rozsahu a v souladu s požadavky a/nebo pokyny Objednatele, které budou činěny prostřednictvím e-mailové komunikace kontaktní osobou Objednatele uvedenou v čl. 13 odst. 13.11 písm. a) Smlouvy nebo jí pověřenou osobou. V e-mailu podle přechozí věty Objednatel uvede specifikaci konkrétního požadavku (včetně případného požadavku na výstup) a/nebo pokynu. Hovoří-li se v této Smlouvě o Plnění, rozumí se jím i jeho relevantní část, poskytnutá Objednateli na základě konkrétního požadavku a/nebo pokynu dle tohoto odstavce Smlouvy.\n\nKaždá Dílčí smlouva nabývá platnosti dnem uzavření. Dílčí smlouva nabývá účinnosti dnem uzavření, nevztahuje-li se na ni povinnost zveřejnění v registru smluv podle zákona č. 340/2015 Sb., o zvláštních podmínkách účinnosti některých smluv, uveřejňování těchto smluv a o registru smluv (zákon o registru smluv) ve znění pozdějších předpisů (dále jen „Zákon o registru smluv“). Vztahuje-li se na příslušnou Dílčí smlouvu povinnost jejího zveřejnění v registru smluv, nabývá Dílčí smlouva účinnosti dnem zveřejnění v registru smluv, přičemž zveřejnění Dílčí smlouvy v registru smluv zajistí Objednatel. 
V Dílčí smlouvě může být výslovně uvedeno pozdější datum nabytí účinnosti než dnem jejího uzavření/zveřejnění v registru smluv (dle relevance).\n\nCena\n\nCena za poskytnutí Plnění Poskytovatelem odpovídá součinu skutečného časového rozsahu poskytnutého Plnění a hodinové sazby dle Přílohy č. 1 této Smlouvy na základě konkrétní Dílčí smlouvy (dále jen „Cena“). Nejnižší časová jednotka odpracovaného času, za kterou náleží Poskytovateli odměna za poskytnuté Plnění, je jedna (1) hodina.\n\nSkutečný časový rozsah Plnění je limitován odhadovaným maximálním časovým rozsahem Plnění uvedeným v Dílčí smlouvě. Skutečný časový rozsah Plnění bude Poskytovatelem Objednateli dokladován v rámci akceptační procedury dle článku 6 Smlouvy, jejíž průběh bude stvrzen Smluvními stranami podpisem Akceptačního protokolu, jehož vzor tvoří Přílohu č. 2 této Smlouvy a je její nedílnou součástí. \n\nObjednatel si vyhrazuje právo uznat v rámci fakturace pouze takový časový rozsah Plnění, který byl na poskytování Plnění účelně vynaložen. \n\nCena každé jednotlivé složky Plnění zahrnuje veškeré náklady Poskytovatele spojené s plněním Smlouvy, Dílčí smlouvy a poskytnutím Plnění Objednateli, vyjma pravomocně přiznané odměny za zastupování v soudním řízení, která připadá Poskytovateli. Tato Cena je cenou konečnou.\n\nCelková cena Plnění poskytnutého na základě této Smlouvy a Dílčích smluv nesmí převýšit částku [MONETARY_AMOUNT_1] bez DPH. DPH bude připočítána k ceně v souladu s platnými právními předpisy ke dni uskutečnění zdanitelného plnění.\n\nPlatební podmínky\n\nDaňové doklady za poskytování Plnění budou Poskytovatelem vystavovány vždy k poslednímu dni příslušného kalendářního měsíce, ve kterém bylo Plnění poskytováno, a bude v nich vyúčtováno Plnění poskytnuté Objednateli bez jakýchkoli vad v příslušném kalendářním měsíci. 
Za den uskutečnění zdanitelného plnění se považuje den podpisu Akceptačního protokolu Objednatelem.\n\nDaňový doklad (faktura) musí obsahovat náležitosti řádného daňového dokladu podle příslušných právních předpisů, zejména podle § 29 zákona č. 235/2004 Sb., o dani z přidané hodnoty, ve znění pozdějších předpisů (dále jen „Zákon o DPH“), dle zákona č. 563/1991 Sb., o účetnictví, ve znění pozdějších předpisů, dle § 435 Občanského zákoníku a níže uvedené údaje:\n\nčíslo Smlouvy a Dílčí smlouvy (Objednávky),\n\nplatební podmínky v souladu se Smlouvou a Dílčí smlouvou,\n\nmísto a datum předání a převzetí Plnění,\n\npopis fakturovaného Plnění, rozsah, jednotkovou a celkovou cenu,\n\npřílohou je kopie Akceptačního protokolu s výrokem „Akceptováno“, odsouhlaseného a potvrzeného Objednatelem.\n\nSplatnost daňového dokladu (faktury) vystaveného Poskytovatelem je třicet (30) kalendářních dní ode dne jeho doručení Objednateli. \n\nPoskytovatel zašle daňový doklad spolu s veškerými požadovanými dokumenty Objednateli nejpozději do pěti (5) kalendářních dnů od podpisu Akceptačního protokolu, jedním z následujících způsobů: \n\nbuď v elektronické podobě na adresu:\n\nxxx\n\nnebo doporučeným dopisem na následující adresu: \n\n[ORGANIZATION_1]\n\n[ADDRESS_1].\n\nV případě, že faktura nebude obsahovat stanovené náležitosti, přílohy nebo nebude vystavena v souladu s touto Smlouvou, je Objednatel oprávněn vrátit ji ve lhůtě splatnosti Poskytovateli k doplnění či opravě, aniž se tím dostane do prodlení. Lhůta splatnosti v délce třicet (30) kalendářních dní počíná běžet znovu ode dne doručení náležitě doplněné či opravené faktury Objednateli.\n\nPlatba bude provedena v české měně formou bankovního převodu na účet Poskytovatele uvedený v záhlaví této Smlouvy. 
Cena se považuje za uhrazenou dnem odepsání fakturované částky z účtu Objednatele ve prospěch účtu Poskytovatele.\n\nObjednatel neposkytuje Poskytovateli jakékoliv zálohy na cenu za Služby / Plnění.\n\nSmluvní strany se dohodly, že pokud bude v okamžiku uskutečnění zdanitelného plnění správcem daně zveřejněna způsobem umožňujícím dálkový přístup skutečnost, že poskytovatel zdanitelného plnění (Poskytovatel) je nespolehlivým plátcem ve smyslu ust. § 106a Zákona o DPH nebo že úplata za toto plnění má být poskytnuta zcela nebo zčásti bezhotovostním převodem na jiný účet než účet Poskytovatele, který je správcem daně zveřejněn způsobem umožňujícím dálkový přístup ve smyslu ust. § 96 Zákona o DPH, je příjemce zdanitelného plnění (Objednatel) oprávněn část ceny odpovídající dani z přidané hodnoty zaplatit přímo na bankovní účet správce daně ve smyslu ust. § 109a Zákona o DPH. Na bankovní účet Poskytovatele bude v tomto případě uhrazena část ceny odpovídající výši základu daně z přidané hodnoty. Úhrada ceny plnění (základu daně) provedená Objednatelem v souladu s ustanovením tohoto odstavce bude považována za řádnou úhradu ceny plnění poskytnutého dle Smlouvy.\n\nDoba, místo a podmínky plnění\n\nPoskytovatel je povinen poskytnout Objednateli Plnění a předat Objednateli výstup/y nejdéle do pěti (5) kalendářních dnů ode dne doručení požadavku a/nebo pokynu ve smyslu čl. 1 odst. 1.13 Smlouvy, nedohodnou-li se Smluvní strany písemně (např. e-mailem) na jiném termínu poskytnutí Plnění, nebo nevyplývá-li jiný čas poskytnutí Plnění z platných právních předpisů nebo z požadavku či výzvy příslušného orgánu. \n\nPoskytovatel se zavazuje poskytovat Služby dle této Smlouvy na celém území [COUNTRY_1]. Místem předání veškerých výstupů dle této Smlouvy je [ADDRESS_3], nebude-li v konkrétním případě dohodnuto jinak. 
\n\nDalší práva a povinnosti Smluvních stran\n\nPoskytovatel je povinen postupovat při poskytování Služeb / Plnění s odbornou péčí podle svých nejlepších odborných znalostí a schopností, v souladu s právním řádem [COUNTRY_1] a se Smlouvou, přičemž je při své činnosti povinen sledovat a chránit zájmy a dobré jméno Objednatele a postupovat v souladu s jeho aktuálními potřebami a pokyny. V případě nevhodných pokynů Objednatele je Poskytovatel povinen na nevhodnost těchto pokynů Objednatele písemně upozornit, v opačném případě nese Poskytovatel zejména odpovědnost za vady a za škodu, které v důsledku nevhodných pokynů Objednatele Poskytovateli a/nebo třetím osobám vznikly.\n\nPoskytovatel je dále povinen bezodkladně oznámit Objednateli všechny okolnosti, o kterých se při poskytování Služeb / Plnění dozví, a které by mohly mít vliv na změnu pokynů Objednatele nebo na poskytování Služeb / Plnění dle této Smlouvy a Dílčí smlouvy. \n\nPoskytovatel je povinen informovat Objednatele na jeho žádost o průběhu plnění předmětu Smlouvy, resp. Dílčí smlouvy, a akceptovat jeho doplňující pokyny a připomínky k poskytovanému Plnění. V případě, že Objednatel zjistí v průběhu plnění předmětu Smlouvy, resp. Dílčí smlouvy, nedostatky, Poskytovatel je povinen na písemnou výzvu Objednatele tyto nedostatky odstranit bez nároku na navýšení ceny poskytovaného Plnění bezodkladně, nejdéle však do pěti (5) pracovních dnů ode dne obdržení výzvy.\n\nObjednatel poskytne Poskytovateli k plnění požadovaného Plnění:\n\nveškerou jemu dostupnou dokumentaci;\n\nb)\tpravdivé a včasné informace potřebné k řádnému poskytování Plnění;\n\nc)\tveškerou součinnost nezbytnou pro řádné poskytování Plnění.\n\nPoskytovatel je povinen řádně pečovat o věci a dokumenty, které od Objednatele k poskytnutí požadovaného Plnění obdrží. 
\n\nSmluvní strany se zavazují vzájemně se písemně informovat o případných změnách sídla, právní formy, změně bankovního spojení, zrušení registrace k DPH, a dalších významných skutečností rozhodných pro plnění ze Smlouvy, resp. Dílčí smlouvy, a to bezodkladně po uskutečnění takovéto změny. \n\nPoskytovatel je povinen neprodleně informovat Objednatele o kybernetických bezpečnostních incidentech (dále jen „KBI“) na straně Poskytovatele souvisejících s plněním dle Smlouvy a/nebo Dílčích smluv, které by mohly mít dopad na kybernetickou bezpečnost u Objednatele. KBI je definován v § 2 odst. 2 písm. f) zákona č. 264/2025 Sb., o kybernetické bezpečnosti. \n\nPoskytovatel poskytne Objednateli součinnost při zvládání KBI v souvislosti s poskytovaným plněním dle Smlouvy a/nebo Dílčích smluv, a bude se v této souvislosti řídit pokyny Objednatele.\n\nPoskytovatel prohlašuje, že si je vědom předpisů týkajících se mezinárodních sankcí, zejm. pak čl. 5 k nařízení Rady EU č. 833/2014 o omezujících opatřeních vzhledem k činnostem Ruska destabilizujícím situaci na Ukrajině, ve znění pozdějších předpisů a nařízení Rady EU č. 269/2014 o omezujících opatřeních vzhledem k činnostem narušujícím nebo ohrožujícím územní celistvost, svrchovanost a nezávislost Ukrajiny, ve znění pozdějších předpisů, vč. prováděcího nařízení Rady EU 2022/581 ze dne [DATE_1], ve znění pozdějších předpisů (dále jen „Předpisy o mezinárodních sankcích“). Poskytovatel prohlašuje, že u něho, jakož ani u okruhu sledovaných subjektů dle právních Předpisů o mezinárodních sankcích vztahujícího se k plnění Smlouvy a/nebo Dílčích smluv není dána překážka uzavření či plnění Smlouvy a/nebo Dílčích smluv. Dále výslovně Poskytovatel zvláště prohlašuje, že nezpřístupní žádné finanční prostředky ani hospodářské zdroje sankcionovaným subjektům ve smyslu tohoto odstavce Smlouvy. Pro vyloučení pochybností se stanoví, že: (i) prohlášení musí být v platnosti po celou dobu plnění Smlouvy, resp. 
Dílčích smluv, a (ii) jsou-li do tohoto prohlášení zahrnuti poddodavatelé či jiné třetí osoby, je Poskytovatel povinen zjistit skutečnosti vztahující se k těmto třetím osobám s řádnou péčí, přinejmenším ověřením informace u třetích osob a prověřením veřejných rejstříků a evidencí. Poskytovatel je povinen zajistit smluvně dodržování příslušných povinností a omezovat rizika vyplývajících z okolností vedoucích k mezinárodním sankcím, a zavazuje se zajistit, aby jeho prohlášení dle tohoto odstavce Smlouvy zůstala pravdivá a v platnosti po celou dobu účinnosti Smlouvy a/nebo Dílčích smluv. V případě, že Poskytovatel zjistí, že pravdivost jeho prohlášení je, byť jen ohrožena, je povinen o tom Objednatele bezodkladně písemně vyrozumět.\n\nSmluvní strany se dohodly, že pokud to bude potřebné ke splnění požadavků v oblasti kybernetické bezpečnosti stanovených obecně závaznými právními předpisy, zejména v návaznosti na nový zákon č. 264/2025 Sb., o kybernetické bezpečnosti, který nabyl účinnosti dne [DATE_2], a související prováděcí právní předpisy, uzavřou bez zbytečného odkladu po výzvě Objednatele písemný dodatek k této Smlouvě a/nebo Dílčí smlouvě zohledňující takové požadavky, a to formou úpravy či doplnění ustanovení týkajících se zajištění bezpečnostních požadavků v souladu s novou právní úpravou a implementovaným systémem řízení bezpečnosti informací na straně Objednatele a/nebo koncového zákazníka Objednatele. Náklady na bezpečnost informací v důsledku změny legislativy v oblasti bezpečnosti informací nese Poskytovatel.\n\nSchválení poskytnutého Plnění a převzetí výstupů\n\nPoskytovatel splní svou povinnost řádně poskytnout Plnění dnem, kdy je příslušná činnost řádně vykonána a její výstup v Objednatelem požadované formě (dále jen „výstup“) řádně předán Objednateli. 
Poskytovatel je povinen vypracovat písemnou zprávu, která bude obsahovat zejména údaje o Objednateli a Poskytovateli, číslo této Smlouvy a Dílčí smlouvy, obsah a rozsah poskytnutého Plnění, závěr z poskytnutého Plnění, popř. doporučení Poskytovatele pro další postup Objednatele. Výstup bude Objednateli Poskytovatelem předán v českém jazyce v dohodnutých termínech buď v listinné podobě vytištěné v jednom (1) originálu nebo v elektronické podobě ve formátu požadovaném Objednatelem. \n\nSplnění povinnosti Poskytovatele podle odstavce 6.1 Smlouvy Smluvní strany osvědčí sepsáním protokolu o schválení poskytnutého Plnění a předání a převzetí výstupu, obsahujícího soupis poskytnutého Plnění, včetně rozpisu hodin odpracovaných Poskytovatelem při plnění jednotlivých úkolů, a označení veškerých předávaných výstupů, který bude vyhotoven ve dvou (2) vyhotoveních s platností originálu a bude opatřen podpisem oprávněných osob obou Smluvních stran (dále jen „Akceptační protokol“), přičemž každá ze Smluvních stran obdrží po jednom (1) vyhotovení. Takto vyhotovený Akceptační protokol předá Poskytovatel Objednateli vždy do pěti (5) kalendářních dnů od skončení příslušného měsíce, za který se Akceptační protokol vyhotovuje. \n\nObjednatel je oprávněn odmítnout převzetí výstupu, a tedy podepsat Akceptační protokol s výrokem „Neakceptováno“, pokud Plnění nebylo poskytnuto řádně v souladu s touto Smlouvou a Dílčí smlouvou a/nebo ve sjednané kvalitě a/nebo pokud výstup neobsahoval veškeré údaje požadované Objednatelem a/nebo Objednatel nesouhlasí s počtem hodin poskytnutého Plnění, které budou Objednateli účtovány, přičemž v takovém případě Objednatel důvody odmítnutí převzetí výstupu písemně Poskytovateli sdělí, a to nejpozději do pěti (5) pracovních dnů od předání Akceptačního protokolu. Na následné předání výstupu se použijí výše uvedená ustanovení tohoto článku Smlouvy. 
Pokud Objednatel uplatní písemný nárok na odstranění vad výstupu, zavazuje se Poskytovatel tyto vady odstranit bez zbytečného odkladu, nejpozději však do pěti (5) pracovních dnů, nestanoví-li Objednatel jinak. \n\nV případě zjevných vad poskytnutého Plnění nebo jeho výstupů není Objednatel povinen Plnění schválit a výstupy převzít a do odstranění těchto vad není povinen podepsat Akceptační protokol s výrokem „Akceptováno“ a zaplatit fakturovanou cenu Plnění. \n\nPovinnost mlčenlivosti a zpracování osobních údajů\n\nSmluvní strany sjednávají, že za důvěrné informace považují takové informace, které získají od druhé Smluvní strany, a o kterých vzhledem k povaze takových informací mohly předpokládat, že na zachování jejich důvěrnosti má druhá Smluvní strana oprávněný zájem, nebo které nejsou v obchodních kruzích běžně dostupné, a o kterých vzhledem k povaze takových informací mohly předpokládat, že na zachování jejich důvěrnosti má druhá Smluvní strana oprávněný zájem, zejména pak informace, údaje a skutečnosti o jakýchkoliv obchodních, finančních, technických, právních a jiných skutečnostech, které by s ohledem na dané podmínky mohly být považovány za důvěrné, poskytnuté či jakkoliv zpřístupněné jednou ze Smluvních stran či jejími zástupci druhé Smluvní straně či jejím zástupcům, ať v ústní, písemné, grafické, elektronické či jiné formě, které se Smluvní strany dozvěděly v souvislosti se Smlouvou a/nebo Dílčí smlouvou, a to bez ohledu zda jsou nebo nejsou označené za důvěrné informace (dále jen „Důvěrné informace“). \n\nS těmito Důvěrnými informacemi budou nakládat jako s vlastním obchodním tajemstvím, aniž by bylo nutné takové informace jako Důvěrné vždy jednotlivě označovat. Výše uvedené nevylučuje možnost v jednotlivých případech při zvýšeném zájmu toto označení pro jednotlivé informace použít. 
Smluvní strany berou zároveň na vědomí, že některé z Důvěrných informací jsou také předmětem obchodního tajemství druhé Smluvní strany, chráněným dle příslušných ustanovení Občanského zákoníku. \n\nKaždá ze Smluvních stran se zavazuje vynaložit maximální úsilí, které lze spravedlivě požadovat, aby důvěrnost Důvěrných informací druhé Smluvní strany byla důsledně dodržována jejími pracovníky i osobami, které případně, v souladu s dohodou uzavřenou s druhou Smluvní stranou, k plnění účelu spolupráce použije. Použije-li některá ze Smluvních stran k plnění třetí osoby, je oprávněna zpřístupnit jí Důvěrné informace získané od druhé Smluvní strany pouze s jejím souhlasem a v rozsahu nezbytně nutném pro jí poskytované plnění, a je rovněž povinna zavázat třetí osobu povinností mlčenlivosti v rozsahu dle Smlouvy. Za porušení povinností třetí osobou odpovídá Smluvní strana, která jí Důvěrné informace zpřístupnila.\n\nSmluvní strany se dále zavazují:\n\nzachovat mlčenlivost o Důvěrných informací, a to až do doby, kdy se informace této povahy stanou obecně známými za předpokladu, že se tak nestane porušením povinnosti mlčenlivosti;\n\npoužít informace uvedené povahy pouze pro činnosti související s přípravou a plněním Smlouvy a/nebo Dílčí smlouvy, dále tyto informace nerozšiřovat ani nereprodukovat, nezpracovávat je v systémech umělé inteligence (systémech AI), nezpřístupnit je jiným osobám ani je nevyužít pro sebe či pro jinou osobu;\n\nomezit počet svých pracovníků pro styk s těmito Důvěrnými informacemi a přijmout účinná opatření pro zamezení jejich úniku, případně zabezpečit, aby i tyto osoby považovaly uvedené informace za Důvěrné a zachovávaly o nich mlčenlivost.\n\nPovinnost plnit ustanovení dle čl. 7 odst. 
7.1 až 7.4 této Smlouvy se nevztahuje na informace, které:\n\nje Smluvní strana povinna zveřejnit na základě zákonem stanovené povinnosti;\n\nmohou být zveřejněny bez porušení Smlouvy;\n\nbyly písemným souhlasem obou Smluvních stran zproštěny těchto omezení;\n\njsou známé nebo byly zveřejněny jinak, než následkem zanedbání povinnosti jedné ze Smluvních stran;\n\npříjemce je zná dříve, než je sdělí Smluvní strana;\n\njsou vyžádány soudem, státním zastupitelstvím nebo příslušným správním orgánem na základě zákona;\n\nSmluvní strana je sdělí osobě vázané zákonnou povinností mlčenlivosti (např. advokátovi nebo daňovému poradci) za účelem uplatňování svých práv;\n\nje Objednatel povinen sdělit svému zakladateli.\n\nV případě, že se kterákoliv Smluvní strana hodnověrným způsobem dozví, popřípadě bude mít důvodné podezření, že došlo k prozrazení či zpřístupnění Důvěrné informace neoprávněné osobě, je povinna neprodleně tuto skutečnost druhé Smluvní straně oznámit.\n\nPokud Smluvní strana, která poruší svůj závazek vyplývající z tohoto článku Smlouvy, takto způsobí druhé Smluvní straně škodu nebo ona či jiná třetí osoba získá na základě takové skutečnosti majetkový prospěch, má druhá Smluvní strana vůči porušující Smluvní straně nárok na náhradu veškeré jí vzniklé škody a na zaplacení částky odpovídající majetkovému prospěchu získanému v souvislosti s touto skutečností porušující Smluvní stranou či jinou třetí osobou. 
Nárok na náhradu případné škody není sjednáním ani zaplacením kterékoliv smluvní pokuty dle Smlouvy a/nebo Dílčí smlouvy dotčen.\n\nPovinnost ochrany Důvěrných informací trvá bez ohledu na ukončení platnosti a účinnosti Smlouvy a/nebo Dílčích smluv.\n\nPokud řádné poskytování Služeb vyžaduje zpracování osobních údajů zaměstnanců Objednatele, budou tyto osobní údaje zaměstnanců Objednatele v postavení kontaktních osob zpracovávány Poskytovatelem v rozsahu:\n\njméno, příjmení a titul,\n\ne-mailová adresa,\n\ntelefonní číslo.\n\nZpracováním osobních údajů ve smyslu tohoto odstavce Smlouvy se rozumí zejména jejich shromažďování, ukládání na nosiče informací, používání, třídění nebo kombinování, blokování a likvidace s využitím manuálních a automatizovaných prostředků v rozsahu nezbytném pro zajištění řádného poskytování Služeb.\n\nOsobní údaje budou zpracovány po dobu poskytování Služeb. Ukončením této Smlouvy / Dílčí smlouvy nezanikají povinnosti Poskytovatele týkající se bezpečnosti a ochrany osobních údajů až do okamžiku jejich úplné likvidace či předání jinému zpracovateli.\n\nSmluvní strany se dohodly, že Poskytovatel nemá nárok na náhradu nákladů spojených se zpracováním osobních údajů či s plněním povinností vyplývajících z příslušné právní úpravy.\n\nObjednatel je povinen přijmout vhodná opatření na to, aby poskytl subjektům údajů stručným, transparentním, srozumitelným a snadno přístupným způsobem za použití jasných a jednoduchých jazykových prostředků veškeré informace a učinil veškerá sdělení požadovaná Nařízením Evropského parlamentu a Rady (EU) č. 
2016/679 ze dne [DATE_3], obecného nařízení o ochraně osobních údajů (dále jen „Nařízení“) ve spojení se zákonem o zpracování osobních údajů.\n\n\n\nPoskytovatel je při plnění této povinnosti povinen:\n\nzpracovávat osobní údaje pouze na základě doložených pokynů Objednatele;\n\nzohledňovat povahu zpracování osobních údajů a být Objednateli nápomocen pro splnění Objednatelovy povinnosti reagovat na žádosti o výkon práv subjektu údajů, jakož i pro splnění dalších povinností ve smyslu Nařízení;\n\nzajistit, že jeho zaměstnanci budou zpracovávat osobní údaje pouze za podmínek a v rozsahu Poskytovatelem stanoveném;\n\nPoskytovatel je při plnění této povinnosti oprávněn v rozsahu nezbytném pro plnění předmětu Smlouvy / Dílčí smlouvy zapojit do zpracování i další případné zpracovatele, k čemuž mu Objednatel tímto uděluje povolení. \n\nSmluvní strany jsou při zpracování povinny:\n\nzavést technická, organizační, personální a jiná vhodná opatření ve smyslu Nařízení, aby zajistily a byly schopny kdykoliv doložit, že zpracování osobních údajů je prováděno v souladu s Nařízením a zákonem o zpracování osobních údajů tak, aby nemohlo dojít k neoprávněnému nebo nahodilému přístupu k osobním údajům a k datovým nosičům, které tyto údaje obsahují, k jejich změně, zničení či ztrátě, neoprávněným přenosům, k jejich jinému neoprávněnému zpracování, jakož i k jinému zneužití, a tato opatření podle potřeby průběžné revidovat a aktualizovat;\n\nvést a průběžné revidovat a aktualizovat záznamy o zpracování osobních údajů ve smyslu Nařízení;\n\nřádně a včas ohlašovat případná porušení zabezpečení osobních údajů Úřadu pro ochranu osobních údajů a spolupracovat s tímto úřadem v nezbytném rozsahu;\n\nnavzájem se informovat o všech okolnostech významných pro plnění dle tohoto článku Smlouvy;\n\nzachovávat mlčenlivost o osobních údajích a o bezpečnostních opatřeních, jejichž zveřejnění by ohrozilo zabezpečení osobních údajů, a to i po skončení této Smlouvy / Dílčí smlouvy;\n\npostupovat v 
souladu s dalšími požadavky Nařízení a zákona o zpracování osobních údajů, zejména dodržovat obecné zásady zpracování osobních údajů, plnit své informační povinnosti, nepředávat osobní údaje třetím osobám bez potřebného oprávnění, respektovat práva subjektů údajů a poskytovat v této souvislosti nezbytnou součinnost.\n\nOchrana autorských práv\n\nPodpisem Smlouvy Poskytovatel poskytuje Objednateli na dobu trvání majetkových práv autorských nevypověditelnou, převoditelnou, výhradní a územně neomezenou licenci k vytváření kopií, užívání a zpřístupnění dalším osobám všech výstupů a dále jakýchkoliv dokumentů, stanovisek, listin či návrhů vztahujících se k předmětu plnění Smlouvy / Dílčí smlouvy nebo vytvořených v souvislosti s ní Poskytovatelem či jeho poddodavateli, jež podle obecně závazných právních předpisů představují autorská díla nebo práva pořizovatele k jím pořízené databázi, včetně práva upravovat a )měnit takováto autorská díla nebo databáze.\n\nObjednatel není ve svých právech k užití výstupů a jakýchkoliv dokumentů, stanovisek, listin či návrhů vztahujících se k předmětu plnění Smlouvy / Dílčí smlouvy nebo vytvořených v souvislosti s ní Poskytovatelem či jeho poddodavateli, nijak omezen. Objednatel je oprávněn bez souhlasu Poskytovatele výstupy a jakékoliv dokumenty, stanoviska, listiny či návrhy vztahující se k předmětu plnění Smlouvy / Dílčí smlouvy nebo vytvořené v souvislosti s ní Poskytovatelem či jeho poddodavateli, nebo jejich části upravovat či doplňovat.\n\nPoskytovatel není oprávněn výstupy a jakékoliv dokumenty, stanoviska, listiny či návrhy vztahující se k plnění předmětu Smlouvy, resp. Dílčí smlouvy, nebo jejich části, jakkoliv rozšiřovat bez předchozího písemného souhlasu Objednatele. Přenechání výstupů a jakýchkoliv dokumentů, stanovisek, listin či návrhů vztahujících se k plnění předmětu Smlouvy, resp. 
Dílčí smlouvy, nebo jejich částí Poskytovatelem třetí osobě bez předchozího písemného souhlasu Objednatele se považuje za podstatné porušení Smlouvy.\n\nPoskytovatel odpovídá za to, že plnění předmětu Smlouvy / Dílčí smlouvy, nezasahuje a nebude zasahovat do práv jiných osob, zejména práv z průmyslového nebo jiného duševního vlastnictví, a to pro jakékoliv využití plnění v [COUNTRY_2] i v zahraničí.\n\nSmluvní strany tímto sjednávají, že veškerá finanční vyrovnání za poskytnutí licence dle tohoto článku 8 Smlouvy jsou zahrnuta v Ceně dle článku 2 Smlouvy.\n\nPoddodavatelé\n\nPoskytuje-li Poskytovatel Objednateli část plnění předmětu Smlouvy, resp. Dílčí smlouvy, prostřednictvím poddodavatele písemně schváleným ze strany Objednatele, je za veškerá taková plnění poddodavatele odpovědný Poskytovatel sám, jako kdyby tato plnění byla poskytována Poskytovatelem. \n\nPoskytovatel je povinen zajistit, aby všichni poddodavatelé měli platná příslušná oprávnění, odbornou kvalifikaci a dostatek odborných zkušeností, jež jsou nezbytné pro poskytování příslušných částí Služeb dle jejich smluv s Poskytovatelem. Žádná poddodavatelská smlouva nezakládá smluvní vztahy mezi Objednatelem a poddodavatelem. \n\nPoskytovatel je dále povinen smluvně zajistit, že i jeho poddodavatelé, kteří se budou podílet na plnění dle Smlouvy, resp. Dílčích smluv, se zaváží dodržovat v plném rozsahu ujednání mezi Poskytovatelem a Objednatelem a nebudou v rozporu s požadavky Objednatele uvedenými ve Smlouvě a/nebo Dílčích smlouvách.\n\nPokud Objednatel shledá, že plnění předmětu Smlouvy, resp. Dílčí smlouvy, uskutečněné poddodavatelem nedosahuje potřebných kvalit, je vadné nebo nevykazuje jiné náležitosti požadované Objednatelem, nebo že sám poddodavatel není subjektem kompetentním pro provádění plnění z této Smlouvy, resp. Dílčí smlouvy, je oprávněn požadovat, aby Poskytovatel neprodleně svěřil takto identifikovanou část plnění jinému poddodavateli, nebo se ujal této části plnění sám. 
\n\nZajištění plnění, která Poskytovatel svěří poddodavateli, není poddodavatel oprávněn zadat třetím osobám. Poskytovatel je povinen na tuto skutečnost poddodavatele upozornit před uzavřením poddodavatelské smlouvy a odpovídá za její dodržování. \n\nUzavření jakékoliv poddodavatelské smlouvy nebo uskutečnění jakéhokoliv smluvního plnění poddodavatelem bez předchozího písemného souhlasu Objednatele, případně jakákoliv změna v osobě poddodavatele bez předchozího písemného souhlasu Objednatele, budou považovány za podstatné porušení Smlouvy.\n\nCompliance ujednání\n\nSmluvní strany se zavazují dodržovat právní předpisy a chovat se tak, aby jejich jednání nemohlo vzbudit důvodné podezření ze spáchání nebo páchání trestného činu, a to ani takového, který by mohl být přičitatelný Objednateli podle zákona č. 418/2011 Sb., o trestní odpovědnosti právnických osob a řízení proti nim, ve znění pozdějších předpisů.\n\nSmluvní strany se zavazují, že učiní všechna opatření k tomu, aby se nedopustily ony a ani nikdo z jejich zaměstnanců či zástupců jakékoliv formy korupčního jednání, zejména jednání, které by mohlo být vnímáno jako přijetí úplatku, podplácení nebo nepřímé úplatkářství či jiný trestný čin spojený s korupcí dle zákona č. 40/2009 Sb., trestní zákoník, ve znění pozdějších předpisů.\n\nSmluvní strany se zavazují, že:\n\nneposkytnou, nenabídnou ani neslíbí úplatek jinému nebo pro jiného v souvislosti s obstaráváním věcí obecného zájmu anebo v souvislosti s podnikáním svým nebo jiného; \n\núplatek nepřijmou, ani si jej nedají slíbit, ať už pro sebe nebo pro jiného v souvislosti s obstaráním věcí obecného zájmu nebo v souvislosti s podnikáním svým nebo jiného. 
\n\nÚplatkem se přitom rozumí neoprávněná výhoda spočívající v přímém majetkovém obohacení nebo jiném zvýhodnění, které se dostává nebo má dostat uplácené osobě nebo s jejím souhlasem jiné osobě, a na kterou není nárok.\n\nSmluvní strany nebudou ani u svých obchodních partnerů tolerovat jakoukoliv formu korupce či uplácení.\n\nV případě, že je zahájeno trestní stíhání Poskytovatele, zavazuje se Poskytovatel o tomto bez zbytečného odkladu Objednatele písemně informovat.\n\nSankce\n\nV případě nedodržení termínu poskytnutí Plnění a/nebo předání výstupu a/nebo odstranění vad poskytnutého Plnění ve sjednané kvalitě podle této Smlouvy a Dílčí smlouvy ze strany Poskytovatele je Poskytovatel povinen uhradit Objednateli smluvní pokutu ve výši [MONETARY_AMOUNT_2]. \n\nZa každé jednotlivé porušení povinnosti mlčenlivosti podle příslušných ustanovení článku 7 Smlouvy, je Objednatel oprávněn požadovat od Poskytovatele zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_3]. \n\nZa každé jednotlivé porušení povinnosti Poskytovatele při zpracování osobních údajů podle příslušných ustanovení článku 7 Smlouvy, je Objednatel oprávněn požadovat od Poskytovatele zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_3]. \n\nZa každé jednotlivé porušení povinností Poskytovatele stanovených v čl. 5.7 až 5.9 této Smlouvy je Objednatel oprávněn požadovat po Poskytovateli zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_4] za každé jednotlivé porušení povinnosti.\n\nJestliže se jakékoli prohlášení Poskytovatele podle článku 8 Smlouvy ukáže nepravdivým nebo zavádějícím nebo Poskytovatel poruší jiné povinnosti podle článku 8 této Smlouvy, je Objednatel oprávněn požadovat od Poskytovatele zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_4] za každé jednotlivé porušení povinnosti.\n\nV případě prodlení Objednatele s úhradou řádně vystavených a doručených faktur, je Objednatel povinen uhradit Poskytovateli úrok z prodlení dle nařízení vlády č. 
351/2013 Sb., kterým se určuje výše úroků z prodlení a nákladů spojených s uplatněním pohledávky, určuje odměna likvidátora, likvidačního správce a člena orgánu právnické osoby jmenovaného soudem a upravují některé otázky Obchodního věstníku a veřejných rejstříků právnických a fyzických osob, evidence svěřenských fondů a evidence údajů o skutečných majitelích.\n\nVyúčtování smluvní pokuty / úroků z prodlení – penalizační faktura, musí být druhé Smluvní straně zasláno datovou zprávou prostřednictvím datové schránky. Smluvní pokuta a úroky z prodlení jsou splatné ve lhůtě třiceti (30) kalendářních dnů ode dne doručení penalizační faktury povinné Smluvní straně. Úhrada smluvní pokuty / úroků z prodlení se provádí bankovním převodem na účet oprávněné Smluvní strany uvedený v penalizační faktuře. Částka se považuje za zaplacenou okamžikem jejího připsání ve prospěch účtu oprávněné Smluvní strany.\n\nZaplacením smluvní pokuty není dotčen nárok Objednatele na náhradu újmy v celém rozsahu způsobené újmy, ani povinnost Poskytovatele řádně dokončit plnění předmětu Smlouvy, popř. odstranit vady.\n\nObjednatel je v případě uplatnění smluvní pokuty vůči Poskytovateli dle této Smlouvy a v případě neuhrazení smluvní pokuty ze strany Poskytovatele oprávněn využít institut započtení vzájemných pohledávek.\n\nDoba trvání Smlouvy\n\nTato Smlouva nabývá platnosti dnem jejího podpisu oběma Smluvními stranami a účinnosti dnem jejího uveřejnění v registru smluv v souladu se zákonem č. 340/2015 Sb., o registru smluv, ve znění pozdějších předpisů. Zveřejnění Smlouvy v registru smluv zajistí Objednatel. \n\nTato Smlouva se uzavírá na dobu určitou, a to na dobu 48 měsíců od nabytí účinnosti. \n\nSmlouva / Dílčí smlouva může být ukončena dohodou Smluvních stran v písemné formě, přičemž účinky zrušení Smlouvy / Dílčí smlouvy nastanou k okamžiku stanoveném v této dohodě. 
Nebude-li takovýto okamžik dohodou stanoven, pak tyto účinky nastanou ke dni uzavření takovéto dohody.\n\nSmluvní strany jsou oprávněny od Smlouvy a/nebo Dílčí smlouvy odstoupit v případě jejího podstatného porušení druhou Smluvní stranou, za podmínek uvedených v § 2001 a násl. Občanského zákoníku.\n\nZa podstatné porušení této Smlouvy a/nebo Dílčí Smlouvy Poskytovatelem, které zakládá právo Objednatele na odstoupení od této Smlouvy a/nebo Dílčí smlouvy, se považuje zejména:\n\nnedodržení právních předpisů Poskytovatelem při poskytování Služeb / Plnění;\n\nprodlení Poskytovatele s poskytováním Služeb / Plnění z důvodů spočívajících výlučně na straně Poskytovatele a/nebo předáním výstupu po dobu delší než deset (10) kalendářních dnů;\n\nporušení jakékoli povinnosti Poskytovatele dle článku 7 této Smlouvy;\n\nporušení jakékoli povinnosti Poskytovatele dle článku 8 této Smlouvy;\n\nporušení jakékoliv povinnosti Poskytovatele stanovené v čl. 5.7 až 5.9 této Smlouvy;\n\nuzavření jakékoliv poddodavatelské smlouvy nebo uskutečnění jakéhokoliv smluvního plnění poddodavatelem bez předchozího písemného souhlasu Objednatele, případně jakákoliv změna v osobě poddodavatele bez předchozího písemného souhlasu Objednatele, ve smyslu čl. 9 odst. 9.5 Smlouvy; \n\npostup Poskytovatele při poskytování Služeb / Plnění v rozporu s oprávněnými pokyny Objednatele.\n\nObjednatel je dále oprávněn od této Smlouvy a/nebo Dílčí Smlouvy odstoupit v případě, že \n\nPoskytovatel přestane splňovat požadavky na kvalifikaci uvedené ve výzvě k podání nabídky; \n\nvůči majetku Poskytovatele probíhá insolvenční řízení, v němž bylo vydáno rozhodnutí o úpadku, pokud to právní předpisy umožňují;\n\ninsolvenční návrh na Poskytovatele byl zamítnut proto, že majetek Poskytovatele nepostačuje k úhradě nákladů insolvenčního řízení;\n\nbyl Poskytovatel pravomocně odsouzen pro trestný čin. 
\n\nPoskytovatel je oprávněn odstoupit od této Smlouvy a/nebo Dílčí Smlouvy, pokud Objednatel bude přes písemné upozornění Poskytovatele déle než třicet (30) kalendářních dnů od písemného upozornění Poskytovatele v prodlení s plněním své platební povinnosti vůči Poskytovateli. \n\nÚčinky odstoupení nastávají uplynutím lhůty deseti (10) kalendářních dnů, která počíná běžet prvním dnem následujícím po doručení projevu vůle odstoupit od Smlouvy a/nebo Dílčí Smlouvy druhé Smluvní straně. Poskytovatel je v případě odstoupení od Smlouvy / Dílčí smlouvy povinen učinit již jen takové úkony, bez nichž by mohly být zájmy Objednatele vážně ohroženy.\n\nOdstoupení od Smlouvy a/nebo Dílčí Smlouvy se nedotýká zejména práva na náhradu újmy, smluvní pokuty a povinnosti mlčenlivosti, ani ujednání, které mají vzhledem ke své povaze zavazovat Smluvní strany i po ukončení Smlouvy / Dílčí smlouvy.\n\nSmluvní strany jsou oprávněny Smlouvu a/nebo Dílčí Smlouvu vypovědět, i bez uvedení důvodu, na základě písemné výpovědi. Výpovědní doba činí tři (3) měsíce a počíná běžet dnem doručení výpovědi druhé Smluvní straně.\n\nV případě jakéhokoliv skončení tohoto smluvního vztahu podle Smlouvy a/nebo Dílčí Smlouvy, je Poskytovatel vždy povinen neprodleně předat Objednateli veškeré věci a dokumenty, vztahující se k plnění této Smlouvy a/nebo Dílčí Smlouvy nebo poskytnuté za účelem plnění předmětu Smlouvy a/nebo Dílčí Smlouvy, nejpozději však do pěti (5) pracovních dnů ode dne ukončení smluvního vztahu. \n\nZávěrečná ustanovení\n\nSmluvní strany potvrzují, že si při uzavírání Smlouvy vzájemně sdělily všechny skutkové a právní okolnosti, o nichž ví nebo vědět musí, tak, aby se každá ze Smluvních stran mohla přesvědčit o možnosti uzavřít platnou Smlouvu a aby byl každé ze Smluvních stran zřejmý zájem druhé Smluvní strany Smlouvu uzavřít. \n\nSmluvní strany výslovně potvrzují, že si vzájemně sdělily veškeré okolnosti důležité pro uzavření Smlouvy. 
Smluvní strany prohlašují, že se dohodly o veškerých náležitostech Smlouvy.\n\nPoskytovatel prohlašuje a potvrzuje, že na sebe přebírá nebezpečí změny okolností ve smyslu ustanovení § 1765 odst. 2 Občanského zákoníku.\n\nSmluvní strany si ve smyslu ustanovení § 1794 odst. 2 Občanského zákoníku ujednaly, že se Poskytovatel výslovně vzdává jeho práva ve smyslu ustanovení § 1793 Občanského zákoníku a souhlasí s cenou tak, jak byla Smluvními stranami sjednána výše v této Smlouvě.\n\nSmluvní strany se zavazují vyvinout maximální úsilí k odstranění vzájemných sporů, vzniklých na základě této Smlouvy nebo v souvislosti s touto Smlouvou, a k jejich vyřešení zejména prostřednictvím jednání odpovědných pracovníků nebo jiných pověřených subjektů. Nedohodnou-li se na způsobu řešení vzájemného sporu, dohodly se Smluvní strany, že místně příslušným soudem pro řešení případných sporů bude soud příslušný dle místa sídla Objednatele.\n\nTato Smlouva může být měněna pouze vzestupně očíslovanými písemnými dodatky ke Smlouvě podepsanými oběma Smluvními stranami. \n\nDnem doručení písemností odeslaných na základě této Smlouvy nebo v souvislosti s touto Smlouvou prostřednictvím provozovatele poštovních služeb, pokud není prokázán jiný den doručení, se rozumí poslední den lhůty, ve které byla písemnost pro adresáta uložena u provozovatele poštovních služeb, a to i tehdy, jestliže se adresát o jejím uložení nedověděl. Smluvní strany tímto výslovně vylučují ust. § 573 Občanského zákoníku.\n\nPokud kterékoli ustanovení této Smlouvy nebo jeho část je nebo se stane neplatným či nevynutitelným, nebude mít tato neplatnost či nevynutitelnost vliv na platnost či vynutitelnost ostatních ustanovení této Smlouvy nebo jejích částí, pokud nevyplývá přímo z obsahu této Smlouvy, že toto ustanovení nebo jeho část nelze oddělit od dalšího obsahu. 
V takovém případě se obě Smluvní strany zavazují neúčinné a neplatné ustanovení nahradit novým ustanovením, které je svým účelem a významem co nejbližší ustanovení této Smlouvy, jež má být nahrazeno. \n\nPro případ, že tato Smlouva není uzavírána za přítomnosti všech Smluvních stran, platí, že Smlouva nebude uzavřena, pokud ji Poskytovatel podepíše s jakoukoliv změnou či odchylkou, byť nepodstatnou, nebo dodatkem, ledaže Objednatel takovou změnu či odchylku nebo dodatek následně schválí. To platí i v případě připojení obchodních podmínek Poskytovatele, které budou odporovat svým obsahem jakýmkoliv způsobem textu této Smlouvy. \n\nTato Smlouva je vyhotovena ve čtyřech (4) rovnocenných vyhotoveních, z nichž každé má platnost originálu. Každá ze Smluvních stran obdrží po dvou (2) stejnopisech. V případě, že bude Smlouva uzavírána elektronicky obdrží Poskytovatel/Objednatel elektronický dokument podepsaný v souladu s platnou právní úpravou.\n\nKontaktní údaje Smluvních stran pro doručování jsou následující:\n\nKontaktní osoba [PERSON_2]:\n\nxxx\n\ne-mail: xxx tel. xxx\n\nKontaktní osoba [PERSON_3]:\n\n xxx\n\n e-mail: xxx tel. xxx\n\n\n\n Nedílnou součástí této Smlouvy jsou následující přílohy:\n\nPříloha č. 1 – Cena\n\nPříloha č. 2 – Akceptační protokol\n\n\n\n\n\n\n\n\n\nSmluvní strany prohlašují, že tato Smlouva je projevem jejich pravé a svobodné vůle a nebyla sjednána v tísni ani za jinak jednostranně nevýhodných podmínek. Na důkaz toho připojují Smluvní strany své podpisy.\n\n\n\nV Praze dne: Dle elektronického podpisu\tV Praze dne: Dle elektronického podpisu\n\n\n\n\n\n\n\n\n\n\n\n__________________________________\n\n__________________________________\n\nxxx\n\n[PERSON_1]\n\nxxx\n\n[ORGANIZATION_1]\n\n\n\n\n\n\n\nV Praze dne: Dle elektronického podpisu\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n__________________________________\n\nxxx\n\nxxx\n\n[ORGANIZATION_1]\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPříloha č. 
1 – Cena\n\n\n\nHodinová sazba v Kč bez DPH za poskytování Služeb:\n\n\n\nHodinová sazba v Kč bez DPH\n\n2 190,- \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPříloha č. 2 – Akceptační protokol\n\n\n\nAkceptační protokol\n\n\n\n\n\nPlnění / výstup:\t……………………………………………………………………………………….\n\n\n\nPředáno dne: \n\n\n\n…………………………………………………………………………………………………………..\n\n\n\n\n\nPřevzal: \t\t\t\t\t\tPředal:\n\n\n\n…………………………………………… \t\t……………………………………………….\n\nObjednatel:\t\t\t\t\t\t[ORGANIZATION_3]:\n\nNárodní agentura pro komunikační a  \n\ninformační technologie, s. p. \n\nJméno, příjmení \t\t\t\t \n\nFunkce\t\t\t \n\n\t\n\n\n\nAkceptováno dne: datum akceptace\n\n\n\nVýsledek akceptace: \tAKCEPTOVÁNO / NEAKCEPTOVÁNO\n\n\n\nAkceptoval\n\nFunkce\n\nPodpis\n\n\n\n\n\n\n\n\n\n\n\n\n\nPřipomínky, výhrady:\n\n\n\npřipomínky a výhrady k předanému Plnění" } diff --git a/packages/anonymize/src/__test__/fixtures/contracts/cs/sanofi-bonus-agreement.snapshot.json b/packages/anonymize/src/__test__/fixtures/contracts/cs/sanofi-bonus-agreement.snapshot.json index 467f694e..d8cbf035 100644 --- a/packages/anonymize/src/__test__/fixtures/contracts/cs/sanofi-bonus-agreement.snapshot.json +++ b/packages/anonymize/src/__test__/fixtures/contracts/cs/sanofi-bonus-agreement.snapshot.json @@ -103,11 +103,11 @@ "source": "deny-list" }, { - "start": 687, + "start": 670, "end": 691, "label": "registration number", - "text": "1603", - "source": "trigger" + "text": "oddíl Pr, vložka 1603", + "source": "regex" }, { "start": 705, @@ -152,5 +152,5 @@ "source": "country" } ], - "redactedText": "Příloha č. 3a26 ke Smlouvě o poskytnutí obratového bonusu (COMMA CAF ID 266, ze dne 2.4. 
2019 ) uzavřené mezi smluvními stranami, kterými jsou:\n\n\n\n\n\n\n\n[ORGANIZATION_1]\n\nSe sídlem: [ADDRESS_1]\n\nIČO: [REGISTRATION_NUMBER_1]\n\nDIČ: [TAX_IDENTIFICATION_NUMBER_1]\n\nBankovní spojení: [BANK_ACCOUNT_NUMBER_1] \n\nZapsaná v obchodním rejstříku vedeném [ORGANIZATION_2], [REGISTRATION_NUMBER_2] \n\nZastoupená: [OU OU], Head of Trade Department [COUNTRY_1]\n\n(dále jen „Společnost“)\n\n \n\na \n\n\n\n[ORGANIZATION_3]\n\nSídlo: [ADDRESS_2]\n\nIČO: [REGISTRATION_NUMBER_3]\n\nDIČ: [TAX_IDENTIFICATION_NUMBER_2]\n\nBankovní spojení: [XX XX]\n\nZapsaná v obchodním rejstříku vedeném u KS [ADDRESS_3], oddíl Pr, vložka [REGISTRATION_NUMBER_4]\n\nZastoupená: [PERSON_1], ředitelka\n\n(dále též „Zdravotnické zařízení“).\n\n\n\nObsahem této přílohy je dohoda o podmínkách dosažení a o výši obratového bonusu pro tyto produkty: \n\n\n\n[XX XX]\n\n\n\nReferenční období: [XX XX]\n\n\n\nPotřebná výše obratu v referenčním období:\n\n \n\n[XX XX] \n\n[XX XX] \n\n[XX XX] \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nObratový bonus poskytnutý Zdravotnickému zařízení bude vyplacen pouze podle jednoho pásma, a to v souladu s dosaženou výší obratu Zdravotnickým zařízením v referenčním období:\n\n\n\n [XX XX] % z obratu dosaženého Zdravotnickým zařízením\n\n [XX XX] % z obratu dosaženého Zdravotnickým zařízením\n\n [XX XX] % z obratu dosaženého Zdravotnickým zařízením\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nV Praze, dne ...[DATE_1]..........\tV Blansku, dne …[DATE_2].........\n\n\n\n\n\n_______________________________________\t __________________________________________\n\n[ORGANIZATION_4]\n\n[OU OU]\t[PERSON_1] \n\nHead of Trade Department [COUNTRY_1]\t ředitelka" + "redactedText": "Příloha č. 3a26 ke Smlouvě o poskytnutí obratového bonusu (COMMA CAF ID 266, ze dne 2.4. 
2019 ) uzavřené mezi smluvními stranami, kterými jsou:\n\n\n\n\n\n\n\n[ORGANIZATION_1]\n\nSe sídlem: [ADDRESS_1]\n\nIČO: [REGISTRATION_NUMBER_1]\n\nDIČ: [TAX_IDENTIFICATION_NUMBER_1]\n\nBankovní spojení: [BANK_ACCOUNT_NUMBER_1] \n\nZapsaná v obchodním rejstříku vedeném [ORGANIZATION_2], [REGISTRATION_NUMBER_2] \n\nZastoupená: [OU OU], Head of Trade Department [COUNTRY_1]\n\n(dále jen „Společnost“)\n\n \n\na \n\n\n\n[ORGANIZATION_3]\n\nSídlo: [ADDRESS_2]\n\nIČO: [REGISTRATION_NUMBER_3]\n\nDIČ: [TAX_IDENTIFICATION_NUMBER_2]\n\nBankovní spojení: [XX XX]\n\nZapsaná v obchodním rejstříku vedeném u KS [ADDRESS_3], [REGISTRATION_NUMBER_4]\n\nZastoupená: [PERSON_1], ředitelka\n\n(dále též „Zdravotnické zařízení“).\n\n\n\nObsahem této přílohy je dohoda o podmínkách dosažení a o výši obratového bonusu pro tyto produkty: \n\n\n\n[XX XX]\n\n\n\nReferenční období: [XX XX]\n\n\n\nPotřebná výše obratu v referenčním období:\n\n \n\n[XX XX] \n\n[XX XX] \n\n[XX XX] \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nObratový bonus poskytnutý Zdravotnickému zařízení bude vyplacen pouze podle jednoho pásma, a to v souladu s dosaženou výší obratu Zdravotnickým zařízením v referenčním období:\n\n\n\n [XX XX] % z obratu dosaženého Zdravotnickým zařízením\n\n [XX XX] % z obratu dosaženého Zdravotnickým zařízením\n\n [XX XX] % z obratu dosaženého Zdravotnickým zařízením\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nV Praze, dne ...[DATE_1]..........\tV Blansku, dne …[DATE_2].........\n\n\n\n\n\n_______________________________________\t __________________________________________\n\n[ORGANIZATION_4]\n\n[OU OU]\t[PERSON_1] \n\nHead of Trade Department [COUNTRY_1]\t ředitelka" } diff --git a/packages/anonymize/src/__test__/fixtures/contracts/de/geschaeftsfuehrer-dienstvertrag.snapshot.json b/packages/anonymize/src/__test__/fixtures/contracts/de/geschaeftsfuehrer-dienstvertrag.snapshot.json index 1477f3bd..69689d1a 100644 --- 
a/packages/anonymize/src/__test__/fixtures/contracts/de/geschaeftsfuehrer-dienstvertrag.snapshot.json +++ b/packages/anonymize/src/__test__/fixtures/contracts/de/geschaeftsfuehrer-dienstvertrag.snapshot.json @@ -45,7 +45,7 @@ "end": 311, "label": "date of birth", "text": "21. März 1968", - "source": "regex" + "source": "trigger" }, { "start": 326, @@ -66,7 +66,7 @@ "end": 467, "label": "date of birth", "text": "09. Juli 1982", - "source": "regex" + "source": "trigger" }, { "start": 482, diff --git a/packages/anonymize/src/__test__/fixtures/contracts/en/software-license-agreement.snapshot.json b/packages/anonymize/src/__test__/fixtures/contracts/en/software-license-agreement.snapshot.json index c7b463db..48ef74e9 100644 --- a/packages/anonymize/src/__test__/fixtures/contracts/en/software-license-agreement.snapshot.json +++ b/packages/anonymize/src/__test__/fixtures/contracts/en/software-license-agreement.snapshot.json @@ -4,10 +4,11 @@ "date": 3, "organization": 8, "address": 6, - "tax identification number": 3, + "tax identification number": 2, "person": 4, "monetary amount": 1, "country": 1, + "bank account number": 1, "email address": 1, "phone number": 1 }, @@ -120,9 +121,9 @@ { "start": 1452, "end": 1462, - "label": "tax identification number", + "label": "bank account number", "text": "4537891022", - "source": "regex" + "source": "trigger" }, { "start": 1572, @@ -209,5 +210,5 @@ "source": "regex" } ], - "redactedText": "SOFTWARE LICENSE AGREEMENT\n\nThis Software License Agreement (the \"Agreement\") is entered into as of [DATE_1]\n(the \"Effective Date\") by and between:\n\n(1) [ORGANIZATION_1], a [ADDRESS_1] corporation\n with its principal place of business at 1209 [ADDRESS_2], DE 19801,\n EIN: [TAX_IDENTIFICATION_NUMBER_1]\n (the \"Licensor\"), represented by its Chief Executive Officer,\n [PERSON_1]; and\n\n(2) [ORGANIZATION_2], a [ADDRESS_3] limited liability company\n with offices at 200 West [ADDRESS_4], NY 10282,\n EIN: [TAX_IDENTIFICATION_NUMBER_2]\n 
(the \"Licensee\"), represented by its Managing Director,\n Dr. [PERSON_2].\n\nRecitals\nLicensor has developed certain proprietary analytics software (\"Software\") and\nis willing to grant Licensee a license under the terms set forth herein.\nLicensee desires to obtain such a license to integrate the Software with its\n[ORGANIZATION_3] treasury operations and to deploy it across the Acme &\nCompany Holdings group of subsidiaries.\n\n1. License Grant. Subject to the terms of this Agreement, Licensor hereby grants\n to Licensee a non-exclusive, non-transferable license to use the Software\n solely for internal business purposes.\n\n2. Fees. Licensee shall pay Licensor an annual license fee of [MONETARY_AMOUNT_1]\n (one million two hundred fifty thousand [COUNTRY_1] dollars), payable to the\n account designated by Licensor ([ORGANIZATION_4], routing 121000248,\n account [TAX_IDENTIFICATION_NUMBER_3]).\n\n3. Notices. Any notice required hereunder shall be sent to the following\n addresses:\n If to Licensor: [ORGANIZATION_1], Attn: Legal Department,\n 1209 [ADDRESS_2], DE 19801; with a copy to general counsel\n at [EMAIL_ADDRESS_1].\n If to Licensee: [ORGANIZATION_2], Attn: General Counsel,\n 200 West [ADDRESS_4], NY 10282; phone: ([PHONE_NUMBER_1].\n\nIN WITNESS WHEREOF, the parties have executed this Agreement as of the\nEffective Date.\n\nLICENSOR: [ORGANIZATION_1] LICENSEE: [ORGANIZATION_2]\nBy: ____________________________ By: ____________________________\nName: [PERSON_3] Name: [PERSON_2]\nTitle: Chief Executive Officer Title: Managing Director\nDate: [DATE_1] Date: [DATE_1]\n" + "redactedText": "SOFTWARE LICENSE AGREEMENT\n\nThis Software License Agreement (the \"Agreement\") is entered into as of [DATE_1]\n(the \"Effective Date\") by and between:\n\n(1) [ORGANIZATION_1], a [ADDRESS_1] corporation\n with its principal place of business at 1209 [ADDRESS_2], DE 19801,\n EIN: [TAX_IDENTIFICATION_NUMBER_1]\n (the \"Licensor\"), represented by its Chief Executive 
Officer,\n [PERSON_1]; and\n\n(2) [ORGANIZATION_2], a [ADDRESS_3] limited liability company\n with offices at 200 West [ADDRESS_4], NY 10282,\n EIN: [TAX_IDENTIFICATION_NUMBER_2]\n (the \"Licensee\"), represented by its Managing Director,\n Dr. [PERSON_2].\n\nRecitals\nLicensor has developed certain proprietary analytics software (\"Software\") and\nis willing to grant Licensee a license under the terms set forth herein.\nLicensee desires to obtain such a license to integrate the Software with its\n[ORGANIZATION_3] treasury operations and to deploy it across the Acme &\nCompany Holdings group of subsidiaries.\n\n1. License Grant. Subject to the terms of this Agreement, Licensor hereby grants\n to Licensee a non-exclusive, non-transferable license to use the Software\n solely for internal business purposes.\n\n2. Fees. Licensee shall pay Licensor an annual license fee of [MONETARY_AMOUNT_1]\n (one million two hundred fifty thousand [COUNTRY_1] dollars), payable to the\n account designated by Licensor ([ORGANIZATION_4], routing 121000248,\n account [BANK_ACCOUNT_NUMBER_1]).\n\n3. Notices. 
Any notice required hereunder shall be sent to the following\n addresses:\n If to Licensor: [ORGANIZATION_1], Attn: Legal Department,\n 1209 [ADDRESS_2], DE 19801; with a copy to general counsel\n at [EMAIL_ADDRESS_1].\n If to Licensee: [ORGANIZATION_2], Attn: General Counsel,\n 200 West [ADDRESS_4], NY 10282; phone: ([PHONE_NUMBER_1].\n\nIN WITNESS WHEREOF, the parties have executed this Agreement as of the\nEffective Date.\n\nLICENSOR: [ORGANIZATION_1] LICENSEE: [ORGANIZATION_2]\nBy: ____________________________ By: ____________________________\nName: [PERSON_3] Name: [PERSON_2]\nTitle: Chief Executive Officer Title: Managing Director\nDate: [DATE_1] Date: [DATE_1]\n" } diff --git a/packages/anonymize/src/__test__/load-dictionaries.ts b/packages/anonymize/src/__test__/load-dictionaries.ts index 228c72b4..8bf74c04 100644 --- a/packages/anonymize/src/__test__/load-dictionaries.ts +++ b/packages/anonymize/src/__test__/load-dictionaries.ts @@ -6,131 +6,42 @@ * Only used in tests — production consumers load and pass * dictionaries themselves. */ -import type { Dictionaries, DictionaryMeta } from "../types"; +import type { Dictionaries } from "../types"; -let cached: Dictionaries | null = null; - -export const loadTestDictionaries = async (): Promise => { - if (cached) return cached; - - const dataModule = await import("@stll/anonymize-data"); +type TestDictionaryScope = { + denyListCountries?: readonly string[]; + nameCorpusLanguages?: readonly string[]; +}; - // Load all dictionaries - const allIds = [...dataModule.ALL_DICTIONARY_IDS]; - const denyList: Record = {}; - const denyListMeta: Record = {}; +const cache = new Map(); - const results = await Promise.all( - allIds.map(async (id) => { - const entries = await dataModule.loadDictionary(id); - return { id, entries }; - }), - ); +const scopeKey = (scope: TestDictionaryScope): string => + JSON.stringify({ + denyListCountries: [...(scope.denyListCountries ?? 
[])].toSorted(), + nameCorpusLanguages: [...(scope.nameCorpusLanguages ?? [])].toSorted(), + }); - for (const { id, entries } of results) { - const meta = dataModule.DICTIONARY_META[id]; - if (!meta) continue; - denyList[id] = entries; - // SAFETY: anonymize-data categories match DenyListCategory at runtime - denyListMeta[id] = meta as DictionaryMeta; +export const loadTestDictionaries = async ( + scope: TestDictionaryScope = {}, +): Promise => { + const key = scopeKey(scope); + const cached = cache.get(key); + if (cached) return cached; + const dataModule = await import("../../../data/dictionaries/index"); + const bundleOptions: Parameters[0] = + {}; + if (scope.denyListCountries !== undefined) { + bundleOptions.countries = scope.denyListCountries; + bundleOptions.cityCountries = scope.denyListCountries; } - - // Load per-language first names and surnames - const NAME_LANGUAGES = [ - "cs", - "sk", - "de", - "pl", - "hu", - "ro", - "fr", - "es", - "it", - "en", - "sv", - ] as const; - - const firstNames: Record = {}; - const surnames: Record = {}; - - await Promise.all( - NAME_LANGUAGES.map(async (lang) => { - try { - const mod = await import( - `@stll/anonymize-data/dictionaries/names/first/${lang}.json` - ); - firstNames[lang] = mod.default; - } catch { - // Not available for this language - } - try { - const mod = await import( - `@stll/anonymize-data/dictionaries/names/surnames/${lang}.json` - ); - surnames[lang] = mod.default; - } catch { - // Not available for this language - } - }), - ); - - // Load city dictionaries for common countries - const CITY_COUNTRIES = [ - "AT", - "AU", - "BE", - "BG", - "BR", - "CA", - "CH", - "CZ", - "DE", - "DK", - "ES", - "FI", - "FR", - "GB", - "GR", - "HR", - "HU", - "IE", - "IT", - "LU", - "NL", - "NO", - "NZ", - "PL", - "PT", - "RO", - "SE", - "SI", - "SK", - "US", - ]; - const cityResults = await Promise.all( - CITY_COUNTRIES.map(async (country) => ({ - country, - entries: await dataModule.loadCityDictionary(country), - 
})), - ); - const citiesByCountry: Record = {}; - const mergedCities: string[] = []; - for (const { country, entries } of cityResults) { - citiesByCountry[country] = entries; - for (const entry of entries) { - mergedCities.push(entry); - } + if (scope.nameCorpusLanguages !== undefined) { + bundleOptions.nameLanguages = scope.nameCorpusLanguages; } const result: Dictionaries = { - firstNames, - surnames, - denyList, - denyListMeta, - cities: mergedCities, - citiesByCountry, + ...(await dataModule.loadDictionaryBundle(bundleOptions)), }; - cached = result; + cache.set(key, result); return result; }; diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index e19b73c0..d63fb4d1 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -9,7 +9,39 @@ import fc from "fast-check"; setDefaultTimeout(120_000); type NativeAdapter = { + NativePreparedSearch: { + new (configJson: string): { + redactStaticEntities: ( + fullText: string, + operators?: Record, + ) => StaticRedactionResult; + }; + fromConfigJsonBytes: (configJson: Buffer) => { + redactStaticEntities: ( + fullText: string, + operators?: Record, + ) => StaticRedactionResult; + }; + fromConfigJsonAndArtifactBytes: ( + configJson: Buffer, + artifactBytes: Buffer, + ) => { + redactStaticEntities: ( + fullText: string, + operators?: Record, + ) => StaticRedactionResult; + }; + fromPreparedPackageBytes: (packageBytes: Buffer) => { + redactStaticEntities: ( + fullText: string, + operators?: Record, + ) => StaticRedactionResult; + }; + }; normalizeForSearch: (text: string) => string; + prepareStaticSearchArtifactsBytes: (configJson: Buffer) => Buffer; + prepareStaticSearchPackageBytes: (configJson: Buffer) => Buffer; + prepareStaticSearchCompressedPackageBytes: (configJson: Buffer) => Buffer; redactStaticEntitiesJson: ( configJson: string, fullText: 
string, @@ -183,6 +215,69 @@ results = [ print(json.dumps(results)) `; +const PYTHON_PREPARED_ARTIFACT_SCRIPT = ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +artifact_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_ARTIFACTS"]) +spec = importlib.util.spec_from_file_location( + "stella_anonymize_core_py", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +artifact_bytes = artifact_path.read_bytes() +if module.prepare_static_search_artifacts_bytes(payload["config_json"]) != artifact_bytes: + raise AssertionError("prepared artifact bytes differ") +prepared = module.PreparedSearch.from_config_json_and_artifact_bytes( + payload["config_json"], + artifact_bytes, +) +print( + prepared.redact_static_entities_json( + payload["text"], + payload.get("operators_json"), + ) +) +`; + +const PYTHON_PREPARED_PACKAGE_SCRIPT = ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +package_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PACKAGE"]) +spec = importlib.util.spec_from_file_location( + "stella_anonymize_core_py", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +package_bytes = package_path.read_bytes() +prepare_fn_name = os.environ.get( + "STELLA_ANONYMIZE_PACKAGE_PREPARE_FN", + "prepare_static_search_package_bytes", +) +if getattr(module, prepare_fn_name)(payload["config_json"]) != package_bytes: + raise AssertionError("prepared package bytes differ") +prepared = module.PreparedSearch.from_prepared_package_bytes(package_bytes) +print( + 
prepared.redact_static_entities_json( + payload["text"], + payload.get("operators_json"), + ) +) +`; + let loadedAdapters: { native: NativeAdapter; pythonModulePath: string; @@ -318,6 +413,120 @@ describe("native adapter parity", () => { ); }); + test("prepared search accepts config JSON bytes", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + + const stringPrepared = new adapters.native.NativePreparedSearch( + CONFIG_JSON, + ); + const bytesPrepared = + adapters.native.NativePreparedSearch.fromConfigJsonBytes( + Buffer.from(CONFIG_JSON), + ); + + expect(bytesPrepared.redactStaticEntities(text)).toEqual( + stringPrepared.redactStaticEntities(text), + ); + }); + + test("prepared search accepts artifact bytes through TS and Python adapters", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const configBytes = Buffer.from(CONFIG_JSON); + const artifactBytes = + adapters.native.prepareStaticSearchArtifactsBytes(configBytes); + const direct = new adapters.native.NativePreparedSearch(CONFIG_JSON); + const prepared = + adapters.native.NativePreparedSearch.fromConfigJsonAndArtifactBytes( + configBytes, + artifactBytes, + ); + + expect(prepared.redactStaticEntities(text)).toEqual( + direct.redactStaticEntities(text), + ); + const expectedJson = JSON.parse( + adapters.native.redactStaticEntitiesJson(CONFIG_JSON, text), + ); + expect( + callPythonPreparedWithArtifacts( + adapters.pythonModulePath, + adapters.tempDir, + artifactBytes, + text, + null, + ), + ).toEqual(expectedJson); + }); + + test("prepared search accepts package bytes through TS and Python adapters", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. 
near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const configBytes = Buffer.from(CONFIG_JSON); + const packageBytes = + adapters.native.prepareStaticSearchPackageBytes(configBytes); + const direct = new adapters.native.NativePreparedSearch(CONFIG_JSON); + const prepared = + adapters.native.NativePreparedSearch.fromPreparedPackageBytes( + packageBytes, + ); + + expect(prepared.redactStaticEntities(text)).toEqual( + direct.redactStaticEntities(text), + ); + const expectedJson = JSON.parse( + adapters.native.redactStaticEntitiesJson(CONFIG_JSON, text), + ); + expect( + callPythonPreparedWithPackage( + adapters.pythonModulePath, + adapters.tempDir, + packageBytes, + text, + null, + ), + ).toEqual(expectedJson); + }); + + test("prepared search accepts compressed package bytes through TS and Python adapters", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const configBytes = Buffer.from(CONFIG_JSON); + const packageBytes = + adapters.native.prepareStaticSearchCompressedPackageBytes(configBytes); + const direct = new adapters.native.NativePreparedSearch(CONFIG_JSON); + const prepared = + adapters.native.NativePreparedSearch.fromPreparedPackageBytes( + packageBytes, + ); + + expect(prepared.redactStaticEntities(text)).toEqual( + direct.redactStaticEntities(text), + ); + const expectedJson = JSON.parse( + adapters.native.redactStaticEntitiesJson(CONFIG_JSON, text), + ); + expect( + callPythonPreparedWithPackage( + adapters.pythonModulePath, + adapters.tempDir, + packageBytes, + text, + null, + "prepare_static_search_compressed_package_bytes", + ), + ).toEqual(expectedJson); + }); + test("diagnostics JSON is identical through TS and Python adapters", () => { const adapters = getAdapters(); const text = @@ -401,6 +610,10 @@ const loadNativeAdapter = (nativePath: string): NativeAdapter => { const nativeRequire = 
createRequire(import.meta.url); const loaded: unknown = nativeRequire(nativePath); const normalizeForSearch = Reflect.get(Object(loaded), "normalizeForSearch"); + const NativePreparedSearch = Reflect.get( + Object(loaded), + "NativePreparedSearch", + ); const redactStaticEntitiesJson = Reflect.get( Object(loaded), "redactStaticEntitiesJson", @@ -409,15 +622,36 @@ const loadNativeAdapter = (nativePath: string): NativeAdapter => { Object(loaded), "redactStaticEntitiesDiagnosticsJson", ); + const prepareStaticSearchArtifactsBytes = Reflect.get( + Object(loaded), + "prepareStaticSearchArtifactsBytes", + ); + const prepareStaticSearchPackageBytes = Reflect.get( + Object(loaded), + "prepareStaticSearchPackageBytes", + ); + const prepareStaticSearchCompressedPackageBytes = Reflect.get( + Object(loaded), + "prepareStaticSearchCompressedPackageBytes", + ); if ( + typeof NativePreparedSearch !== "function" || typeof normalizeForSearch !== "function" || + typeof prepareStaticSearchArtifactsBytes !== "function" || + typeof prepareStaticSearchPackageBytes !== "function" || + typeof prepareStaticSearchCompressedPackageBytes !== "function" || typeof redactStaticEntitiesJson !== "function" || typeof redactStaticEntitiesDiagnosticsJson !== "function" ) { throw new TypeError("Native anonymize adapter exports are incomplete"); } return { + NativePreparedSearch: + NativePreparedSearch as NativeAdapter["NativePreparedSearch"], normalizeForSearch, + prepareStaticSearchArtifactsBytes, + prepareStaticSearchPackageBytes, + prepareStaticSearchCompressedPackageBytes, redactStaticEntitiesJson, redactStaticEntitiesDiagnosticsJson, }; @@ -513,6 +747,64 @@ print(module.normalize_for_search(payload["text"])) } }; +const callPythonPreparedWithArtifacts = ( + pythonModulePath: string, + tempDir: string, + artifactBytes: Buffer, + text: string, + operators: Record | null, +): StaticRedactionResult => { + const payloadPath = join(tempDir, "prepared-artifacts-payload.json"); + const artifactPath = 
join(tempDir, "prepared-artifacts.bin"); + writeFileSync(artifactPath, artifactBytes); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + text, + operators_json: operatorConfigJson(operators), + }), + ); + const output = runCommand( + "python3", + ["-c", PYTHON_PREPARED_ARTIFACT_SCRIPT], + { + STELLA_ANONYMIZE_ARTIFACTS: artifactPath, + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }, + ); + return JSON.parse(output); +}; + +const callPythonPreparedWithPackage = ( + pythonModulePath: string, + tempDir: string, + packageBytes: Buffer, + text: string, + operators: Record | null, + prepareFn = "prepare_static_search_package_bytes", +): StaticRedactionResult => { + const payloadPath = join(tempDir, "prepared-package-payload.json"); + const packagePath = join(tempDir, "prepared-package.bin"); + writeFileSync(packagePath, packageBytes); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + text, + operators_json: operatorConfigJson(operators), + }), + ); + const output = runCommand("python3", ["-c", PYTHON_PREPARED_PACKAGE_SCRIPT], { + STELLA_ANONYMIZE_PACKAGE: packagePath, + STELLA_ANONYMIZE_PACKAGE_PREPARE_FN: prepareFn, + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }); + return JSON.parse(output); +}; + const callPythonDiagnostics = ( pythonModulePath: string, text: string, diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index 7cb37d56..eac150db 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -9,6 +9,7 @@ import { } from "../index"; import { buildUnifiedSearch } from "../build-unified-search"; import { REGEX_META } from "../detectors/regex"; +import { applyPipelineLanguageScope } from "../language-scope"; import type { Dictionaries, PipelineConfig } from 
"../types"; import { loadTestDictionaries } from "./load-dictionaries"; @@ -51,6 +52,32 @@ const detect = async (fullText: string, config: Partial) => }); describe("pipeline config semantics", () => { + test("content language derives dictionary scopes", () => { + expect( + applyPipelineLanguageScope({ + ...BASE_CONFIG, + language: "en-US", + }), + ).toMatchObject({ + nameCorpusLanguages: ["en"], + denyListCountries: ["US", "GB", "CA", "AU", "IE"], + }); + }); + + test("explicit dictionary scopes override content language", () => { + expect( + applyPipelineLanguageScope({ + ...BASE_CONFIG, + language: "en", + denyListCountries: ["CZ"], + nameCorpusLanguages: ["cs"], + }), + ).toMatchObject({ + nameCorpusLanguages: ["cs"], + denyListCountries: ["CZ"], + }); + }); + test("empty labels do not suppress deterministic detectors", async () => { const entities = await detect("Datum narození: 2024-01-02", { enableRegex: true, @@ -85,6 +112,94 @@ describe("pipeline config semantics", () => { expect(regexCount).toBe(expected); }); + test("content language scopes deny-list search build", async () => { + const testDictionaries = await getDictionaries(); + const config = { + ...BASE_CONFIG, + dictionaries: testDictionaries, + enableDenyList: true, + enableNameCorpus: true, + labels: ["address", "person"], + }; + + const unscoped = await buildUnifiedSearch( + config, + [], + createPipelineContext(), + ); + const scoped = await buildUnifiedSearch( + { ...config, language: "en" }, + [], + createPipelineContext(), + ); + + expect( + scoped.slices.denyList.end - scoped.slices.denyList.start, + ).toBeLessThan( + unscoped.slices.denyList.end - unscoped.slices.denyList.start, + ); + }); + + test("native config keeps alphanumeric custom deny-list overlays compact", async () => { + const testDictionaries = await getDictionaries(); + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + dictionaries: testDictionaries, + enableDenyList: true, + customDenyList: [ + { + value: 
"Widget X", + label: "organization", + }, + ], + labels: ["organization"], + }, + [], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.literal_patterns_from_deny_list_data).toBe( + true, + ); + expect(search.nativeStaticConfig.literal_patterns).toHaveLength(0); + expect(search.nativeStaticConfig.deny_list_data?.originals).toContain( + "Widget X", + ); + expect( + search.nativeStaticConfig.deny_list_data?.originals.length ?? 0, + ).toBeGreaterThan(1); + }); + + test("native config inlines punctuation-edged custom deny-list overlays", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableDenyList: true, + customDenyList: [ + { + value: ".env", + label: "file", + }, + ], + labels: ["file"], + }, + [], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.literal_patterns_from_deny_list_data).toBe( + false, + ); + expect(search.nativeStaticConfig.literal_patterns).toEqual([ + expect.objectContaining({ + kind: "literal-with-options", + pattern: ".env", + whole_words: false, + }), + ]); + }); + test("preparePipelineSearch reuses the context search cache", async () => { const context = createPipelineContext(); const config = { diff --git a/packages/anonymize/src/__test__/us-bank-routing.test.ts b/packages/anonymize/src/__test__/us-bank-routing.test.ts index bd65978a..b2e706a3 100644 --- a/packages/anonymize/src/__test__/us-bank-routing.test.ts +++ b/packages/anonymize/src/__test__/us-bank-routing.test.ts @@ -85,4 +85,14 @@ describe("US ABA routing number — cue + checksum recognizer", () => { bankAccounts(await run("The reference 122100024 appears in section 5.")), ).toHaveLength(0); }); + + test("a labelled account number in payment instructions is a bank account number", async () => { + expect( + bankAccounts( + await run( + "Pay Wells Fargo Bank, N.A., routing 121000248, account 4537891022.", + ), + ), + ).toContain("4537891022"); + }); }); diff --git a/packages/anonymize/src/build-unified-search.ts 
b/packages/anonymize/src/build-unified-search.ts index ea927fa1..380c39f2 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -18,6 +18,7 @@ */ import type { PatternEntry, TextSearch } from "@stll/text-search"; +import legalFormRuleWords from "./data/legal-form-rule-words.json"; import { getTextSearch } from "./search-engine"; @@ -27,6 +28,7 @@ import { type GazetteerEntry, type PipelineConfig, } from "./types"; +import { applyPipelineLanguageScope } from "./language-scope"; import type { RegexMeta } from "./detectors/regex"; import type { TriggerRule } from "./types"; import type { DenyListData } from "./detectors/deny-list"; @@ -36,19 +38,46 @@ import { defaultContext } from "./context"; import { REGEX_PATTERNS, REGEX_META, + NATIVE_REGEX_VALIDATOR_IDS, getCurrencyPatternEntries, CURRENCY_PATTERN_META, + getDateMonthData, getDatePatterns, + getYearWordData, + getMonetaryData, DATE_PATTERN_META, getSigningClausePatterns, + getNativeSigningClausePatterns, SIGNING_CLAUSE_META, + type DateMonthData, + type YearWordData, + type MonetaryData, } from "./detectors/regex"; -import { buildTriggerPatterns } from "./detectors/triggers"; +import { + buildTriggerPatterns, + getAddressStopKeywordsSync, +} from "./detectors/triggers"; import { buildDenyList } from "./detectors/deny-list"; -import { buildStreetTypePatterns } from "./detectors/address-seeds"; +import { + buildStreetTypePatterns, + getAddressSeedData, + type AddressSeedData, +} from "./detectors/address-seeds"; import { buildGazetteerPatterns } from "./detectors/gazetteer"; import { buildCountryPatterns, type CountryData } from "./detectors/countries"; import { expandLabelsForHotwordRules } from "./filters/hotword-rules"; +import { + getClauseNounHeadsSync, + getConnectorProseHeadsSync, + getKnownLegalSuffixes, + getLeadingClauseTrimsSync, + getLegalRoleHeadsSync, + getNormalizedInNameLegalFormWordsSync, + getNormalizedLegalBoundarySuffixesSync, + 
getSentenceVerbIndicatorsSync, + getStructuralSingleCapPrefixesSync, + warmLegalRoleHeads, +} from "./detectors/legal-forms"; const DEFAULT_CUSTOM_REGEX_SCORE = 0.9; const ALNUM_RE = /[\p{L}\p{N}]/u; @@ -90,6 +119,8 @@ export type NativeRegexMatchMeta = { score: number; source_detail?: string; requires_validation?: boolean; + validator_id?: string; + validator_input?: string; min_byte_length?: number; }; @@ -97,23 +128,94 @@ export type NativeDenyListFilterData = { stopwords: string[]; allow_list: string[]; person_stopwords: string[]; + person_trailing_nouns: string[]; address_stopwords: string[]; + address_jurisdiction_prefixes: string[]; street_types: string[]; first_names: string[]; generic_roles: string[]; sentence_starters: string[]; trailing_address_word_exclusions: string[]; defined_term_cues: string[]; + signing_place_guards: NativeSigningPlaceGuardData[]; +}; + +export type NativeSigningPlaceGuardData = { + prefix_phrases: string[]; + suffix_phrases: string[]; }; export type NativeDenyListMatchData = { - labels: string[][]; - custom_labels: string[][]; + labels?: string[][]; + label_table?: string[]; + label_indices?: number[][]; + custom_labels?: string[][]; + custom_label_indices?: number[][]; originals: string[]; - sources: string[][]; + sources?: string[][]; + source_table?: string[]; + source_indices?: number[][]; filters?: NativeDenyListFilterData; }; +export type NativeTriggerStrategy = + | { type: "to-next-comma"; stop_words?: string[]; max_length?: number } + | { type: "to-end-of-line" } + | { type: "n-words"; count: number } + | { type: "company-id-value" } + | { type: "address"; max_chars?: number } + | { type: "match-pattern"; pattern: string; flags?: string }; + +export type NativeTriggerValidation = + | { type: "starts-uppercase" } + | { type: "min-length"; min: number } + | { type: "max-length"; max: number } + | { type: "no-digits" } + | { type: "has-digits" } + | { type: "matches-pattern"; pattern: string; flags?: string } + | { type: 
"valid-id"; validator: string }; + +export type NativeTriggerRule = { + trigger: string; + label: string; + strategy: NativeTriggerStrategy; + validations: NativeTriggerValidation[]; + include_trigger: boolean; +}; + +export type NativeTriggerData = { + rules: NativeTriggerRule[]; + address_stop_keywords: string[]; + party_position_terms: string[]; +}; + +export type NativeLegalFormData = { + suffixes: string[]; + normalized_boundary_suffixes: string[]; + normalized_in_name_words: string[]; + normalized_suffix_words: string[]; + role_heads: string[]; + sentence_verb_indicators: string[]; + clause_noun_heads: string[]; + connector_prose_heads: string[]; + structural_single_cap_prefixes: string[]; + leading_clause_phrases: string[]; + leading_clause_direct_prefixes: string[]; + connector_words: string[]; + and_connector_words: string[]; + in_name_prepositions: string[]; + company_suffix_words: string[]; + comma_gated_direct_prefixes: string[]; +}; + +export type NativeDateData = { + month_names_by_language: DateMonthData; + year_words_by_language: YearWordData; +}; + +export type NativeMonetaryData = MonetaryData; +export type NativeAddressSeedData = AddressSeedData; + export type NativePreparedSearchConfig = { regex_patterns: NativeSearchPattern[]; custom_regex_patterns: NativeSearchPattern[]; @@ -121,6 +223,7 @@ export type NativePreparedSearchConfig = { regex_options: NativeSearchOptions; custom_regex_options: NativeSearchOptions; literal_options: NativeSearchOptions; + literal_patterns_from_deny_list_data?: boolean; slices: { regex: PatternSlice; custom_regex: PatternSlice; @@ -136,6 +239,11 @@ export type NativePreparedSearchConfig = { deny_list_data?: NativeDenyListMatchData; gazetteer_data?: GazetteerData; country_data?: CountryData; + trigger_data?: NativeTriggerData; + legal_form_data?: NativeLegalFormData; + address_seed_data?: NativeAddressSeedData; + date_data?: NativeDateData; + monetary_data?: NativeMonetaryData; }; const createAllowedLabelSet = ( @@ 
-147,6 +255,9 @@ const labelIsAllowed = ( allowedLabels: ReadonlySet | null, ): boolean => allowedLabels === null || allowedLabels.has(label); +const sliceContains = (slice: PatternSlice, index: number): boolean => + index >= slice.start && index < slice.end; + export type GazetteerData = { /** Maps local pattern index to entry label. */ labels: string[]; @@ -207,6 +318,16 @@ type UnifiedSearchSources = { streetTypes: string[]; gazResult: GazetteerPatternResult | null; countryResult: CountryPatternResult | null; + nativeLegalFormPatterns: string[]; + nativeLegalFormData: NativeLegalFormData | null; + nativeDateData: NativeDateData | null; + nativeMonetaryData: NativeMonetaryData | null; + nativeAddressSeedData: NativeAddressSeedData | null; + nativeSigningPatterns: readonly string[]; + partyPositionTerms: string[]; + nativeCurrencyPatternRange: PatternSlice; + nativeDatePatternRange: PatternSlice; + nativeSigningPatternRange: PatternSlice; slices: UnifiedSearchInstance["slices"]; literalAllPatterns: PatternEntry[] | string[]; canUseGlobalWholeWordLiterals: boolean; @@ -226,6 +347,7 @@ const buildUnifiedSearchSources = async ( gazetteerEntries: GazetteerEntry[] = [], ctx: PipelineContext = defaultContext, ): Promise => { + config = applyPipelineLanguageScope(config); const legalFormsEnabled = isLegalFormsEnabled(config); const searchLabels = config.enableHotwordRules === true @@ -243,13 +365,22 @@ const buildUnifiedSearchSources = async ( // still gates whether the v2 detector runs in the pipeline, but // its pattern slice is always empty. const [ + _legalFormWarmup, triggers, denyListData, streetTypes, currencyPatterns, datePatterns, signingPatterns, + nativeSigningPatterns, + dateMonthData, + yearWordData, + monetaryData, + addressSeedData, ] = await Promise.all([ + legalFormsEnabled || config.enableTriggerPhrases + ? warmLegalRoleHeads() + : Promise.resolve(), config.enableTriggerPhrases ? 
buildTriggerPatterns() : Promise.resolve({ @@ -267,6 +398,21 @@ const buildUnifiedSearchSources = async ( config.enableRegex && labelIsAllowed("address", allowedLabels) ? getSigningClausePatterns() : Promise.resolve([] as string[]), + config.enableRegex && labelIsAllowed("address", allowedLabels) + ? getNativeSigningClausePatterns() + : Promise.resolve([] as string[]), + config.enableRegex && labelIsAllowed("date", allowedLabels) + ? getDateMonthData() + : Promise.resolve(null), + config.enableRegex && labelIsAllowed("date", allowedLabels) + ? getYearWordData() + : Promise.resolve(null), + config.enableRegex && labelIsAllowed("monetary amount", allowedLabels) + ? getMonetaryData() + : Promise.resolve(null), + labelIsAllowed("address", allowedLabels) + ? getAddressSeedData() + : Promise.resolve(null), ]); // Read but never populated: the legal-form slice in the unified // search is permanently empty after the v2 rewrite. Tracking it @@ -275,6 +421,44 @@ const buildUnifiedSearchSources = async ( // that hasn't migrated to v2-aware indexing yet. const legalForms: readonly string[] = []; void legalFormsEnabled; + const nativeLegalFormPatterns = legalFormsEnabled + ? [...getKnownLegalSuffixes()] + : []; + const nativeLegalFormData = + nativeLegalFormPatterns.length > 0 + ? 
{ + suffixes: nativeLegalFormPatterns, + normalized_boundary_suffixes: [ + ...getNormalizedLegalBoundarySuffixesSync(), + ], + normalized_in_name_words: [ + ...getNormalizedInNameLegalFormWordsSync(), + ], + normalized_suffix_words: nativeLegalFormPatterns + .map((suffix) => suffix.replaceAll(/[.,\s]/g, "").toLowerCase()) + .filter((suffix) => suffix.length > 0), + role_heads: [...getLegalRoleHeadsSync()], + sentence_verb_indicators: [...getSentenceVerbIndicatorsSync()], + clause_noun_heads: [...getClauseNounHeadsSync()], + connector_prose_heads: [...getConnectorProseHeadsSync()], + structural_single_cap_prefixes: [ + ...getStructuralSingleCapPrefixesSync(), + ], + leading_clause_phrases: [...getLeadingClauseTrimsSync().phrases], + leading_clause_direct_prefixes: [ + ...getLeadingClauseTrimsSync().directPrefixes, + ], + connector_words: legalFormRuleWords.connectorWords, + and_connector_words: legalFormRuleWords.andConnectorWords, + in_name_prepositions: legalFormRuleWords.inNamePrepositions, + company_suffix_words: legalFormRuleWords.companySuffixWords, + comma_gated_direct_prefixes: + legalFormRuleWords.commaGatedDirectPrefixes, + } + : null; + const partyPositionTerms = config.enableTriggerPhrases + ? 
[...getLegalRoleHeadsSync()] + : []; // ── Instance 1: regex + triggers + legal-forms ── // Trigger patterns are lowercased strings with @@ -298,14 +482,26 @@ const buildUnifiedSearchSources = async ( regexMeta.push(meta); } } + const nativeCurrencyPatternRange = { + start: allRegex.length, + end: allRegex.length + currencyPatterns.length, + }; for (const pattern of currencyPatterns) { allRegex.push(pattern); regexMeta.push(CURRENCY_PATTERN_META); } + const nativeDatePatternRange = { + start: allRegex.length, + end: allRegex.length + datePatterns.length, + }; for (const pattern of datePatterns) { allRegex.push(pattern); regexMeta.push(DATE_PATTERN_META); } + const nativeSigningPatternRange = { + start: allRegex.length, + end: allRegex.length + signingPatterns.length, + }; for (const pattern of signingPatterns) { allRegex.push(pattern); regexMeta.push(SIGNING_CLAUSE_META); @@ -315,6 +511,14 @@ const buildUnifiedSearchSources = async ( score: entry.score ?? DEFAULT_CUSTOM_REGEX_SCORE, sourceDetail: "custom-regex" as const, })); + const nativeDateData = + dateMonthData === null + ? null + : { + month_names_by_language: dateMonthData, + year_words_by_language: yearWordData ?? {}, + }; + const nativeMonetaryData = monetaryData; let offset = 0; @@ -414,12 +618,14 @@ const buildUnifiedSearchSources = async ( } return entry.pattern; }; - const hasCustomDenyListPatterns = - denyListData?.sources.some((sources) => - sources.includes("custom-deny-list"), + const hasCustomLiteralBoundaryOverride = + denyListData?.originals.some( + (pattern, index) => + (denyListData.sources[index] ?? []).includes("custom-deny-list") && + !customDenyListNeedsWholeWords(pattern), ) ?? false; const canUseGlobalWholeWordLiterals = - !hasCustomDenyListPatterns && gazResult === null; + !hasCustomLiteralBoundaryOverride && gazResult === null; const literalAllPatterns: PatternEntry[] | string[] = canUseGlobalWholeWordLiterals ? 
[ @@ -452,6 +658,16 @@ const buildUnifiedSearchSources = async ( streetTypes, gazResult, countryResult, + nativeLegalFormPatterns, + nativeLegalFormData, + nativeDateData, + nativeMonetaryData, + nativeAddressSeedData: addressSeedData, + nativeSigningPatterns, + partyPositionTerms, + nativeCurrencyPatternRange, + nativeDatePatternRange, + nativeSigningPatternRange, slices: { regex: regexSlice, customRegex: customRegexSlice, @@ -485,10 +701,26 @@ export const buildNativeStaticSearchBundle = async ( customRegexes: sources.customRegexes, customRegexMeta: sources.customRegexMeta, denyListData: sources.denyListData, + triggerPatterns: sources.triggers.patterns, + triggerRules: sources.triggers.rules, + legalFormPatterns: sources.nativeLegalFormPatterns, + legalFormData: sources.nativeLegalFormData, + dateData: sources.nativeDateData, + monetaryData: sources.nativeMonetaryData, + addressSeedData: sources.nativeAddressSeedData, + nativeSigningPatterns: sources.nativeSigningPatterns, + partyPositionTerms: sources.partyPositionTerms, + streetTypes: sources.streetTypes, + omitRegexRanges: [ + sources.nativeCurrencyPatternRange, + sources.nativeDatePatternRange, + sources.nativeSigningPatternRange, + ], gazetteerPatterns: sources.gazResult?.patterns ?? [], gazetteerData: sources.gazResult?.data ?? null, countryPatterns: sources.countryResult?.patterns ?? [], countryData: sources.countryResult?.data ?? 
null, + canUseGlobalWholeWordLiterals: sources.canUseGlobalWholeWordLiterals, customDenyListNeedsWholeWords: sources.customDenyListNeedsWholeWords, }), slices: sources.slices, @@ -548,10 +780,26 @@ export const buildUnifiedSearch = async ( customRegexes: sources.customRegexes, customRegexMeta: sources.customRegexMeta, denyListData: sources.denyListData, + triggerPatterns: sources.triggers.patterns, + triggerRules: sources.triggers.rules, + legalFormPatterns: sources.nativeLegalFormPatterns, + legalFormData: sources.nativeLegalFormData, + dateData: sources.nativeDateData, + monetaryData: sources.nativeMonetaryData, + addressSeedData: sources.nativeAddressSeedData, + nativeSigningPatterns: sources.nativeSigningPatterns, + partyPositionTerms: sources.partyPositionTerms, + streetTypes: sources.streetTypes, + omitRegexRanges: [ + sources.nativeCurrencyPatternRange, + sources.nativeDatePatternRange, + sources.nativeSigningPatternRange, + ], gazetteerPatterns: sources.gazResult?.patterns ?? [], gazetteerData: sources.gazResult?.data ?? null, countryPatterns: sources.countryResult?.patterns ?? [], countryData: sources.countryResult?.data ?? 
null, + canUseGlobalWholeWordLiterals: sources.canUseGlobalWholeWordLiterals, customDenyListNeedsWholeWords: sources.customDenyListNeedsWholeWords, }); @@ -576,10 +824,22 @@ type BuildNativeStaticConfigArgs = { customRegexes: readonly { pattern: string }[]; customRegexMeta: readonly RegexMeta[]; denyListData: DenyListData | null; + triggerPatterns: readonly string[]; + triggerRules: readonly TriggerRule[]; + legalFormPatterns: readonly string[]; + legalFormData: NativeLegalFormData | null; + dateData: NativeDateData | null; + monetaryData: NativeMonetaryData | null; + addressSeedData: NativeAddressSeedData | null; + nativeSigningPatterns: readonly string[]; + partyPositionTerms: readonly string[]; + omitRegexRanges?: readonly PatternSlice[]; + streetTypes: readonly string[]; gazetteerPatterns: readonly PatternEntry[]; gazetteerData: GazetteerData | null; countryPatterns: readonly PatternEntry[]; countryData: CountryData | null; + canUseGlobalWholeWordLiterals: boolean; customDenyListNeedsWholeWords: (pattern: string) => boolean; }; @@ -589,49 +849,99 @@ const buildNativeStaticConfig = ({ customRegexes, customRegexMeta, denyListData, + triggerPatterns, + triggerRules, + legalFormPatterns, + legalFormData, + dateData, + monetaryData, + addressSeedData, + nativeSigningPatterns, + partyPositionTerms, + omitRegexRanges, + streetTypes, gazetteerPatterns, gazetteerData, countryPatterns, countryData, + canUseGlobalWholeWordLiterals, customDenyListNeedsWholeWords, }: BuildNativeStaticConfigArgs): NativePreparedSearchConfig => { const nativeRegexPatterns: NativeSearchPattern[] = []; const nativeRegexMeta: NativeRegexMatchMeta[] = []; for (const [index, pattern] of regexPatterns.entries()) { + if (omitRegexRanges?.some((range) => sliceContains(range, index))) { + continue; + } const meta = regexMeta[index]; - if (!meta || meta.validator) { + if (!meta || !nativeSupportsRegexMeta(meta)) { continue; } nativeRegexPatterns.push(toNativeRegexPattern(pattern)); 
nativeRegexMeta.push(toNativeRegexMeta(meta)); } + for (const pattern of nativeSigningPatterns) { + nativeRegexPatterns.push(toNativeRegexPattern(pattern)); + nativeRegexMeta.push(toNativeRegexMeta(SIGNING_CLAUSE_META)); + } const nativeCustomRegexPatterns = customRegexes.map((entry) => ({ kind: "regex" as const, pattern: entry.pattern, })); const nativeCustomRegexMeta = customRegexMeta.map(toNativeRegexMeta); + const legalFormNativePatterns = legalFormPatterns.map( + toNativeLegalFormPattern, + ); + const triggerNativePatterns = triggerPatterns.map(toNativeTriggerPattern); + const streetTypeNativePatterns = addressSeedData + ? streetTypes.map((pattern) => + canUseGlobalWholeWordLiterals + ? toNativeGlobalLiteralPattern(pattern) + : toNativeDenyListPattern(pattern, true), + ) + : []; + const denyListPatternsFromData = + canUseGlobalWholeWordLiterals && denyListData !== null; const denyPatterns = - denyListData?.originals.map((pattern, index) => - toNativeDenyListPattern( - pattern, - stringArrayValue(denyListData.sources[index]).includes( - "custom-deny-list", - ) - ? customDenyListNeedsWholeWords(pattern) - : true, - ), - ) ?? []; + denyListData?.originals + .map((pattern, index) => { + if (denyListPatternsFromData) { + return null; + } + return toNativeDenyListPattern( + pattern, + stringArrayValue(denyListData.sources[index]).includes( + "custom-deny-list", + ) + ? customDenyListNeedsWholeWords(pattern) + : true, + ); + }) + .filter((pattern): pattern is NativeSearchPattern => pattern !== null) ?? + []; const gazetteerNativePatterns = gazetteerPatterns.map(toNativeLiteralPattern); - const countryNativePatterns = countryPatterns.map(toNativeLiteralPattern); + const countryNativePatterns = countryPatterns.map((pattern) => + canUseGlobalWholeWordLiterals + ? toNativeGlobalLiteralPattern(patternEntryText(pattern)) + : toNativeLiteralPattern(pattern), + ); let literalOffset = 0; + const denyListPatternCount = denyListPatternsFromData + ? 
(denyListData?.originals.length ?? 0) + : denyPatterns.length; const denyListSlice = { start: literalOffset, - end: literalOffset + denyPatterns.length, + end: literalOffset + denyListPatternCount, }; literalOffset = denyListSlice.end; + const streetTypesSlice = { + start: literalOffset, + end: literalOffset + streetTypeNativePatterns.length, + }; + literalOffset = streetTypesSlice.end; const gazetteerSlice = { start: literalOffset, end: literalOffset + gazetteerNativePatterns.length, @@ -647,31 +957,50 @@ const buildNativeStaticConfig = ({ custom_regex_patterns: nativeCustomRegexPatterns, literal_patterns: [ ...denyPatterns, + ...streetTypeNativePatterns, ...gazetteerNativePatterns, ...countryNativePatterns, ], - regex_options: { regex_whole_words: false }, + regex_options: { + literal_case_insensitive: true, + literal_whole_words: false, + regex_whole_words: false, + }, custom_regex_options: { regex_whole_words: false }, literal_options: { literal_case_insensitive: true, - literal_whole_words: false, + literal_whole_words: canUseGlobalWholeWordLiterals, fuzzy_case_insensitive: true, fuzzy_whole_words: true, fuzzy_normalize_diacritics: true, }, + literal_patterns_from_deny_list_data: denyListPatternsFromData, slices: { regex: { start: 0, end: nativeRegexPatterns.length }, custom_regex: { start: 0, end: nativeCustomRegexPatterns.length }, - legal_forms: { start: 0, end: 0 }, - triggers: { start: 0, end: 0 }, + legal_forms: { + start: nativeRegexPatterns.length, + end: nativeRegexPatterns.length + legalFormNativePatterns.length, + }, + triggers: { + start: nativeRegexPatterns.length + legalFormNativePatterns.length, + end: + nativeRegexPatterns.length + + legalFormNativePatterns.length + + triggerNativePatterns.length, + }, deny_list: denyListSlice, - street_types: { start: 0, end: 0 }, + street_types: streetTypesSlice, gazetteer: gazetteerSlice, countries: countriesSlice, }, regex_meta: nativeRegexMeta, custom_regex_meta: nativeCustomRegexMeta, }; + 
nativeConfig.regex_patterns.push( + ...legalFormNativePatterns, + ...triggerNativePatterns, + ); if (denyListData) { nativeConfig.deny_list_data = toNativeDenyListData(denyListData); } @@ -681,9 +1010,127 @@ const buildNativeStaticConfig = ({ if (countryData) { nativeConfig.country_data = countryData; } + if (triggerRules.length > 0) { + nativeConfig.trigger_data = { + rules: triggerRules.map(toNativeTriggerRule), + address_stop_keywords: [...getAddressStopKeywordsSync()], + party_position_terms: [...partyPositionTerms], + }; + } + if (legalFormData) { + nativeConfig.legal_form_data = legalFormData; + } + if (addressSeedData) { + nativeConfig.address_seed_data = addressSeedData; + } + if (dateData) { + nativeConfig.date_data = dateData; + } + if (monetaryData) { + nativeConfig.monetary_data = monetaryData; + } return nativeConfig; }; +const toNativeLegalFormPattern = (pattern: string): NativeSearchPattern => ({ + kind: "literal", + pattern, +}); + +const toNativeTriggerPattern = (pattern: string): NativeSearchPattern => ({ + kind: "literal-with-options", + pattern, + case_insensitive: true, +}); + +const toNativeTriggerRule = (rule: TriggerRule): NativeTriggerRule => ({ + trigger: rule.trigger, + label: rule.label, + strategy: toNativeTriggerStrategy(rule.strategy), + validations: rule.validations.map(toNativeTriggerValidation), + include_trigger: rule.includeTrigger, +}); + +const toNativeTriggerStrategy = ( + strategy: TriggerRule["strategy"], +): NativeTriggerStrategy => { + switch (strategy.type) { + case "to-next-comma": { + const result: NativeTriggerStrategy = { type: "to-next-comma" }; + if (strategy.stopWords !== undefined) { + result.stop_words = [...strategy.stopWords]; + } + if (strategy.maxLength !== undefined) { + result.max_length = strategy.maxLength; + } + return result; + } + case "to-end-of-line": + return { type: "to-end-of-line" }; + case "n-words": + return { type: "n-words", count: strategy.count }; + case "company-id-value": + return { type: 
"company-id-value" }; + case "address": { + const result: NativeTriggerStrategy = { type: "address" }; + if (strategy.maxChars !== undefined) { + result.max_chars = strategy.maxChars; + } + return result; + } + case "match-pattern": { + const result: NativeTriggerStrategy = { + type: "match-pattern", + pattern: strategy.pattern, + }; + if (strategy.flags !== undefined) { + result.flags = strategy.flags; + } + return result; + } + default: { + const _exhaustive: never = strategy; + throw new Error(`Unknown trigger strategy: ${String(_exhaustive)}`); + } + } +}; + +const toNativeTriggerValidation = ( + validation: TriggerRule["validations"][number], +): NativeTriggerValidation => { + switch (validation.type) { + case "starts-uppercase": + return { type: "starts-uppercase" }; + case "min-length": + return { type: "min-length", min: validation.min }; + case "max-length": + return { type: "max-length", max: validation.max }; + case "no-digits": + return { type: "no-digits" }; + case "has-digits": + return { type: "has-digits" }; + case "matches-pattern": { + const result: NativeTriggerValidation = { + type: "matches-pattern", + pattern: validation.re.source, + }; + if (validation.re.flags.length > 0) { + result.flags = validation.re.flags; + } + return result; + } + case "valid-id": + return { + type: "valid-id", + validator: validation.validator, + }; + default: { + const _exhaustive: never = validation; + throw new Error(`Unknown trigger validation: ${String(_exhaustive)}`); + } + } +}; + const toNativeDenyListPattern = ( pattern: string, wholeWords: boolean, @@ -694,6 +1141,13 @@ const toNativeDenyListPattern = ( whole_words: wholeWords, }); +const toNativeGlobalLiteralPattern = ( + pattern: string, +): NativeSearchPattern => ({ + kind: "literal", + pattern, +}); + const toNativeRegexPattern = (entry: PatternEntry): NativeSearchPattern => { const pattern: NativeSearchPattern = { kind: "regex", @@ -790,6 +1244,12 @@ const toNativeRegexMeta = (meta: RegexMeta): 
NativeRegexMatchMeta => { } if (meta.validator) { result.requires_validation = true; + if (meta.validatorId) { + result.validator_id = meta.validatorId; + } + if (meta.validatorInputKind) { + result.validator_input = meta.validatorInputKind; + } } if (meta.minByteLength !== undefined) { result.min_byte_length = meta.minByteLength; @@ -797,15 +1257,73 @@ const toNativeRegexMeta = (meta: RegexMeta): NativeRegexMatchMeta => { return result; }; -const toNativeDenyListData = (data: DenyListData): NativeDenyListMatchData => ({ - labels: data.labels.map(stringArrayValue), - custom_labels: data.originals.map((_, index) => - stringArrayValue(data.customLabels[index]), - ), - originals: data.originals, - sources: data.sources.map(stringArrayValue), - filters: toNativeDenyListFilters(data.filters), -}); +const nativeSupportsRegexMeta = (meta: RegexMeta): boolean => { + if (!meta.validator) { + return true; + } + return ( + meta.validatorId !== undefined && + NATIVE_REGEX_VALIDATOR_IDS.has(meta.validatorId) && + (meta.validatorInputKind === undefined || + meta.validatorInputKind === "digits-only") + ); +}; + +const toNativeDenyListData = (data: DenyListData): NativeDenyListMatchData => { + const labelEncoder = createStringGroupEncoder(); + const sourceEncoder = createStringGroupEncoder(); + const result: NativeDenyListMatchData = { + label_table: labelEncoder.table, + label_indices: data.labels.map(labelEncoder.encode), + originals: data.originals, + source_table: sourceEncoder.table, + source_indices: data.sources.map(sourceEncoder.encode), + filters: toNativeDenyListFilters(data.filters), + }; + if (data.customLabels.length > 0) { + const customLabelIndices = data.originals.map((_, index) => + labelEncoder.encode(data.customLabels[index]), + ); + if (customLabelIndices.some((indices) => indices.length > 0)) { + result.custom_label_indices = customLabelIndices; + } + } + return result; +}; + +const createStringGroupEncoder = (): { + table: string[]; + encode: (values: string 
| readonly string[] | undefined) => number[]; +} => { + const table: string[] = []; + const indexes = new Map(); + const encodeValue = (value: string): number => { + const existing = indexes.get(value); + if (existing !== undefined) { + return existing; + } + const index = table.length; + table.push(value); + indexes.set(value, index); + return index; + }; + return { + table, + encode: (values) => { + if (values === undefined) { + return []; + } + if (typeof values === "string") { + return [encodeValue(values)]; + } + const encoded: number[] = []; + for (const value of values) { + encoded.push(encodeValue(value)); + } + return encoded; + }, + }; +}; const toNativeDenyListFilters = ( filters: DenyListData["filters"], @@ -813,13 +1331,19 @@ const toNativeDenyListFilters = ( stopwords: filters.stopwords, allow_list: filters.allowList, person_stopwords: filters.personStopwords, + person_trailing_nouns: filters.personTrailingNouns, address_stopwords: filters.addressStopwords, + address_jurisdiction_prefixes: filters.addressJurisdictionPrefixes, street_types: filters.streetTypes, first_names: filters.firstNames, generic_roles: filters.genericRoles, sentence_starters: filters.sentenceStarters, trailing_address_word_exclusions: filters.trailingAddressWordExclusions, defined_term_cues: filters.definedTermCues, + signing_place_guards: filters.signingPlaceGuards.map((entry) => ({ + prefix_phrases: entry.prefixPhrases, + suffix_phrases: entry.suffixPhrases, + })), }); const stringArrayValue = ( diff --git a/packages/anonymize/src/context.ts b/packages/anonymize/src/context.ts index 470e0c93..ca5ce8dd 100644 --- a/packages/anonymize/src/context.ts +++ b/packages/anonymize/src/context.ts @@ -82,6 +82,8 @@ export type PipelineContext = { allowListPromise: Promise> | null; personStopwords: ReadonlySet | null; personStopwordsPromise: Promise> | null; + definedTermHeads: ReadonlySet | null; + definedTermHeadsPromise: Promise> | null; addressStopwords: ReadonlySet | null; 
addressStopwordsPromise: Promise> | null; /** First-name exclusions for stopword filtering. */ @@ -121,6 +123,8 @@ export const createPipelineContext = (): PipelineContext => ({ allowListPromise: null, personStopwords: null, personStopwordsPromise: null, + definedTermHeads: null, + definedTermHeadsPromise: null, addressStopwords: null, addressStopwordsPromise: null, firstNameExclusions: null, diff --git a/packages/anonymize/src/data/address-boundaries.json b/packages/anonymize/src/data/address-boundaries.json index 12c5cc35..5d71897f 100644 --- a/packages/anonymize/src/data/address-boundaries.json +++ b/packages/anonymize/src/data/address-boundaries.json @@ -5,6 +5,9 @@ "jednajícím", "jejímž jménem", "kontaktní osoba", + "nebude-li", + "nebudou-li", + "pokud", "zapsán", "zapsaná", "zapsané", @@ -35,6 +38,7 @@ "shall govern", "shall be governed", "to be enforced", + "with a copy", "with the intention", "without reference", "without regard" diff --git a/packages/anonymize/src/data/address-jurisdiction-prefixes.json b/packages/anonymize/src/data/address-jurisdiction-prefixes.json new file mode 100644 index 00000000..f26e4fc1 --- /dev/null +++ b/packages/anonymize/src/data/address-jurisdiction-prefixes.json @@ -0,0 +1,4 @@ +{ + "_comment": "Address-like jurisdiction prefixes that are valid location/address spans without digits or street-type words. Lowercased and organized per language.", + "en": ["commonwealth of", "district of", "state of", "territory of"] +} diff --git a/packages/anonymize/src/data/address-stop-keywords.json b/packages/anonymize/src/data/address-stop-keywords.json index 98d9de5a..f7a0180c 100644 --- a/packages/anonymize/src/data/address-stop-keywords.json +++ b/packages/anonymize/src/data/address-stop-keywords.json @@ -17,6 +17,16 @@ "ičo", "ič" ], + "de": [ + "bank", + "bic", + "iban", + "steuer-id", + "steueridentifikationsnummer", + "steuernummer", + "ust-idnr", + "ust-idnr." 
+ ], "en": ["e-mail", "email", "tel", "swift", "iban", "bic"], "pl": [ "nip", diff --git a/packages/anonymize/src/data/ambiguous-country-surfaces.json b/packages/anonymize/src/data/ambiguous-country-surfaces.json new file mode 100644 index 00000000..04962dd5 --- /dev/null +++ b/packages/anonymize/src/data/ambiguous-country-surfaces.json @@ -0,0 +1,4 @@ +{ + "_comment": "Country surface forms that collide with much more common non-country usage. Full country names and aliases remain registered separately when present.", + "words": ["indie", "island", "man", "norfolk"] +} diff --git a/packages/anonymize/src/data/clause-noun-heads.json b/packages/anonymize/src/data/clause-noun-heads.json index 937dc32a..b11bad4f 100644 --- a/packages/anonymize/src/data/clause-noun-heads.json +++ b/packages/anonymize/src/data/clause-noun-heads.json @@ -33,7 +33,11 @@ "přílohu", "dodatek", "dodatku", - "oznámení" + "článek", + "oznámení", + "podmínky", + "předmět", + "ustanovení" ], "de": [ "vertrag", diff --git a/packages/anonymize/src/data/defined-term-heads.json b/packages/anonymize/src/data/defined-term-heads.json new file mode 100644 index 00000000..aa1fffc2 --- /dev/null +++ b/packages/anonymize/src/data/defined-term-heads.json @@ -0,0 +1,4 @@ +{ + "_comment": "Common head nouns for capitalized defined/legal concepts. These are not person names by themselves; detector-specific filters assemble this vocabulary where needed. 
Lowercased and organized per language.", + "en": ["association", "period", "reform"] +} diff --git a/packages/anonymize/src/data/deny-list-filters.json b/packages/anonymize/src/data/deny-list-filters.json index fa30d32f..51b89152 100644 --- a/packages/anonymize/src/data/deny-list-filters.json +++ b/packages/anonymize/src/data/deny-list-filters.json @@ -1,37 +1,4 @@ { - "cs": { - "trailingAddressWordExclusions": [ - "nájemce", - "pronajímatel", - "kupující", - "prodávající", - "objednatel", - "zhotovitel", - "dodavatel", - "odběratel", - "věřitel", - "dlužník", - "zadavatel", - "uchazeč", - "příjemce", - "plátce", - "správa", - "sekretariát", - "kancelář", - "odbor", - "oddělení", - "úřad", - "inspekce", - "agentura", - "článek", - "smlouva", - "dodatek", - "příloha", - "předmět", - "podmínky", - "ustanovení" - ] - }, "en": { "definedTermCues": [ "mean", diff --git a/packages/anonymize/src/data/language-scopes.json b/packages/anonymize/src/data/language-scopes.json new file mode 100644 index 00000000..5d9b85a5 --- /dev/null +++ b/packages/anonymize/src/data/language-scopes.json @@ -0,0 +1,73 @@ +{ + "_comment": "Default dictionary scopes for content language hints. 
Lower-level caller config can still override name corpus languages and deny-list countries independently.", + "languages": { + "cs": { + "nameCorpusLanguages": ["cs", "sk"], + "denyListCountries": ["CZ", "SK"] + }, + "de": { + "nameCorpusLanguages": ["de"], + "denyListCountries": ["DE", "AT", "CH"] + }, + "en": { + "nameCorpusLanguages": ["en"], + "denyListCountries": ["US", "GB", "CA", "AU", "IE"] + }, + "es": { + "nameCorpusLanguages": ["es"], + "denyListCountries": [ + "ES", + "MX", + "AR", + "CL", + "CO", + "PE", + "EC", + "VE", + "UY", + "PY", + "BO", + "CR", + "PA", + "DO", + "GT", + "HN", + "SV", + "NI", + "CU" + ] + }, + "fr": { + "nameCorpusLanguages": ["fr"], + "denyListCountries": ["FR", "BE", "CH", "CA", "LU", "MC"] + }, + "hu": { + "nameCorpusLanguages": ["hu"], + "denyListCountries": ["HU"] + }, + "it": { + "nameCorpusLanguages": ["it"], + "denyListCountries": ["IT", "CH"] + }, + "pl": { + "nameCorpusLanguages": ["pl"], + "denyListCountries": ["PL"] + }, + "pt-br": { + "nameCorpusLanguages": ["pt-br"], + "denyListCountries": ["BR"] + }, + "ro": { + "nameCorpusLanguages": ["ro"], + "denyListCountries": ["RO", "MD"] + }, + "sk": { + "nameCorpusLanguages": ["sk", "cs"], + "denyListCountries": ["SK", "CZ"] + }, + "sv": { + "nameCorpusLanguages": ["sv"], + "denyListCountries": ["SE", "FI"] + } + } +} diff --git a/packages/anonymize/src/data/legal-form-rule-words.json b/packages/anonymize/src/data/legal-form-rule-words.json new file mode 100644 index 00000000..d2d1c4fc --- /dev/null +++ b/packages/anonymize/src/data/legal-form-rule-words.json @@ -0,0 +1,27 @@ +{ + "connectorWords": ["a", "and", "und", "et", "e", "y", "i", "&"], + "andConnectorWords": ["and", "und", "et"], + "inNamePrepositions": ["of", "the"], + "companySuffixWords": [ + "Company", + "Co", + "Bank", + "Brothers", + "Bros", + "Sons", + "Group", + "Holdings", + "Trust", + "Partners", + "Associates", + "Corporation", + "Industries", + "Enterprises", + "Solutions", + "Systems", + "Services", + 
"Foundation", + "Institute" + ], + "commaGatedDirectPrefixes": ["among", "amongst", "between"] +} diff --git a/packages/anonymize/src/data/legal-role-heads.cs.json b/packages/anonymize/src/data/legal-role-heads.cs.json index ffab15d9..8c3debe5 100644 --- a/packages/anonymize/src/data/legal-role-heads.cs.json +++ b/packages/anonymize/src/data/legal-role-heads.cs.json @@ -28,6 +28,12 @@ "dodavatele", "odběratel", "odběratele", + "plátce", + "příjemce", + "uchazeč", + "uchazeče", + "zadavatel", + "zadavatele", "smluvní", "strana", "strany" diff --git a/packages/anonymize/src/data/organization-unit-heads.json b/packages/anonymize/src/data/organization-unit-heads.json new file mode 100644 index 00000000..78e4c8f9 --- /dev/null +++ b/packages/anonymize/src/data/organization-unit-heads.json @@ -0,0 +1,13 @@ +{ + "_comment": "Administrative or organizational unit nouns that can appear in legal prose without denoting a person or a street/city suffix. Lowercased and organized per language.", + "cs": [ + "agentura", + "inspekce", + "kancelář", + "odbor", + "oddělení", + "sekretariát", + "správa", + "úřad" + ] +} diff --git a/packages/anonymize/src/data/person-stopwords.json b/packages/anonymize/src/data/person-stopwords.json index ae496fd5..6fd797b8 100644 --- a/packages/anonymize/src/data/person-stopwords.json +++ b/packages/anonymize/src/data/person-stopwords.json @@ -1,5 +1,7 @@ { "_comment": "Words that are valid in other labels (address, org) but should never be classified as person. Checked only in person chain scoring.", + "cs": ["cena"], + "en": ["dodd-frank"], "words": [ "addendum", "agent", diff --git a/packages/anonymize/src/data/signing-clauses.json b/packages/anonymize/src/data/signing-clauses.json index e8c31718..c4a72d55 100644 --- a/packages/anonymize/src/data/signing-clauses.json +++ b/packages/anonymize/src/data/signing-clauses.json @@ -1,53 +1,69 @@ { - "_comment": "Signing clause patterns. Captures the place name from contract signing locations. 
Each entry: prefix (before place), suffix (after place), prepositions (allowed inside multi-word place names).", + "_comment": "Signing clause patterns. Captures the place name from contract signing locations. Each entry: prefix/suffix build regexes; guardPrefixPhrases/guardSuffixPhrases suppress deny-list place hits in the same signing context.", "patterns": [ { "lang": "cs", "prefix": "(?:V|Ve)\\s+", "suffix": "\\s*,?\\s*dne", - "prepositions": ["nad", "pod", "u", "ve", "na"] + "prepositions": ["nad", "pod", "u", "ve", "na"], + "guardPrefixPhrases": ["v", "ve"], + "guardSuffixPhrases": ["dne"] }, { "lang": "sk", "prefix": "(?:V|Vo)\\s+", "suffix": "\\s*,?\\s*dňa", - "prepositions": ["nad", "pod", "pri"] + "prepositions": ["nad", "pod", "pri"], + "guardPrefixPhrases": ["v", "vo"], + "guardSuffixPhrases": ["dňa"] }, { "lang": "de", "prefix": "", "suffix": "\\s*,\\s*den", - "prepositions": ["am", "an", "im"] + "prepositions": ["am", "an", "im"], + "guardPrefixPhrases": [""], + "guardSuffixPhrases": ["den"] }, { "lang": "fr", "prefix": "(?:Fait\\s+)?[Àà]\\s+", "suffix": "\\s*,?\\s*le", - "prepositions": [] + "prepositions": [], + "guardPrefixPhrases": ["à", "fait à"], + "guardSuffixPhrases": ["le"] }, { "lang": "en", "prefix": "(?:Signed|Executed)\\s+in\\s+", "suffix": "", - "prepositions": [] + "prepositions": [], + "guardPrefixPhrases": ["signed in", "executed in"], + "guardSuffixPhrases": [""] }, { "lang": "pl", "prefix": "(?:W|We)\\s+", "suffix": "\\s*,?\\s*dnia", - "prepositions": ["nad", "pod", "przy"] + "prepositions": ["nad", "pod", "przy"], + "guardPrefixPhrases": ["w", "we"], + "guardSuffixPhrases": ["dnia"] }, { "lang": "it", "prefix": "(?:Fatto\\s+)?[Aa]\\s+", "suffix": "\\s*,?\\s*(?:il|lì)", - "prepositions": [] + "prepositions": [], + "guardPrefixPhrases": ["a", "fatto a"], + "guardSuffixPhrases": ["il", "lì"] }, { "lang": "es", "prefix": "(?:Firmado\\s+)?[Ee]n\\s+", "suffix": "\\s*,?\\s*(?:a|el)", - "prepositions": ["de", "del"] + "prepositions": 
["de", "del"], + "guardPrefixPhrases": ["en", "firmado en"], + "guardSuffixPhrases": ["a", "el"] } ] } diff --git a/packages/anonymize/src/data/triggers.de.json b/packages/anonymize/src/data/triggers.de.json index 2de00fe0..2dafb148 100644 --- a/packages/anonymize/src/data/triggers.de.json +++ b/packages/anonymize/src/data/triggers.de.json @@ -10,7 +10,7 @@ "label": "date of birth", "strategy": { "type": "n-words", - "count": 1 + "count": 3 }, "triggers": ["geboren am", "geb."], "extensions": ["add-colon"] diff --git a/packages/anonymize/src/data/triggers.en.json b/packages/anonymize/src/data/triggers.en.json index e7f5a975..084b0851 100644 --- a/packages/anonymize/src/data/triggers.en.json +++ b/packages/anonymize/src/data/triggers.en.json @@ -65,6 +65,17 @@ "strategy": { "type": "company-id-value" }, "triggers": ["VAT number", "VAT ID", "tax identification number", "tax id"] }, + { + "id": "en-bank-account", + "label": "bank account number", + "strategy": { "type": "company-id-value" }, + "triggers": ["account", "account number", "account no.", "account #"], + "validations": [ + { "type": "has-digits" }, + { "type": "min-length", "min": 5 }, + { "type": "max-length", "max": 34 } + ] + }, { "id": "en-uk-companies-house", "label": "registration number", diff --git a/packages/anonymize/src/detectors/address-seeds.ts b/packages/anonymize/src/detectors/address-seeds.ts index 45878665..5dbeca77 100644 --- a/packages/anonymize/src/detectors/address-seeds.ts +++ b/packages/anonymize/src/detectors/address-seeds.ts @@ -80,7 +80,13 @@ type Seed = { type DictionaryConfig = Record; +export type AddressSeedData = { + boundary_words: string[]; + br_cep_cue_words: string[]; +}; + let cachedBoundaryRe: RegExp | null = null; +let addressSeedDataPromise: Promise | null = null; const loadBoundaryWords = async (): Promise => { try { @@ -91,6 +97,15 @@ const loadBoundaryWords = async (): Promise => { } }; +const loadFieldStopWords = async (): Promise => { + try { + const mod = await 
import("../data/address-stop-keywords.json"); + return mod.default as DictionaryConfig; + } catch { + return {}; + } +}; + // ── pt-BR CEP context gating ──────────────────────── // // The bare `\d{5}-\d{3}` CEP shape collides with non- @@ -339,6 +354,61 @@ export const buildStreetTypePatterns = async (): Promise => { return streetTypePatternsPromise; }; +export const getAddressSeedData = async (): Promise => { + addressSeedDataPromise ??= (async () => { + const [boundaryWords, fieldStopWords, brCueWords] = await Promise.all([ + loadBoundaryWords(), + loadFieldStopWords(), + loadBrCueWords(), + ]); + return { + boundary_words: flattenDictionaries([boundaryWords, fieldStopWords]), + br_cep_cue_words: [...brCueWords], + }; + })(); + return addressSeedDataPromise; +}; + +const flattenDictionaries = ( + configs: readonly DictionaryConfig[], +): string[] => { + const words: string[] = []; + const seen = new Set(); + for (const config of configs) { + for (const word of flattenDictionary(config)) { + const key = word.toLowerCase(); + if (seen.has(key)) { + continue; + } + seen.add(key); + words.push(word); + } + } + return words; +}; + +const flattenDictionary = (config: DictionaryConfig): string[] => { + const words: string[] = []; + const seen = new Set(); + for (const values of Object.values(config)) { + if (!Array.isArray(values)) { + continue; + } + for (const word of values) { + if (typeof word !== "string" || word.length === 0) { + continue; + } + const key = word.toLowerCase(); + if (seen.has(key)) { + continue; + } + seen.add(key); + words.push(word); + } + } + return words; +}; + // ── Seed collection ───────────────────────────────── const collectSeeds = ( diff --git a/packages/anonymize/src/detectors/countries.ts b/packages/anonymize/src/detectors/countries.ts index a21834aa..385a932f 100644 --- a/packages/anonymize/src/detectors/countries.ts +++ b/packages/anonymize/src/detectors/countries.ts @@ -4,6 +4,7 @@ import { DETECTION_SOURCES } from "../constants"; 
import type { Entity } from "../types"; import { normalizeForSearch } from "../util/normalize"; +import ambiguousCountrySurfaces from "../data/ambiguous-country-surfaces.json" with { type: "json" }; import countriesData from "../data/countries.json" with { type: "json" }; const ENTITY_LABEL = "country"; @@ -28,7 +29,7 @@ const INCLUDE_ALPHA2 = false; * All would flag every English occurrence as a country. */ const NAME_BLOCKLIST: ReadonlySet = new Set( - ["man", "island", "indie"].map((s) => s.toLowerCase()), + ambiguousCountrySurfaces.words.map((surface) => surface.toLowerCase()), ); /** diff --git a/packages/anonymize/src/detectors/deny-list.ts b/packages/anonymize/src/detectors/deny-list.ts index 3b396e85..2f7a64bf 100644 --- a/packages/anonymize/src/detectors/deny-list.ts +++ b/packages/anonymize/src/detectors/deny-list.ts @@ -19,6 +19,11 @@ import type { PipelineContext } from "../context"; import { defaultContext } from "../context"; import { loadGenericRoles } from "../filters/false-positives"; import { buildStreetTypePatterns } from "./address-seeds"; +import { + getClauseNounHeadsSync, + getLegalRoleHeadsSync, + warmLegalRoleHeads, +} from "./legal-forms"; import { normalizeForSearch } from "../util/normalize"; import { ALL_UPPER_RE, UPPER_START_RE } from "../util/text"; import { DASH } from "../util/char-groups"; @@ -37,6 +42,33 @@ export type DenyListConfig = Pick< | "enableCountries" >; +const lowerSortedUnique = (values: Iterable): string[] => + [...new Set([...values].map((value) => value.toLowerCase()))].toSorted(); + +const collectLanguageWordValues = (data: Record): string[] => { + const words: string[] = []; + const append = (value: unknown): void => { + if (!Array.isArray(value)) { + return; + } + for (const word of value) { + if (typeof word === "string" && word.length > 0) { + words.push(word); + } + } + }; + + append(data["words"]); + for (const [key, value] of Object.entries(data)) { + if (key === "words" || key.startsWith("_")) { + continue; 
+ } + append(value); + } + + return lowerSortedUnique(words); +}; + // ── Allow list (lazy-loaded from JSON) ─────────────── const loadAllowList = (ctx: PipelineContext): Promise> => { @@ -270,10 +302,12 @@ const loadPersonStopwords = ( } ctx.personStopwordsPromise = (async () => { try { - const mod: { - default?: { words?: string[] }; - } = await import("../data/person-stopwords.json"); - const set: ReadonlySet = new Set(mod.default?.words ?? []); + const mod = await import("../data/person-stopwords.json"); + const parsed = + (mod as { default?: Record }).default ?? mod; + const set: ReadonlySet = new Set( + collectLanguageWordValues(parsed as Record), + ); ctx.personStopwords = set; return set; } catch { @@ -291,6 +325,37 @@ const EMPTY_PERSON_STOPWORDS: ReadonlySet = new Set(); export const getPersonStopwords = (ctx: PipelineContext): ReadonlySet => ctx.personStopwords ?? EMPTY_PERSON_STOPWORDS; +export const loadDefinedTermHeads = ( + ctx: PipelineContext, +): Promise> => { + if (ctx.definedTermHeadsPromise) { + return ctx.definedTermHeadsPromise; + } + ctx.definedTermHeadsPromise = (async () => { + try { + const mod = await import("../data/defined-term-heads.json"); + const parsed = + (mod as { default?: Record }).default ?? mod; + const set: ReadonlySet = new Set( + collectLanguageWordValues(parsed as Record), + ); + ctx.definedTermHeads = set; + return set; + } catch { + const empty: ReadonlySet = new Set(); + ctx.definedTermHeads = empty; + return empty; + } + })(); + return ctx.definedTermHeadsPromise; +}; + +const EMPTY_DEFINED_TERM_HEADS: ReadonlySet = new Set(); + +export const getDefinedTermHeads = ( + ctx: PipelineContext, +): ReadonlySet => ctx.definedTermHeads ?? 
EMPTY_DEFINED_TERM_HEADS; + // ── Address stopwords (single-token city collisions) ── const loadAddressStopwords = ( @@ -424,29 +489,40 @@ const hasAdjacentAddressEvidence = ( type DenyListLanguageFilters = { sentenceStarters?: readonly string[]; - trailingAddressWordExclusions?: readonly string[]; definedTermCues?: readonly string[]; }; +type SigningClauseData = { + patterns: readonly { + guardPrefixPhrases?: readonly string[]; + guardSuffixPhrases?: readonly string[]; + }[]; +}; + export type DenyListFilterData = { stopwords: string[]; allowList: string[]; personStopwords: string[]; + personTrailingNouns: string[]; addressStopwords: string[]; + addressJurisdictionPrefixes: string[]; streetTypes: string[]; firstNames: string[]; genericRoles: string[]; sentenceStarters: string[]; trailingAddressWordExclusions: string[]; definedTermCues: string[]; + signingPlaceGuards: DenyListSigningPlaceGuardData[]; +}; + +export type DenyListSigningPlaceGuardData = { + prefixPhrases: string[]; + suffixPhrases: string[]; }; const DENY_LIST_FILTER_GROUPS: readonly DenyListLanguageFilters[] = Object.values(denyListFiltersByLanguage); -const lowerSortedUnique = (values: Iterable): string[] => - [...new Set([...values].map((value) => value.toLowerCase()))].toSorted(); - const escapeRegExp = (value: string): string => value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); @@ -464,17 +540,11 @@ const DENY_LIST_STATIC_FILTERS = { sentenceStarters: collectLanguageFilterValues( (filters) => filters.sentenceStarters, ), - trailingAddressWordExclusions: collectLanguageFilterValues( - (filters) => filters.trailingAddressWordExclusions, - ), }; const SENTENCE_STARTER_WORDS: ReadonlySet = new Set( DENY_LIST_STATIC_FILTERS.sentenceStarters, ); -const TRAILING_ADDRESS_WORD_EXCLUSIONS: ReadonlySet = new Set( - DENY_LIST_STATIC_FILTERS.trailingAddressWordExclusions, -); const buildDefinedTermCueRe = (): RegExp => { const cues = DENY_LIST_STATIC_FILTERS.definedTermCues.toSorted( @@ -690,11 +760,14 @@ export 
const buildDenyList = async ( loadStopwords(ctx), loadAllowList(ctx), loadPersonStopwords(ctx), + loadDefinedTermHeads(ctx), loadAddressStopwords(ctx), loadCommonWords(), loadMonthNames(), loadStreetTypeRe(), loadGenericRoles(ctx), + warmLegalRoleHeads(), + loadTrailingAddressWordExclusions(), ]); const commonWords = await loadCommonWords(); const monthNames = await loadMonthNames(); @@ -1016,22 +1089,134 @@ type RawMatch = { const buildStreetTypeFilterValues = async (): Promise => lowerSortedUnique(await buildStreetTypePatterns()); +type SigningPlaceFilters = { + guards: DenyListSigningPlaceGuardData[]; +}; + +let signingPlaceFiltersPromise: Promise | null = null; + +const loadSigningPlaceFilters = (): Promise => { + if (signingPlaceFiltersPromise) { + return signingPlaceFiltersPromise; + } + + signingPlaceFiltersPromise = (async () => { + const mod = await import("../data/signing-clauses.json"); + const data: SigningClauseData = mod.default ?? mod; + return { + guards: data.patterns + .map((entry) => ({ + prefixPhrases: lowerSortedUnique(entry.guardPrefixPhrases ?? []), + suffixPhrases: lowerSortedUnique(entry.guardSuffixPhrases ?? []), + })) + .filter( + (entry) => + entry.prefixPhrases.length > 0 && entry.suffixPhrases.length > 0, + ), + }; + })().catch((error) => { + signingPlaceFiltersPromise = null; + throw error; + }); + + return signingPlaceFiltersPromise; +}; + +let trailingAddressWordExclusionsPromise: Promise> | null = + null; +let addressJurisdictionPrefixesPromise: Promise | null = null; + +const loadLanguageWordFile = async ( + importer: () => Promise, +): Promise => { + const mod = await importer(); + const parsed = (mod as { default?: Record }).default ?? 
mod; + return collectLanguageWordValues(parsed as Record); +}; + +const loadTrailingAddressWordExclusions = async (): Promise< + ReadonlySet +> => { + if (trailingAddressWordExclusionsPromise) { + return trailingAddressWordExclusionsPromise; + } + + trailingAddressWordExclusionsPromise = (async () => { + await warmLegalRoleHeads(); + const [organizationUnits, documentHeadings] = await Promise.all([ + loadLanguageWordFile( + () => import("../data/organization-unit-heads.json"), + ), + loadLanguageWordFile( + () => import("../data/document-structure-headings.json"), + ), + ]); + return new Set( + lowerSortedUnique([ + ...getLegalRoleHeadsSync(), + ...getClauseNounHeadsSync(), + ...organizationUnits, + ...documentHeadings, + ]), + ); + })().catch((error) => { + trailingAddressWordExclusionsPromise = null; + throw error; + }); + + return trailingAddressWordExclusionsPromise; +}; + +const loadAddressJurisdictionPrefixes = (): Promise => { + if (addressJurisdictionPrefixesPromise) { + return addressJurisdictionPrefixesPromise; + } + + addressJurisdictionPrefixesPromise = loadLanguageWordFile( + () => import("../data/address-jurisdiction-prefixes.json"), + ).catch((error) => { + addressJurisdictionPrefixesPromise = null; + throw error; + }); + + return addressJurisdictionPrefixesPromise; +}; + const buildDenyListFilterData = async ( ctx: PipelineContext, -): Promise => ({ - stopwords: [...getStopwords(ctx)], - allowList: [...getAllowList(ctx)], - personStopwords: [...getPersonStopwords(ctx)], - addressStopwords: [...getAddressStopwords(ctx)], - streetTypes: await buildStreetTypeFilterValues(), - firstNames: [...getNameCorpusFirstNames(ctx)], - genericRoles: [...(ctx.genericRoles ?? 
EMPTY_GENERIC_ROLES)], - sentenceStarters: [...DENY_LIST_STATIC_FILTERS.sentenceStarters], - trailingAddressWordExclusions: [ - ...DENY_LIST_STATIC_FILTERS.trailingAddressWordExclusions, - ], - definedTermCues: [...DENY_LIST_STATIC_FILTERS.definedTermCues], -}); +): Promise => { + const [ + signingPlaceFilters, + trailingAddressWordExclusions, + addressJurisdictionPrefixes, + ] = await Promise.all([ + loadSigningPlaceFilters(), + loadTrailingAddressWordExclusions(), + loadAddressJurisdictionPrefixes(), + ]); + + return { + stopwords: [...getStopwords(ctx)], + allowList: [...getAllowList(ctx)], + personStopwords: [...getPersonStopwords(ctx)], + personTrailingNouns: [...getDefinedTermHeads(ctx)], + addressStopwords: [...getAddressStopwords(ctx)], + addressJurisdictionPrefixes, + streetTypes: await buildStreetTypeFilterValues(), + firstNames: [...getNameCorpusFirstNames(ctx)], + genericRoles: [ + ...(ctx.genericRoles ?? EMPTY_GENERIC_ROLES), + ...getLegalRoleHeadsSync(), + ], + sentenceStarters: [...DENY_LIST_STATIC_FILTERS.sentenceStarters], + trailingAddressWordExclusions: [...trailingAddressWordExclusions], + definedTermCues: [...DENY_LIST_STATIC_FILTERS.definedTermCues], + signingPlaceGuards: signingPlaceFilters.guards.map((entry) => ({ + prefixPhrases: [...entry.prefixPhrases], + suffixPhrases: [...entry.suffixPhrases], + })), + }; +}; const customMatchHasValidEdges = ( fullText: string, @@ -1074,9 +1259,13 @@ export const ensureDenyListData = async ( loadStopwords(ctx), loadAllowList(ctx), loadPersonStopwords(ctx), + loadDefinedTermHeads(ctx), loadAddressStopwords(ctx), loadStreetTypeRe(), loadGenericRoles(ctx), + warmLegalRoleHeads(), + loadTrailingAddressWordExclusions(), + loadAddressJurisdictionPrefixes(), ]); }; @@ -1369,7 +1558,11 @@ export const processDenyListMatches = ( // "Praha 1", "Brno 2"). Czech and Slovak cities // commonly have numbered districts that are part of // the address. 
- extendCityDistricts(results, fullText); + extendCityDistricts( + results, + fullText, + new Set(data.filters.trailingAddressWordExclusions), + ); return results; }; @@ -1401,7 +1594,11 @@ const POSTAL_PREFIX_RE = new RegExp( `(?:\\d{5}|\\d{3}\\s\\d{2})\\s*${DASH}?\\s*$`, ); -const extendCityDistricts = (entities: Entity[], fullText: string): void => { +const extendCityDistricts = ( + entities: Entity[], + fullText: string, + trailingAddressWordExclusions: ReadonlySet, +): void => { for (const entity of entities) { if (entity.label !== "address") { continue; @@ -1451,7 +1648,7 @@ const extendCityDistricts = (entities: Entity[], fullText: string): void => { const trailingWordM = /^[\s]{1,4}(\p{Lu}\p{Ll}+)/u.exec(afterExt); if (trailingWordM && !trailingWordM[0].includes("\n")) { const candidate = (trailingWordM[1] ?? "").toLowerCase(); - if (!TRAILING_ADDRESS_WORD_EXCLUSIONS.has(candidate)) { + if (!trailingAddressWordExclusions.has(candidate)) { entity.end += trailingWordM[0].length; entity.text = fullText.slice(entity.start, entity.end); } diff --git a/packages/anonymize/src/detectors/legal-forms.ts b/packages/anonymize/src/detectors/legal-forms.ts index bfd0dad3..1e74d055 100644 --- a/packages/anonymize/src/detectors/legal-forms.ts +++ b/packages/anonymize/src/detectors/legal-forms.ts @@ -157,7 +157,7 @@ const loadLeadingClauseTrims = async (): Promise => { return leadingClauseTrimsPromise; }; -const getLeadingClauseTrimsSync = (): LeadingClauseTrims => +export const getLeadingClauseTrimsSync = (): LeadingClauseTrims => leadingClauseTrimsCache ?? EMPTY_LEADING_CLAUSE_TRIMS; // Generic legal/contract role words that should never appear @@ -301,7 +301,7 @@ const loadAllLegalSuffixes = async (): Promise => { const getAllLegalSuffixesSync = (): readonly string[] => allLegalSuffixesCache ?? 
LEGAL_SUFFIXES; -const getNormalizedLegalBoundarySuffixesSync = (): ReadonlySet => +export const getNormalizedLegalBoundarySuffixesSync = (): ReadonlySet => normalizedLegalBoundarySuffixesCache ?? new Set( LEGAL_SUFFIXES.map(normalizeLegalSuffixToken).filter( @@ -309,7 +309,7 @@ const getNormalizedLegalBoundarySuffixesSync = (): ReadonlySet => ), ); -const getNormalizedInNameLegalFormWordsSync = (): ReadonlySet => +export const getNormalizedInNameLegalFormWordsSync = (): ReadonlySet => normalizedInNameLegalFormWordsCache ?? new Set(); /** @@ -418,7 +418,7 @@ const loadConnectorProseHeads = async (): Promise> => { return connectorProseHeadsPromise; }; -const getConnectorProseHeadsSync = (): ReadonlySet => +export const getConnectorProseHeadsSync = (): ReadonlySet => connectorProseHeadsCache ?? new Set(); let structuralSingleCapPrefixesCache: ReadonlySet | null = null; @@ -473,7 +473,7 @@ const loadStructuralSingleCapPrefixes = async (): Promise< return structuralSingleCapPrefixesPromise; }; -const getStructuralSingleCapPrefixesSync = (): ReadonlySet => +export const getStructuralSingleCapPrefixesSync = (): ReadonlySet => structuralSingleCapPrefixesCache ?? new Set(); // Used by the trim helpers below to escape literal suffix tokens diff --git a/packages/anonymize/src/detectors/regex.ts b/packages/anonymize/src/detectors/regex.ts index d3123242..29ece50b 100644 --- a/packages/anonymize/src/detectors/regex.ts +++ b/packages/anonymize/src/detectors/regex.ts @@ -164,8 +164,10 @@ export type RegexMeta = { minByteLength?: number; /** Post-match stdnum validator for confirmation. */ validator?: Validator; + validatorId?: string; /** Extract the identifier portion when context is part of the regex span. 
*/ validatorInput?: (text: string) => string; + validatorInputKind?: "digits-only" | "crypto-wallet-candidate"; }; type RegexDef = { @@ -174,10 +176,16 @@ type RegexDef = { score: number; minByteLength?: number; validator?: Validator; + validatorId?: string; validatorInput?: (text: string) => string; + validatorInputKind?: "digits-only" | "crypto-wallet-candidate"; }; type AmountWordsConfig = { + patterns?: Array<{ + lang: string; + keywords: string[]; + }>; percentages?: Array<{ lang: string; keywords: string[]; @@ -202,6 +210,92 @@ type AmountWordsConfig = { const AMOUNT_WORDS = amountWordsConfig as AmountWordsConfig; +const DIGITS_ONLY_VALIDATOR_INPUT = (text: string): string => + text.replace(/\D/g, ""); + +const VALIDATOR_IDS = new Map([ + [at.businessid, "at.businessid"], + [at.tin, "at.tin"], + [at.uid, "at.uid"], + [au.abn, "au.abn"], + [au.acn, "au.acn"], + [be.nn, "be.nn"], + [be.vat, "be.vat"], + [bg.vat, "bg.vat"], + [br.cnpj, "br.cnpj"], + [br.cpf, "br.cpf"], + [ch.uid, "ch.uid"], + [cn.ric, "cn.ric"], + [crypto.wallet, "crypto.wallet"], + [cy.vat, "cy.vat"], + [cz.dic, "cz.dic"], + [cz.rc, "cz.rc"], + [de.idnr, "de.idnr"], + [de.stnr, "de.stnr"], + [de.svnr, "de.svnr"], + [de.vat, "de.vat"], + [dk.cpr, "dk.cpr"], + [dk.vat, "dk.vat"], + [ee.ik, "ee.ik"], + [ee.vat, "ee.vat"], + [es.cif, "es.cif"], + [es.dni, "es.dni"], + [es.nie, "es.nie"], + [es.nss, "es.nss"], + [es.vat, "es.vat"], + [fi.hetu, "fi.hetu"], + [fi.vat, "fi.vat"], + [fi.ytunnus, "fi.ytunnus"], + [fr.nir, "fr.nir"], + [fr.siren, "fr.siren"], + [fr.siret, "fr.siret"], + [fr.tva, "fr.tva"], + [gb.nhs, "gb.nhs"], + [gb.nino, "gb.nino"], + [gb.vat, "gb.vat"], + [gr.vat, "gr.vat"], + [hr.vat, "hr.vat"], + [hu.vat, "hu.vat"], + [ie.pps, "ie.pps"], + [ie.vat, "ie.vat"], + [it.codiceFiscale, "it.codiceFiscale"], + [it.iva, "it.iva"], + [lt.asmens, "lt.asmens"], + [lt.vat, "lt.vat"], + [lu.vat, "lu.vat"], + [lv.vat, "lv.vat"], + [mt.vat, "mt.vat"], + [nl.vat, "nl.vat"], + [no.mva, "no.mva"], + 
[no.orgnr, "no.orgnr"], + [pl.nip, "pl.nip"], + [pl.pesel, "pl.pesel"], + [pt.cc, "pt.cc"], + [pt.vat, "pt.vat"], + [ro.cnp, "ro.cnp"], + [ro.vat, "ro.vat"], + [se.personnummer, "se.personnummer"], + [si.vat, "si.vat"], + [sk.dic, "sk.dic"], + [us.ein, "us.ein"], +]); + +export const NATIVE_REGEX_VALIDATOR_IDS: ReadonlySet = new Set([ + "au.abn", + "br.cnpj", + "br.cpf", + "cz.dic", + "cz.rc", + "es.cif", + "es.dni", + "es.nie", + "gb.nhs", + "gb.nino", + "no.mva", + "no.orgnr", + "us.ein", +]); + // ── stdnum validator entries ──────────────────────── // Each entry pairs a @stll/stdnum validator with a // label and confidence score. The pattern derived via @@ -564,7 +658,7 @@ const CZ_BIRTH_NUMBER: RegexDef = { // Czech commercial-register reference. Every Czech // legal entity in the public registry is uniquely -// identified by a registry section letter ("oddíl X") +// identified by a registry section code ("oddíl X") // plus an insert number ("vložka NNN"). The full phrase // uniquely identifies the company, so we emit it as a // single registration-number entity rather than only @@ -575,11 +669,11 @@ const CZ_BIRTH_NUMBER: RegexDef = { // - optional whitespace around comma and after each // keyword (DOCX exports add NBSPs and double // spaces); -// - section letter is a single A-Z; insert number is -// a 1-6 digit integer. +// - section code is a short letter code; insert number +// is a 1-6 digit integer. 
const CZ_COMMERCIAL_REGISTER: RegexDef = { pattern: - `(?i)\\boddíl[^\\S\\n]+[A-Z]` + + `(?i)\\boddíl[^\\S\\n]+\\p{L}{1,3}` + `[^\\S\\n]*,[^\\S\\n]*` + `vložka[^\\S\\n]+\\d{1,6}\\b`, label: "registration number", @@ -695,7 +789,8 @@ const NHS_NUMBER_CONTEXT: RegexDef = { label: "national identification number", score: 0.95, validator: gb.nhs, - validatorInput: (text) => text.replace(/\D/g, ""), + validatorInput: DIGITS_ONLY_VALIDATOR_INPUT, + validatorInputKind: "digits-only", }; const PASSPORT_CONTEXT: RegexDef = { @@ -802,6 +897,7 @@ const CRYPTO_WALLET_ADDRESS: RegexDef = { score: 0.85, validator: crypto.wallet, validatorInput: getCryptoWalletCandidate, + validatorInputKind: "crypto-wallet-candidate", }; const AU_ABN_FORMATTED: RegexDef = { @@ -1165,12 +1261,21 @@ export const REGEX_META: readonly RegexMeta[] = ALL_REGEX_DEFS.map( }; if (d.validator) { meta.validator = d.validator; + const validatorId = d.validatorId ?? VALIDATOR_IDS.get(d.validator); + if (!validatorId) { + throw new Error(`Missing regex validator id for ${d.label}`); + } + meta.validatorId = validatorId; } if (d.minByteLength) { meta.minByteLength = d.minByteLength; } if (d.validatorInput) { meta.validatorInput = d.validatorInput; + if (!d.validatorInputKind) { + throw new Error(`Missing regex validator input kind for ${d.label}`); + } + meta.validatorInputKind = d.validatorInputKind; } return meta; }, @@ -1185,6 +1290,9 @@ export const REGEX_META: readonly RegexMeta[] = ALL_REGEX_DEFS.map( */ type DateMonths = Record; +export type DateMonthData = Record; +export type YearWordData = Record; + /** * Build month-name alternation from date-months.json. 
* Deduplicates across all 22 languages, filters names @@ -1213,6 +1321,18 @@ const buildMonthAlternation = (months: DateMonths): string => { .join("|"); }; +const buildDateMonthData = (months: DateMonths): DateMonthData => { + const result: DateMonthData = {}; + for (const [key, value] of Object.entries(months)) { + if (key.startsWith("_")) continue; + const names = Array.isArray(value) ? value : [value]; + result[key] = names.filter( + (name) => name.replace(/\.$/, "").length >= MIN_MONTH_NAME_LENGTH, + ); + } + return result; +}; + /** * Build date patterns from a month-name alternation. * Returns 6 patterns covering the major written-date @@ -1248,13 +1368,19 @@ const buildDatePatternsFromMonths = (alt: string): string[] => { /** Cached promise for date patterns. Loaded once. */ let datePatternPromise: Promise | null = null; +let dateMonthDataPromise: Promise | null = null; +let yearWordDataPromise: Promise | null = null; -const loadDatePatterns = async (): Promise => { +const loadDateMonths = async (): Promise => { const mod = await import("../data/date-months.json"); // Dynamic import of JSON returns { default, ...keys }. // Use `default` if present (ESM wrapper), else the // module itself. - const months: DateMonths = mod.default ?? mod; + return mod.default ?? mod; +}; + +const loadDatePatterns = async (): Promise => { + const months = await loadDateMonths(); const alt = buildMonthAlternation(months); return buildDatePatternsFromMonths(alt); }; @@ -1274,6 +1400,35 @@ export const getDatePatterns = (): Promise => { return datePatternPromise; }; +export const getDateMonthData = (): Promise => { + if (!dateMonthDataPromise) { + dateMonthDataPromise = loadDateMonths() + .then(buildDateMonthData) + .catch((err) => { + dateMonthDataPromise = null; + throw err; + }); + } + return dateMonthDataPromise; +}; + +export const getYearWordData = (): Promise => { + yearWordDataPromise ??= import("../data/year-words.json").then((mod) => { + const data = (mod.default ?? 
mod) as Record; + const result: YearWordData = {}; + for (const [key, words] of Object.entries(data)) { + if (key.startsWith("_") || !Array.isArray(words)) { + continue; + } + result[key] = words.filter( + (word): word is string => typeof word === "string" && word.length > 0, + ); + } + return result; + }); + return yearWordDataPromise; +}; + /** Date pattern metadata (all are score 1 dates). */ export const DATE_PATTERN_META: Readonly = Object.freeze({ label: "date", @@ -1292,6 +1447,28 @@ type CurrenciesData = { localNames?: string[]; }; +export type MonetaryData = { + currencies: { + codes: string[]; + symbols: string[]; + local_names: string[]; + }; + amount_words: { + written_amount_patterns: Array<{ + keywords: string[]; + }>; + magnitude_suffixes: Array<{ + words: string[]; + abbreviations_case_insensitive: string[]; + abbreviations_case_sensitive: string[]; + }>; + share_quantity_terms: Array<{ + modifiers: string[]; + nouns: string[]; + }>; + }; +}; + type FinancialLexicons = { magnitudeOptional: string; magnitudeRequired: string; @@ -1728,6 +1905,7 @@ const buildCurrencyPatternEntries = ( /** Cached promise for currency patterns. Loaded once. */ let currencyPatternPromise: Promise | null = null; let currencyPatternEntryPromise: Promise | null = null; +let monetaryDataPromise: Promise | null = null; const loadCurrencyPatternEntries = async (): Promise< CurrencyPatternEntry[] @@ -1740,6 +1918,37 @@ const loadCurrencyPatternEntries = async (): Promise< const loadCurrencyPatterns = async (): Promise => (await loadCurrencyPatternEntries()).map((entry) => entry.pattern); +const loadMonetaryData = async (): Promise => { + const mod = await import("../data/currencies.json"); + const currencies: CurrenciesData = mod.default ?? mod; + return { + currencies: { + codes: currencies.codes, + symbols: currencies.symbols, + local_names: currencies.localNames ?? [], + }, + amount_words: { + written_amount_patterns: (AMOUNT_WORDS.patterns ?? 
[]).map((entry) => ({ + keywords: entry.keywords, + })), + magnitude_suffixes: (AMOUNT_WORDS.magnitudeSuffixes ?? []).map( + (entry) => ({ + words: entry.words ?? [], + abbreviations_case_insensitive: + entry.abbreviationsCaseInsensitive ?? [], + abbreviations_case_sensitive: entry.abbreviationsCaseSensitive ?? [], + }), + ), + share_quantity_terms: (AMOUNT_WORDS.shareQuantityTerms ?? []).map( + (entry) => ({ + modifiers: entry.modifiers ?? [], + nouns: entry.nouns, + }), + ), + }, + }; +}; + /** * Get dynamically built monetary amount patterns from * currencies.json. Returns a cached promise; the JSON @@ -1767,6 +1976,16 @@ export const getCurrencyPatternEntries = (): Promise< return currencyPatternEntryPromise; }; +export const getMonetaryData = (): Promise => { + if (!monetaryDataPromise) { + monetaryDataPromise = loadMonetaryData().catch((err) => { + monetaryDataPromise = null; + throw err; + }); + } + return monetaryDataPromise; +}; + /** Currency pattern metadata (score 0.9). */ export const CURRENCY_PATTERN_META: Readonly = Object.freeze({ label: "monetary amount", @@ -1854,6 +2073,8 @@ type SigningClauseConfig = { prefix: string; suffix: string; prepositions: string[]; + guardPrefixPhrases?: string[]; + guardSuffixPhrases?: string[]; }>; }; @@ -1902,6 +2123,7 @@ export const SIGNING_CLAUSE_META: Readonly = { }; let signingPatternPromise: Promise | null = null; +let nativeSigningPatternPromise: Promise | null = null; const loadSigningPatterns = async (): Promise => { const mod = await import("../data/signing-clauses.json"); @@ -1909,6 +2131,14 @@ const loadSigningPatterns = async (): Promise => { return buildSigningClausePatterns(data); }; +const loadNativeSigningPatterns = async (): Promise => { + const mod = await import("../data/signing-clauses.json"); + const data: SigningClauseConfig = mod.default ?? 
mod; + return buildSigningClausePatterns({ + patterns: data.patterns.filter((entry) => entry.lang === "de"), + }); +}; + export const getSigningClausePatterns = (): Promise => { if (!signingPatternPromise) { signingPatternPromise = loadSigningPatterns().catch((err) => { @@ -1918,3 +2148,13 @@ export const getSigningClausePatterns = (): Promise => { } return signingPatternPromise; }; + +export const getNativeSigningClausePatterns = (): Promise => { + if (!nativeSigningPatternPromise) { + nativeSigningPatternPromise = loadNativeSigningPatterns().catch((err) => { + nativeSigningPatternPromise = null; + throw err; + }); + } + return nativeSigningPatternPromise; +}; diff --git a/packages/anonymize/src/detectors/triggers.ts b/packages/anonymize/src/detectors/triggers.ts index a81d9ac1..3510328c 100644 --- a/packages/anonymize/src/detectors/triggers.ts +++ b/packages/anonymize/src/detectors/triggers.ts @@ -168,6 +168,7 @@ const compileValidations = ( } return { type: "valid-id", + validator: v.validator, check: (value) => { // stdnum validators expect compact digits only; // strip formatting (spaces, dots, dashes, @@ -637,14 +638,13 @@ const loadAddressStopKeywords = async (): Promise => { return addressStopKeywordsPromise; }; -const getAddressStopKeywordsSync = (): readonly string[] => +export const getAddressStopKeywordsSync = (): readonly string[] => addressStopKeywordsCache ?? ADDRESS_STOP_KEYWORDS_SEED; /** - * Warm the address-stop-keywords cache. Pipeline callers + * Warm address support data. Pipeline callers * await this before invoking trigger detection so the - * synchronous `extractValue` path uses the merged list - * instead of the seed fallback. + * synchronous `extractValue` path uses merged data. */ export const warmAddressStopKeywords = async (): Promise => { await loadAddressStopKeywords(); @@ -1128,7 +1128,10 @@ const extractValue = ( const idRaw = trailingLetterMatch ? 
idMatch[0] + trailingLetterMatch[0] : idMatch[0]; - const idText = idRaw.trim(); + const idText = idRaw.trim().replace(/[.,;:!?]+$/u, ""); + if (idText.length === 0) { + return null; + } const leadingSpaces = idMatch[0].length - idMatch[0].trimStart().length; const idStart = triggerEnd + diff --git a/packages/anonymize/src/filters/false-positives.ts b/packages/anonymize/src/filters/false-positives.ts index c9966061..f73a7879 100644 --- a/packages/anonymize/src/filters/false-positives.ts +++ b/packages/anonymize/src/filters/false-positives.ts @@ -2,7 +2,10 @@ import type { Entity } from "../types"; import type { PipelineContext } from "../context"; import { defaultContext } from "../context"; import { isCallerOwnedEntity } from "../util/entity-source"; -import { getPersonStopwords } from "../detectors/deny-list"; +import { + getDefinedTermHeads, + getPersonStopwords, +} from "../detectors/deny-list"; import { normalizeHomoglyphs } from "../util/homoglyphs"; const TEMPLATE_PLACEHOLDER_RE = /^(?:\.{3,}|_{3,}|\[[\w\s]+\]|\{[\w\s]+\})$/; @@ -182,11 +185,6 @@ const STANDALONE_YEAR_RE = /^(?:19|20)\d{2}$/; // by one of these, it's a reference number, not PII. const NUMBER_ABBREV_RE = /(?:^|[\s(])(?:č|čís|nr|no|n)\.\s*$/i; const SIGNING_CLAUSE_ADDRESS_RE = /^(?:v|ve)\s+[^\d,\n]{1,40},?\s+dne$/iu; -const PERSON_TRAILING_NOUNS: ReadonlySet = new Set([ - "association", - "period", - "reform", -]); const LEGAL_FORM_HEADING_RE = /\b(?:agreement|amendment|contract|exhibit)\b/iu; const LEADING_ARTIFACT_RE = /^(?:\.\s)+/u; const ADDRESS_ROLE_PREFIX_RE = @@ -506,6 +504,7 @@ export const filterFalsePositives = ( ): Entity[] => { const filtered: Entity[] = []; const roles = getGenericRoles(ctx); + const definedTermHeads = getDefinedTermHeads(ctx); for (const entity of entities) { if (isCallerOwnedEntity(entity)) { @@ -663,11 +662,7 @@ export const filterFalsePositives = ( const lastFolded = last ? 
normalizeHomoglyphs(last).toLowerCase() : undefined; - if ( - tokens.length > 1 && - lastFolded && - PERSON_TRAILING_NOUNS.has(lastFolded) - ) { + if (tokens.length > 1 && lastFolded && definedTermHeads.has(lastFolded)) { continue; } } diff --git a/packages/anonymize/src/language-scope.ts b/packages/anonymize/src/language-scope.ts new file mode 100644 index 00000000..7283345d --- /dev/null +++ b/packages/anonymize/src/language-scope.ts @@ -0,0 +1,86 @@ +import languageScopes from "./data/language-scopes.json"; + +import type { PipelineConfig } from "./types"; + +type LanguageScope = { + nameCorpusLanguages?: readonly string[]; + denyListCountries?: readonly string[]; +}; + +type LanguageScopeData = { + languages: Record; +}; + +const scopeData = languageScopes as LanguageScopeData; + +const normalizeLanguage = (language: string): string => + language.trim().toLowerCase(); + +const fallbackLanguage = (language: string): string | null => { + const index = language.indexOf("-"); + return index === -1 ? null : language.slice(0, index); +}; + +const uniquePush = (target: string[], values: readonly string[]): void => { + const seen = new Set(target); + for (const value of values) { + if (seen.has(value)) { + continue; + } + seen.add(value); + target.push(value); + } +}; + +const resolveLanguageScope = (language: string): LanguageScope | null => { + const normalized = normalizeLanguage(language); + if (normalized.length === 0) { + return null; + } + const exact = scopeData.languages[normalized]; + if (exact !== undefined) { + return exact; + } + const fallback = fallbackLanguage(normalized); + return fallback === null ? null : (scopeData.languages[fallback] ?? null); +}; + +const configuredLanguages = (config: PipelineConfig): readonly string[] => { + if (config.languages !== undefined) { + return config.languages; + } + return config.language === undefined ? 
[] : [config.language]; +}; + +export const applyPipelineLanguageScope = ( + config: PipelineConfig, +): PipelineConfig => { + const languages = configuredLanguages(config); + if (languages.length === 0) { + return config; + } + + const nameCorpusLanguages: string[] = []; + const denyListCountries: string[] = []; + for (const language of languages) { + const scope = resolveLanguageScope(language); + if (scope === null) { + continue; + } + uniquePush(nameCorpusLanguages, scope.nameCorpusLanguages ?? []); + uniquePush(denyListCountries, scope.denyListCountries ?? []); + } + + const next: Partial = {}; + if ( + config.nameCorpusLanguages === undefined && + nameCorpusLanguages.length > 0 + ) { + next.nameCorpusLanguages = nameCorpusLanguages; + } + if (config.denyListCountries === undefined && denyListCountries.length > 0) { + next.denyListCountries = denyListCountries; + } + + return Object.keys(next).length === 0 ? config : { ...config, ...next }; +}; diff --git a/packages/anonymize/src/pipeline.ts b/packages/anonymize/src/pipeline.ts index b0773d9d..0dd0bc53 100644 --- a/packages/anonymize/src/pipeline.ts +++ b/packages/anonymize/src/pipeline.ts @@ -19,6 +19,7 @@ import { } from "./detectors/triggers"; import { ensureDenyListData, + loadDefinedTermHeads, processDenyListMatches, } from "./detectors/deny-list"; import { processAddressSeeds } from "./detectors/address-seeds"; @@ -69,6 +70,7 @@ import { runUnifiedSearch } from "./unified-search"; import { maskDetectedSpans, unmaskNerEntities } from "./util/entity-masking"; import type { PipelineContext } from "./context"; import { defaultContext } from "./context"; +import { applyPipelineLanguageScope } from "./language-scope"; /** * Sources backed by curated literal dictionaries. @@ -1027,7 +1029,11 @@ export const preparePipelineSearch = ({ gazetteerEntries = [], context, }: PipelineSearchOptions): Promise => - getCachedSearch(config, gazetteerEntries, context ?? 
defaultContext); + getCachedSearch( + applyPipelineLanguageScope(config), + gazetteerEntries, + context ?? defaultContext, + ); /** * Options for {@link runPipeline}. @@ -1068,7 +1074,7 @@ export const runPipeline = async ( ): Promise => { const { fullText, - config, + config: inputConfig, gazetteerEntries, nerInference = null, onProgress, @@ -1076,6 +1082,7 @@ export const runPipeline = async ( signal, context, } = options; + const config = applyPipelineLanguageScope(inputConfig); const ctx = context ?? defaultContext; const allowedLabels = createAllowedLabelSet(config); const legalFormsEnabled = isLegalFormsEnabled(config); @@ -1113,6 +1120,7 @@ export const runPipeline = async ( }); await Promise.all([ loadGenericRoles(ctx), + loadDefinedTermHeads(ctx), loadDocumentStructureHeadings(), initPrepositions(), initStreetAbbrevs(), @@ -1124,6 +1132,7 @@ export const runPipeline = async ( } else { await Promise.all([ loadGenericRoles(ctx), + loadDefinedTermHeads(ctx), loadDocumentStructureHeadings(), initPrepositions(), initStreetAbbrevs(), diff --git a/packages/anonymize/src/types.ts b/packages/anonymize/src/types.ts index eb0ae363..4eadd4e4 100644 --- a/packages/anonymize/src/types.ts +++ b/packages/anonymize/src/types.ts @@ -183,7 +183,11 @@ export type CompiledValidation = | { type: "no-digits"; re: RegExp } | { type: "has-digits"; re: RegExp } | { type: "matches-pattern"; re: RegExp } - | { type: "valid-id"; check: (value: string) => boolean }; + | { + type: "valid-id"; + validator: ValidIdValidator; + check: (value: string) => boolean; + }; /** * Runtime rule — one per trigger string after @@ -355,6 +359,18 @@ export type PipelineConfig = { threshold: number; enableTriggerPhrases: boolean; enableRegex: boolean; + /** + * Expected content language codes. When present, these + * derive default dictionary scopes for name corpus and + * deny-list matching unless the lower-level scope fields + * below are set explicitly. 
+ */ + languages?: string[]; + /** + * Convenience form for single-language documents. Ignored + * when `languages` is also provided. + */ + language?: string; /** * Enables legal-form organization detection. * Required for typed callers; legacy untyped diff --git a/packages/data/config/address-boundaries.json b/packages/data/config/address-boundaries.json index 12c5cc35..5d71897f 100644 --- a/packages/data/config/address-boundaries.json +++ b/packages/data/config/address-boundaries.json @@ -5,6 +5,9 @@ "jednajícím", "jejímž jménem", "kontaktní osoba", + "nebude-li", + "nebudou-li", + "pokud", "zapsán", "zapsaná", "zapsané", @@ -35,6 +38,7 @@ "shall govern", "shall be governed", "to be enforced", + "with a copy", "with the intention", "without reference", "without regard" diff --git a/packages/data/config/address-jurisdiction-prefixes.json b/packages/data/config/address-jurisdiction-prefixes.json new file mode 100644 index 00000000..f26e4fc1 --- /dev/null +++ b/packages/data/config/address-jurisdiction-prefixes.json @@ -0,0 +1,4 @@ +{ + "_comment": "Address-like jurisdiction prefixes that are valid location/address spans without digits or street-type words. Lowercased and organized per language.", + "en": ["commonwealth of", "district of", "state of", "territory of"] +} diff --git a/packages/data/config/address-stop-keywords.json b/packages/data/config/address-stop-keywords.json index 98d9de5a..f7a0180c 100644 --- a/packages/data/config/address-stop-keywords.json +++ b/packages/data/config/address-stop-keywords.json @@ -17,6 +17,16 @@ "ičo", "ič" ], + "de": [ + "bank", + "bic", + "iban", + "steuer-id", + "steueridentifikationsnummer", + "steuernummer", + "ust-idnr", + "ust-idnr." 
+ ], "en": ["e-mail", "email", "tel", "swift", "iban", "bic"], "pl": [ "nip", diff --git a/packages/data/config/ambiguous-country-surfaces.json b/packages/data/config/ambiguous-country-surfaces.json new file mode 100644 index 00000000..04962dd5 --- /dev/null +++ b/packages/data/config/ambiguous-country-surfaces.json @@ -0,0 +1,4 @@ +{ + "_comment": "Country surface forms that collide with much more common non-country usage. Full country names and aliases remain registered separately when present.", + "words": ["indie", "island", "man", "norfolk"] +} diff --git a/packages/data/config/clause-noun-heads.json b/packages/data/config/clause-noun-heads.json index 937dc32a..b11bad4f 100644 --- a/packages/data/config/clause-noun-heads.json +++ b/packages/data/config/clause-noun-heads.json @@ -33,7 +33,11 @@ "přílohu", "dodatek", "dodatku", - "oznámení" + "článek", + "oznámení", + "podmínky", + "předmět", + "ustanovení" ], "de": [ "vertrag", diff --git a/packages/data/config/defined-term-heads.json b/packages/data/config/defined-term-heads.json new file mode 100644 index 00000000..aa1fffc2 --- /dev/null +++ b/packages/data/config/defined-term-heads.json @@ -0,0 +1,4 @@ +{ + "_comment": "Common head nouns for capitalized defined/legal concepts. These are not person names by themselves; detector-specific filters assemble this vocabulary where needed. 
Lowercased and organized per language.", + "en": ["association", "period", "reform"] +} diff --git a/packages/data/config/deny-list-filters.json b/packages/data/config/deny-list-filters.json new file mode 100644 index 00000000..51b89152 --- /dev/null +++ b/packages/data/config/deny-list-filters.json @@ -0,0 +1,48 @@ +{ + "en": { + "definedTermCues": [ + "mean", + "means", + "shall mean", + "shall means", + "shall have the meaning", + "shall have the meanings", + "refer to", + "refers to", + "has the meaning", + "has the meanings", + "is defined" + ], + "sentenceStarters": [ + "the", + "this", + "these", + "those", + "an", + "any", + "all", + "each", + "every", + "no", + "now", + "whereas", + "whereby", + "wherein", + "whereof", + "notwithstanding", + "subject", + "in", + "on", + "at", + "by", + "for", + "if", + "upon", + "unless", + "until", + "provided", + "pursuant", + "such" + ] + } +} diff --git a/packages/data/config/language-scopes.json b/packages/data/config/language-scopes.json new file mode 100644 index 00000000..5d9b85a5 --- /dev/null +++ b/packages/data/config/language-scopes.json @@ -0,0 +1,73 @@ +{ + "_comment": "Default dictionary scopes for content language hints. 
Lower-level caller config can still override name corpus languages and deny-list countries independently.", + "languages": { + "cs": { + "nameCorpusLanguages": ["cs", "sk"], + "denyListCountries": ["CZ", "SK"] + }, + "de": { + "nameCorpusLanguages": ["de"], + "denyListCountries": ["DE", "AT", "CH"] + }, + "en": { + "nameCorpusLanguages": ["en"], + "denyListCountries": ["US", "GB", "CA", "AU", "IE"] + }, + "es": { + "nameCorpusLanguages": ["es"], + "denyListCountries": [ + "ES", + "MX", + "AR", + "CL", + "CO", + "PE", + "EC", + "VE", + "UY", + "PY", + "BO", + "CR", + "PA", + "DO", + "GT", + "HN", + "SV", + "NI", + "CU" + ] + }, + "fr": { + "nameCorpusLanguages": ["fr"], + "denyListCountries": ["FR", "BE", "CH", "CA", "LU", "MC"] + }, + "hu": { + "nameCorpusLanguages": ["hu"], + "denyListCountries": ["HU"] + }, + "it": { + "nameCorpusLanguages": ["it"], + "denyListCountries": ["IT", "CH"] + }, + "pl": { + "nameCorpusLanguages": ["pl"], + "denyListCountries": ["PL"] + }, + "pt-br": { + "nameCorpusLanguages": ["pt-br"], + "denyListCountries": ["BR"] + }, + "ro": { + "nameCorpusLanguages": ["ro"], + "denyListCountries": ["RO", "MD"] + }, + "sk": { + "nameCorpusLanguages": ["sk", "cs"], + "denyListCountries": ["SK", "CZ"] + }, + "sv": { + "nameCorpusLanguages": ["sv"], + "denyListCountries": ["SE", "FI"] + } + } +} diff --git a/packages/data/config/legal-form-rule-words.json b/packages/data/config/legal-form-rule-words.json new file mode 100644 index 00000000..d2d1c4fc --- /dev/null +++ b/packages/data/config/legal-form-rule-words.json @@ -0,0 +1,27 @@ +{ + "connectorWords": ["a", "and", "und", "et", "e", "y", "i", "&"], + "andConnectorWords": ["and", "und", "et"], + "inNamePrepositions": ["of", "the"], + "companySuffixWords": [ + "Company", + "Co", + "Bank", + "Brothers", + "Bros", + "Sons", + "Group", + "Holdings", + "Trust", + "Partners", + "Associates", + "Corporation", + "Industries", + "Enterprises", + "Solutions", + "Systems", + "Services", + "Foundation", + 
"Institute" + ], + "commaGatedDirectPrefixes": ["among", "amongst", "between"] +} diff --git a/packages/data/config/legal-role-heads.cs.json b/packages/data/config/legal-role-heads.cs.json index ffab15d9..8c3debe5 100644 --- a/packages/data/config/legal-role-heads.cs.json +++ b/packages/data/config/legal-role-heads.cs.json @@ -28,6 +28,12 @@ "dodavatele", "odběratel", "odběratele", + "plátce", + "příjemce", + "uchazeč", + "uchazeče", + "zadavatel", + "zadavatele", "smluvní", "strana", "strany" diff --git a/packages/data/config/organization-unit-heads.json b/packages/data/config/organization-unit-heads.json new file mode 100644 index 00000000..78e4c8f9 --- /dev/null +++ b/packages/data/config/organization-unit-heads.json @@ -0,0 +1,13 @@ +{ + "_comment": "Administrative or organizational unit nouns that can appear in legal prose without denoting a person or a street/city suffix. Lowercased and organized per language.", + "cs": [ + "agentura", + "inspekce", + "kancelář", + "odbor", + "oddělení", + "sekretariát", + "správa", + "úřad" + ] +} diff --git a/packages/data/config/person-stopwords.json b/packages/data/config/person-stopwords.json index ae496fd5..6fd797b8 100644 --- a/packages/data/config/person-stopwords.json +++ b/packages/data/config/person-stopwords.json @@ -1,5 +1,7 @@ { "_comment": "Words that are valid in other labels (address, org) but should never be classified as person. Checked only in person chain scoring.", + "cs": ["cena"], + "en": ["dodd-frank"], "words": [ "addendum", "agent", diff --git a/packages/data/config/signing-clauses.json b/packages/data/config/signing-clauses.json index e8c31718..c4a72d55 100644 --- a/packages/data/config/signing-clauses.json +++ b/packages/data/config/signing-clauses.json @@ -1,53 +1,69 @@ { - "_comment": "Signing clause patterns. Captures the place name from contract signing locations. 
Each entry: prefix (before place), suffix (after place), prepositions (allowed inside multi-word place names).", + "_comment": "Signing clause patterns. Captures the place name from contract signing locations. Each entry: prefix/suffix build regexes; guardPrefixPhrases/guardSuffixPhrases suppress deny-list place hits in the same signing context.", "patterns": [ { "lang": "cs", "prefix": "(?:V|Ve)\\s+", "suffix": "\\s*,?\\s*dne", - "prepositions": ["nad", "pod", "u", "ve", "na"] + "prepositions": ["nad", "pod", "u", "ve", "na"], + "guardPrefixPhrases": ["v", "ve"], + "guardSuffixPhrases": ["dne"] }, { "lang": "sk", "prefix": "(?:V|Vo)\\s+", "suffix": "\\s*,?\\s*dňa", - "prepositions": ["nad", "pod", "pri"] + "prepositions": ["nad", "pod", "pri"], + "guardPrefixPhrases": ["v", "vo"], + "guardSuffixPhrases": ["dňa"] }, { "lang": "de", "prefix": "", "suffix": "\\s*,\\s*den", - "prepositions": ["am", "an", "im"] + "prepositions": ["am", "an", "im"], + "guardPrefixPhrases": [""], + "guardSuffixPhrases": ["den"] }, { "lang": "fr", "prefix": "(?:Fait\\s+)?[Àà]\\s+", "suffix": "\\s*,?\\s*le", - "prepositions": [] + "prepositions": [], + "guardPrefixPhrases": ["à", "fait à"], + "guardSuffixPhrases": ["le"] }, { "lang": "en", "prefix": "(?:Signed|Executed)\\s+in\\s+", "suffix": "", - "prepositions": [] + "prepositions": [], + "guardPrefixPhrases": ["signed in", "executed in"], + "guardSuffixPhrases": [""] }, { "lang": "pl", "prefix": "(?:W|We)\\s+", "suffix": "\\s*,?\\s*dnia", - "prepositions": ["nad", "pod", "przy"] + "prepositions": ["nad", "pod", "przy"], + "guardPrefixPhrases": ["w", "we"], + "guardSuffixPhrases": ["dnia"] }, { "lang": "it", "prefix": "(?:Fatto\\s+)?[Aa]\\s+", "suffix": "\\s*,?\\s*(?:il|lì)", - "prepositions": [] + "prepositions": [], + "guardPrefixPhrases": ["a", "fatto a"], + "guardSuffixPhrases": ["il", "lì"] }, { "lang": "es", "prefix": "(?:Firmado\\s+)?[Ee]n\\s+", "suffix": "\\s*,?\\s*(?:a|el)", - "prepositions": ["de", "del"] + "prepositions": 
["de", "del"], + "guardPrefixPhrases": ["en", "firmado en"], + "guardSuffixPhrases": ["a", "el"] } ] } diff --git a/packages/data/config/triggers.de.json b/packages/data/config/triggers.de.json index 2de00fe0..2dafb148 100644 --- a/packages/data/config/triggers.de.json +++ b/packages/data/config/triggers.de.json @@ -10,7 +10,7 @@ "label": "date of birth", "strategy": { "type": "n-words", - "count": 1 + "count": 3 }, "triggers": ["geboren am", "geb."], "extensions": ["add-colon"] diff --git a/packages/data/config/triggers.en.json b/packages/data/config/triggers.en.json index e7f5a975..084b0851 100644 --- a/packages/data/config/triggers.en.json +++ b/packages/data/config/triggers.en.json @@ -65,6 +65,17 @@ "strategy": { "type": "company-id-value" }, "triggers": ["VAT number", "VAT ID", "tax identification number", "tax id"] }, + { + "id": "en-bank-account", + "label": "bank account number", + "strategy": { "type": "company-id-value" }, + "triggers": ["account", "account number", "account no.", "account #"], + "validations": [ + { "type": "has-digits" }, + { "type": "min-length", "min": 5 }, + { "type": "max-length", "max": 34 } + ] + }, { "id": "en-uk-companies-house", "label": "registration number", diff --git a/packages/data/dictionaries/index.ts b/packages/data/dictionaries/index.ts index f4f90629..98d8ce91 100644 --- a/packages/data/dictionaries/index.ts +++ b/packages/data/dictionaries/index.ts @@ -4,7 +4,7 @@ * via dynamic imports and cached after first load. 
*/ -type DenyListCategory = +export type DenyListCategory = | "Names" | "Places" | "Addresses" @@ -17,7 +17,7 @@ type DenyListCategory = | "Organizations" | "International"; -type DictionaryMeta = { +export type DictionaryMeta = { label: string; category: DenyListCategory; country: string | null; @@ -886,7 +886,7 @@ export const CITY_DICTIONARY_META: DictionaryMeta = { // ── Name dictionaries (first + surnames by language) ─ -const NAME_LANGUAGES = [ +export const NAME_LANGUAGES = [ "cs", "sk", "de", @@ -902,6 +902,96 @@ const NAME_LANGUAGES = [ export type NameLanguage = (typeof NAME_LANGUAGES)[number]; +export type DictionaryBundle = { + firstNames: Record; + surnames: Record; + denyList: Record; + denyListMeta: Record; + cities: readonly string[]; + citiesByCountry: Record; +}; + +export type LoadDictionaryBundleOptions = { + countries?: readonly string[]; + cityCountries?: readonly string[]; + nameLanguages?: readonly string[]; +}; + +const DEFAULT_CITY_COUNTRIES = [ + "AT", + "AU", + "BE", + "BG", + "BR", + "CA", + "CH", + "CZ", + "DE", + "DK", + "ES", + "FI", + "FR", + "GB", + "GR", + "HR", + "HU", + "IE", + "IT", + "LU", + "NL", + "NO", + "NZ", + "PL", + "PT", + "RO", + "SE", + "SI", + "SK", + "US", +] as const; + +const normalizeCountryCodes = ( + countries: readonly string[] | undefined, +): Set | null => { + if (countries === undefined || countries.length === 0) { + return null; + } + return new Set(countries.map((country) => country.toUpperCase())); +}; + +const isNameLanguage = (language: string): language is NameLanguage => + NAME_LANGUAGES.some((supported) => supported === language); + +const normalizeNameLanguages = ( + languages: readonly string[] | undefined, +): NameLanguage[] => { + if (languages === undefined || languages.length === 0) { + return [...NAME_LANGUAGES]; + } + const result: NameLanguage[] = []; + for (const language of languages) { + const normalized = language.toLowerCase(); + if (isNameLanguage(normalized)) { + 
result.push(normalized); + } + } + return result; +}; + +const dictionaryIdIsInScope = ( + id: DictionaryId, + countries: Set | null, + hasScopedNames: boolean, +): boolean => { + const meta = DICTIONARY_META[id]; + if (hasScopedNames && meta.category === "Names") { + return false; + } + return ( + countries === null || meta.country === null || countries.has(meta.country) + ); +}; + /** * Load first-name and surname dictionaries for the * requested languages. Returns the shape expected by @@ -934,3 +1024,54 @@ export const loadNameDictionaries = async ( return { firstNames, surnames }; }; + +export const loadDictionaryBundle = async ({ + countries, + cityCountries, + nameLanguages, +}: LoadDictionaryBundleOptions = {}): Promise => { + const countryScope = normalizeCountryCodes(countries); + const scopedNameLanguages = normalizeNameLanguages(nameLanguages); + const hasScopedNames = + nameLanguages !== undefined && nameLanguages.length > 0; + const dictionaryIds = ALL_DICTIONARY_IDS.filter((id) => + dictionaryIdIsInScope(id, countryScope, hasScopedNames), + ); + const dictionaryResults = await Promise.all( + dictionaryIds.map(async (id) => ({ + id, + entries: await loadDictionary(id), + })), + ); + const denyList: Record = {}; + const denyListMeta: Record = {}; + for (const { id, entries } of dictionaryResults) { + denyList[id] = entries; + denyListMeta[id] = DICTIONARY_META[id]; + } + + const nameDictionaries = await loadNameDictionaries(scopedNameLanguages); + const cityScope = cityCountries ?? countries ?? 
DEFAULT_CITY_COUNTRIES; + const cityResults = await Promise.all( + cityScope.map(async (country) => ({ + country: country.toUpperCase(), + entries: await loadCityDictionary(country), + })), + ); + const citiesByCountry: Record = {}; + const cities: string[] = []; + for (const { country, entries } of cityResults) { + citiesByCountry[country] = entries; + for (const entry of entries) { + cities.push(entry); + } + } + + return { + ...nameDictionaries, + denyList, + denyListMeta, + cities, + citiesByCountry, + }; +}; From 3782e53b7b3d50e366e8575ec588b8a10ca093ee Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 10:48:08 +0200 Subject: [PATCH 027/130] feat: add core prepared packages --- Cargo.lock | 3 + crates/anonymize-adapter-contract/src/lib.rs | 360 +++++++++++++++--- crates/anonymize-core/Cargo.toml | 3 +- crates/anonymize-core/src/address_seeds.rs | 4 +- crates/anonymize-core/src/dates.rs | 4 +- crates/anonymize-core/src/diagnostics.rs | 1 + crates/anonymize-core/src/legal_forms.rs | 4 +- crates/anonymize-core/src/money.rs | 24 +- crates/anonymize-core/src/prepared.rs | 8 +- crates/anonymize-core/src/processors.rs | 31 +- crates/anonymize-core/src/resolution/types.rs | 4 +- crates/anonymize-core/src/search.rs | 86 ++++- crates/anonymize-core/src/triggers.rs | 8 +- crates/anonymize-napi/src/lib.rs | 132 +++++-- crates/anonymize-py/src/lib.rs | 32 +- .../scripts/migration-fixture-perf.mjs | 3 +- 16 files changed, 579 insertions(+), 128 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 47506a03..ab343050 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -573,6 +573,7 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "stella-aho-corasick-core" version = "1.0.4" +source = "git+https://github.com/stella/aho-corasick?rev=28226295ca5df514cd915e7c26af6fd605348b81#28226295ca5df514cd915e7c26af6fd605348b81" dependencies = [ "daachorse", "unicode-case-mapping", @@ -596,6 +597,7 @@ version = "1.5.0" dependencies 
= [ "fancy-regex 0.16.2", "regex", + "serde", "stella-text-search-core", ] @@ -648,6 +650,7 @@ dependencies = [ [[package]] name = "stella-text-search-core" version = "1.0.6" +source = "git+https://github.com/stella/text-search?rev=8a42c28a8e7c5a32c838ae9dd443c21deab391ed#8a42c28a8e7c5a32c838ae9dd443c21deab391ed" dependencies = [ "stella-aho-corasick-core", "stella-fuzzy-search-core", diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index c5b0015f..6676e7c3 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -21,6 +21,10 @@ const PREPARED_SEARCH_PACKAGE_HEADER: [u8; 8] = *b"ANONPKG1"; const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 3; const PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONPKZ1"; const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 1; +const PREPARED_SEARCH_CORE_PACKAGE_HEADER: [u8; 8] = *b"ANONCPK1"; +const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 2; +const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONCPZ1"; +const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 2; const PREPARED_SEARCH_PACKAGE_DIGEST_BYTES: usize = 32; const PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL: i32 = 3; @@ -415,6 +419,18 @@ pub struct BindingPreparedSearchPackage { pub artifacts: Vec, } +#[derive(Clone, Debug, PartialEq)] +pub struct CorePreparedSearchPackage { + pub config: PreparedSearchConfig, + pub artifacts: Vec, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct CorePreparedSearchPackageView<'a> { + pub config: PreparedSearchConfig, + pub artifacts: Cow<'a, [u8]>, +} + #[derive(Deserialize, Serialize)] struct BinaryPreparedSearchConfig { regex_patterns: Vec, @@ -498,16 +514,11 @@ pub fn prepared_search_package_to_bytes( artifacts: &[u8], ) -> Result> { let payload = prepared_search_package_payload_to_bytes(config, artifacts)?; - let digest = blake3::hash(&payload); - let mut bytes = 
Vec::with_capacity(raw_package_header_len(&payload)); - write_package_header( - &mut bytes, + Ok(prepared_search_package_raw_payload_to_bytes( PREPARED_SEARCH_PACKAGE_HEADER, PREPARED_SEARCH_PACKAGE_VERSION, - digest.as_bytes(), - ); - bytes.extend_from_slice(&payload); - Ok(bytes) + &payload, + )) } pub fn prepared_search_package_to_compressed_bytes( @@ -515,7 +526,47 @@ pub fn prepared_search_package_to_compressed_bytes( artifacts: &[u8], ) -> Result> { let payload = prepared_search_package_payload_to_bytes(config, artifacts)?; - prepared_search_package_compress_payload(&payload) + prepared_search_package_compress_payload( + PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER, + PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION, + &payload, + ) +} + +pub fn prepared_search_core_package_to_bytes( + config: &PreparedSearchConfig, + artifacts: &[u8], +) -> Result> { + let payload = + prepared_search_core_package_payload_to_bytes(config, artifacts)?; + Ok(prepared_search_package_raw_payload_to_bytes( + PREPARED_SEARCH_CORE_PACKAGE_HEADER, + PREPARED_SEARCH_CORE_PACKAGE_VERSION, + &payload, + )) +} + +pub fn prepared_search_core_package_to_compressed_bytes( + config: &PreparedSearchConfig, + artifacts: &[u8], +) -> Result> { + let payload = + prepared_search_core_package_payload_to_bytes(config, artifacts)?; + prepared_search_package_compress_payload( + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER, + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION, + &payload, + ) +} + +#[must_use] +pub fn prepared_search_package_has_core_payload(bytes: &[u8]) -> bool { + bytes + .get(..PREPARED_SEARCH_CORE_PACKAGE_HEADER.len()) + .is_some_and(|header| { + header == PREPARED_SEARCH_CORE_PACKAGE_HEADER + || header == PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER + }) } pub fn prepared_search_package_digest(bytes: &[u8]) -> Result<[u8; 32]> { @@ -526,8 +577,14 @@ pub fn prepared_search_package_from_bytes( bytes: &[u8], ) -> Result { let parts = prepared_search_package_parts(bytes)?; - let payload = 
parts.payload()?; - verify_prepared_search_package_digest(parts.digest(), payload.as_ref())?; + if parts.is_core() { + return Err(invalid_prepared_search_package( + "package does not contain a binding payload", + )); + } + let digest = parts.digest(); + let payload = parts.into_payload()?; + verify_prepared_search_package_digest(digest, payload.as_ref())?; let (package, read) = bincode::serde::decode_from_slice::< BinaryPreparedSearchPackageOwned, _, @@ -542,6 +599,31 @@ pub fn prepared_search_package_from_bytes( }) } +pub fn prepared_search_core_package_from_bytes( + bytes: &[u8], +) -> Result { + let package = prepared_search_core_package_view_from_bytes(bytes)?; + Ok(CorePreparedSearchPackage { + config: package.config, + artifacts: package.artifacts.into_owned(), + }) +} + +pub fn prepared_search_core_package_view_from_bytes( + bytes: &[u8], +) -> Result> { + let parts = prepared_search_package_parts(bytes)?; + if !parts.is_core() { + return Err(invalid_prepared_search_package( + "package does not contain a core payload", + )); + } + let digest = parts.digest(); + let payload = parts.into_payload()?; + verify_prepared_search_package_digest(digest, payload.as_ref())?; + core_package_view_from_payload(payload) +} + impl From for BinaryPreparedSearchConfig { fn from(config: BindingPreparedSearchConfig) -> Self { Self { @@ -750,7 +832,102 @@ fn prepared_search_package_payload_to_bytes( .map_err(|error| invalid_prepared_search_package(error.to_string())) } -fn prepared_search_package_compress_payload(payload: &[u8]) -> Result> { +fn prepared_search_core_package_payload_to_bytes( + config: &PreparedSearchConfig, + artifacts: &[u8], +) -> Result> { + let mut config = config.clone(); + if core_literal_patterns_are_identity_mapped(&config) { + config.literal_patterns.clear(); + } + let config_bytes = + bincode::serde::encode_to_vec(config, package_bincode_config()) + .map_err(|error| invalid_prepared_search_package(error.to_string()))?; + let config_len = 
u64::try_from(config_bytes.len()).map_err(|_| { + invalid_prepared_search_package("core config length overflow") + })?; + let mut bytes = Vec::with_capacity( + std::mem::size_of::() + .saturating_add(config_bytes.len()) + .saturating_add(artifacts.len()), + ); + bytes.extend_from_slice(&config_len.to_le_bytes()); + bytes.extend_from_slice(&config_bytes); + bytes.extend_from_slice(artifacts); + Ok(bytes) +} + +fn core_package_view_from_payload( + payload: Cow<'_, [u8]>, +) -> Result> { + let len_end = std::mem::size_of::(); + let len_bytes = payload.as_ref().get(..len_end).ok_or_else(|| { + invalid_prepared_search_package("truncated config length") + })?; + let len_array = <[u8; 8]>::try_from(len_bytes) + .map_err(|_| invalid_prepared_search_package("malformed config length"))?; + let config_len = usize::try_from(u64::from_le_bytes(len_array)) + .map_err(|_| invalid_prepared_search_package("config length overflow"))?; + let config_end = len_end + .checked_add(config_len) + .ok_or_else(|| invalid_prepared_search_package("config length overflow"))?; + let config_bytes = payload + .as_ref() + .get(len_end..config_end) + .ok_or_else(|| invalid_prepared_search_package("truncated config"))?; + let (config, read) = bincode::serde::decode_from_slice::< + PreparedSearchConfig, + _, + >(config_bytes, package_bincode_config()) + .map_err(|error| invalid_prepared_search_package(error.to_string()))?; + if read != config_bytes.len() { + return Err(invalid_prepared_search_package("trailing config data")); + } + + let artifacts = match payload { + Cow::Borrowed(bytes) => Cow::Borrowed( + bytes + .get(config_end..) + .ok_or_else(|| invalid_prepared_search_package("missing artifacts"))?, + ), + Cow::Owned(bytes) => Cow::Owned( + bytes + .get(config_end..) + .ok_or_else(|| invalid_prepared_search_package("missing artifacts"))? 
+ .to_vec(), + ), + }; + + Ok(CorePreparedSearchPackageView { config, artifacts }) +} + +fn core_literal_patterns_are_identity_mapped( + config: &PreparedSearchConfig, +) -> bool { + !config.literal_patterns.is_empty() + && config + .literal_patterns + .iter() + .all(|pattern| matches!(pattern, SearchPattern::Literal(_))) +} + +fn prepared_search_package_raw_payload_to_bytes( + header: [u8; 8], + version: u32, + payload: &[u8], +) -> Vec { + let digest = blake3::hash(payload); + let mut bytes = Vec::with_capacity(raw_package_header_len(payload)); + write_package_header(&mut bytes, header, version, digest.as_bytes()); + bytes.extend_from_slice(payload); + bytes +} + +fn prepared_search_package_compress_payload( + header: [u8; 8], + version: u32, + payload: &[u8], +) -> Result> { let compressed = zstd::bulk::compress(payload, PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL) .map_err(|error| invalid_prepared_search_package(error.to_string()))?; @@ -759,12 +936,7 @@ fn prepared_search_package_compress_payload(payload: &[u8]) -> Result> { raw_package_header_len(&compressed) .saturating_add(std::mem::size_of::()), ); - write_package_header( - &mut bytes, - PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER, - PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION, - digest.as_bytes(), - ); + write_package_header(&mut bytes, header, version, digest.as_bytes()); let payload_len = u64::try_from(payload.len()) .map_err(|_| invalid_prepared_search_package("payload length overflow"))?; bytes.extend_from_slice(&payload_len.to_le_bytes()); @@ -951,31 +1123,39 @@ pub fn prepared_search_config_from_binding( enum PreparedSearchPackageParts<'a> { Raw { + core: bool, digest: [u8; 32], payload: &'a [u8], }, Compressed { + core: bool, digest: [u8; 32], uncompressed_len: usize, payload: &'a [u8], }, } -impl PreparedSearchPackageParts<'_> { +impl<'a> PreparedSearchPackageParts<'a> { const fn digest(&self) -> [u8; 32] { match self { Self::Raw { digest, .. } | Self::Compressed { digest, .. 
} => *digest, } } - fn payload(&self) -> Result> { + const fn is_core(&self) -> bool { + match self { + Self::Raw { core, .. } | Self::Compressed { core, .. } => *core, + } + } + + fn into_payload(self) -> Result> { match self { Self::Raw { payload, .. } => Ok(Cow::Borrowed(payload)), Self::Compressed { uncompressed_len, payload, .. - } => zstd::bulk::decompress(payload, *uncompressed_len) + } => zstd::bulk::decompress(payload, uncompressed_len) .map(Cow::Owned) .map_err(|error| invalid_prepared_search_package(error.to_string())), } @@ -1000,6 +1180,19 @@ fn prepared_search_package_parts( PREPARED_SEARCH_PACKAGE_HEADER.len(), )?; return Ok(PreparedSearchPackageParts::Raw { + core: false, + digest: raw.digest, + payload: raw.payload, + }); + } + if header == PREPARED_SEARCH_CORE_PACKAGE_HEADER { + let raw = raw_package_header( + bytes, + PREPARED_SEARCH_CORE_PACKAGE_VERSION, + PREPARED_SEARCH_CORE_PACKAGE_HEADER.len(), + )?; + return Ok(PreparedSearchPackageParts::Raw { + core: true, digest: raw.digest, payload: raw.payload, }); @@ -1024,6 +1217,33 @@ fn prepared_search_package_parts( .get(len_end..) 
.ok_or_else(|| invalid_prepared_search_package("missing payload"))?; return Ok(PreparedSearchPackageParts::Compressed { + core: false, + digest: raw.digest, + uncompressed_len, + payload, + }); + } + if header == PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER { + let raw = raw_package_header( + bytes, + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION, + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER.len(), + )?; + let len_end = std::mem::size_of::(); + let len_bytes = raw + .payload + .get(..len_end) + .ok_or_else(|| invalid_prepared_search_package("truncated length"))?; + let len_array = <[u8; 8]>::try_from(len_bytes) + .map_err(|_| invalid_prepared_search_package("malformed length"))?; + let uncompressed_len = usize::try_from(u64::from_le_bytes(len_array)) + .map_err(|_| invalid_prepared_search_package("length overflow"))?; + let payload = raw + .payload + .get(len_end..) + .ok_or_else(|| invalid_prepared_search_package("missing payload"))?; + return Ok(PreparedSearchPackageParts::Compressed { + core: true, digest: raw.digest, uncompressed_len, payload, @@ -1628,6 +1848,7 @@ fn diagnostic_stage_name(stage: DiagnosticStage) -> String { DiagnosticStage::PrepareCacheHit => "prepare.cache.hit", DiagnosticStage::PrepareCacheMiss => "prepare.cache.miss", DiagnosticStage::PrepareBindingParse => "prepare.binding.parse", + DiagnosticStage::PreparePackageDecode => "prepare.package.decode", DiagnosticStage::PrepareBindingConvert => "prepare.binding.convert", DiagnosticStage::PrepareArtifactsDecode => "prepare.artifacts.decode", DiagnosticStage::PrepareTotal => "prepare.total", @@ -1688,26 +1909,18 @@ mod tests { use super::{ BindingPreparedSearchConfig, BindingSearchPattern, ContractError, - prepared_search_package_from_bytes, prepared_search_package_to_bytes, + prepared_search_config_from_binding, + prepared_search_core_package_from_bytes, + prepared_search_core_package_to_bytes, + prepared_search_core_package_to_compressed_bytes, + prepared_search_package_from_bytes, + 
prepared_search_package_has_core_payload, prepared_search_package_to_bytes, prepared_search_package_to_compressed_bytes, }; #[test] fn prepared_search_package_roundtrips_config_and_artifacts() { - let config = BindingPreparedSearchConfig { - literal_patterns: vec![BindingSearchPattern { - kind: String::from("literal"), - pattern: String::from("Acme"), - distance: None, - case_insensitive: None, - whole_words: None, - lazy: None, - prefilter_any: None, - prefilter_case_insensitive: None, - prefilter_regex: None, - }], - ..BindingPreparedSearchConfig::default() - }; + let config = package_test_config(); let artifacts = b"prepared-artifacts"; let bytes = prepared_search_package_to_bytes(&config, artifacts).unwrap(); @@ -1745,20 +1958,7 @@ mod tests { #[test] fn prepared_search_compressed_package_roundtrips_config_and_artifacts() { - let config = BindingPreparedSearchConfig { - literal_patterns: vec![BindingSearchPattern { - kind: String::from("literal"), - pattern: String::from("Acme"), - distance: None, - case_insensitive: None, - whole_words: None, - lazy: None, - prefilter_any: None, - prefilter_case_insensitive: None, - prefilter_regex: None, - }], - ..BindingPreparedSearchConfig::default() - }; + let config = package_test_config(); let artifacts = b"prepared-artifacts"; let bytes = @@ -1785,4 +1985,64 @@ mod tests { "corrupted compressed package should fail digest verification" ); } + + #[test] + fn prepared_search_core_package_roundtrips_config_and_artifacts() { + let config = + prepared_search_config_from_binding(package_test_config()).unwrap(); + let mut compact_config = config.clone(); + compact_config.literal_patterns.clear(); + let artifacts = b"prepared-artifacts"; + + let bytes = + prepared_search_core_package_to_bytes(&config, artifacts).unwrap(); + let package = prepared_search_core_package_from_bytes(&bytes).unwrap(); + let binding_error = prepared_search_package_from_bytes(&bytes).unwrap_err(); + + 
assert!(prepared_search_package_has_core_payload(&bytes)); + assert_eq!(package.config, compact_config); + assert_eq!(package.artifacts, artifacts); + assert!( + matches!( + binding_error, + ContractError::InvalidPreparedSearchPackage { .. } + ), + "binding package loader should reject core payloads" + ); + } + + #[test] + fn prepared_search_core_compressed_package_roundtrips_config_and_artifacts() { + let config = + prepared_search_config_from_binding(package_test_config()).unwrap(); + let mut compact_config = config.clone(); + compact_config.literal_patterns.clear(); + let artifacts = b"prepared-artifacts"; + + let bytes = + prepared_search_core_package_to_compressed_bytes(&config, artifacts) + .unwrap(); + let package = prepared_search_core_package_from_bytes(&bytes).unwrap(); + + assert!(prepared_search_package_has_core_payload(&bytes)); + assert_eq!(package.config, compact_config); + assert_eq!(package.artifacts, artifacts); + } + + fn package_test_config() -> BindingPreparedSearchConfig { + BindingPreparedSearchConfig { + literal_patterns: vec![BindingSearchPattern { + kind: String::from("literal"), + pattern: String::from("Acme"), + distance: None, + case_insensitive: None, + whole_words: None, + lazy: None, + prefilter_any: None, + prefilter_case_insensitive: None, + prefilter_regex: None, + }], + ..BindingPreparedSearchConfig::default() + } + } } diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index e1bdbde2..95073902 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -12,7 +12,8 @@ categories = ["text-processing"] [dependencies] fancy-regex = "0.16" regex = "1" -stella-text-search-core = { path = "/Users/sok0/coding_projects/text-search-rust-core/crates/core" } +serde = { version = "1", features = ["derive"] } +stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "8a42c28a8e7c5a32c838ae9dd443c21deab391ed" } [lints] workspace = true diff 
--git a/crates/anonymize-core/src/address_seeds.rs b/crates/anonymize-core/src/address_seeds.rs index 00ab348b..e9cfc5a0 100644 --- a/crates/anonymize-core/src/address_seeds.rs +++ b/crates/anonymize-core/src/address_seeds.rs @@ -13,7 +13,9 @@ const BR_CEP_CONTEXT_WINDOW: usize = 200; const PLAIN_POSTAL_CONTEXT_WINDOW: usize = 120; const US_ZIP_CONTEXT_WINDOW: usize = 120; -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct AddressSeedData { pub boundary_words: Vec, pub br_cep_cue_words: Vec, diff --git a/crates/anonymize-core/src/dates.rs b/crates/anonymize-core/src/dates.rs index cf3126e0..7f02d9ee 100644 --- a/crates/anonymize-core/src/dates.rs +++ b/crates/anonymize-core/src/dates.rs @@ -9,7 +9,9 @@ use crate::types::Result; const DATE_LABEL: &str = "date"; const DATE_SCORE: f64 = 1.0; -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct DateData { pub month_names_by_language: BTreeMap>, pub year_words_by_language: BTreeMap>, diff --git a/crates/anonymize-core/src/diagnostics.rs b/crates/anonymize-core/src/diagnostics.rs index 14ff55cc..a6e8cc78 100644 --- a/crates/anonymize-core/src/diagnostics.rs +++ b/crates/anonymize-core/src/diagnostics.rs @@ -7,6 +7,7 @@ pub enum DiagnosticStage { PrepareCacheHit, PrepareCacheMiss, PrepareBindingParse, + PreparePackageDecode, PrepareBindingConvert, PrepareArtifactsDecode, PrepareTotal, diff --git a/crates/anonymize-core/src/legal_forms.rs b/crates/anonymize-core/src/legal_forms.rs index ff169947..2b3cf66e 100644 --- a/crates/anonymize-core/src/legal_forms.rs +++ b/crates/anonymize-core/src/legal_forms.rs @@ -10,7 +10,9 @@ const HEAD_TOKEN_CAP: usize = 20; const MAX_LOWER_BRIDGE: usize = 4; const MAX_NAME_LOOKBACK: usize = 32; -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, 
serde::Deserialize, serde::Serialize, +)] pub struct LegalFormData { pub suffixes: Vec, pub normalized_boundary_suffixes: Vec, diff --git a/crates/anonymize-core/src/money.rs b/crates/anonymize-core/src/money.rs index 559937fe..985a74a9 100644 --- a/crates/anonymize-core/src/money.rs +++ b/crates/anonymize-core/src/money.rs @@ -10,39 +10,51 @@ const MONEY_LABEL: &str = "monetary amount"; const MONEY_SCORE: f64 = 0.9; const MAX_LEFT_SCAN_BYTES: usize = 96; -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct MonetaryData { pub currencies: CurrencyData, pub amount_words: AmountWordsData, } -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct CurrencyData { pub codes: Vec, pub symbols: Vec, pub local_names: Vec, } -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct AmountWordsData { pub written_amount_patterns: Vec, pub magnitude_suffixes: Vec, pub share_quantity_terms: Vec, } -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct WrittenAmountPatternData { pub keywords: Vec, } -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct MagnitudeSuffixData { pub words: Vec, pub abbreviations_case_insensitive: Vec, pub abbreviations_case_sensitive: Vec, } -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct ShareQuantityTermData { pub modifiers: Vec, pub nouns: Vec, diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 7a1a5e00..a252892f 100644 
--- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -55,7 +55,9 @@ pub struct PreparedSearch { monetary_data: Option, } -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct PreparedSearchSlices { pub regex: PatternSlice, pub custom_regex: PatternSlice, @@ -67,7 +69,7 @@ pub struct PreparedSearchSlices { pub countries: PatternSlice, } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Deserialize, serde::Serialize)] pub struct PreparedSearchConfig { pub regex_patterns: Vec, pub custom_regex_patterns: Vec, @@ -331,7 +333,6 @@ impl PreparedSearch { ); let legal_form_len = regex_groups.legal_forms.len(); let trigger_len = regex_groups.triggers.len(); - let literal_len = config.literal_patterns.len(); let (date_data, monetary_data) = prepare_anchored_data( config.date_data.as_ref(), @@ -368,6 +369,7 @@ impl PreparedSearch { indexes.triggers, indexes.literals, ); + let literal_len = literals.len(); record_search_index_prepare_stages( &mut diagnostics, &SearchIndexPrepareMetrics { diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index 073a34f3..01f61cea 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -19,7 +19,16 @@ const TITLE_SOURCE: &str = "title"; const PERSON_LABEL: &str = "person"; const ADDRESS_LABEL: &str = "address"; -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, + Copy, + Debug, + Default, + Eq, + PartialEq, + serde::Deserialize, + serde::Serialize, +)] pub struct PatternSlice { pub start: u32, pub end: u32, @@ -49,7 +58,7 @@ impl PatternSlice { } } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, serde::Deserialize, serde::Serialize)] pub struct RegexMatchMeta { pub label: String, pub score: f64, @@ -75,18 +84,18 @@ impl RegexMatchMeta { } } 
-#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] pub struct GazetteerMatchData { pub labels: Vec, pub is_fuzzy: Vec, } -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] pub struct CountryMatchData { pub labels: Vec, } -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] pub struct DenyListMatchData { pub labels: StringGroups, pub custom_labels: StringGroups, @@ -95,7 +104,9 @@ pub struct DenyListMatchData { pub filters: Option, } -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct StringGroups { table: Vec, groups: Vec>, @@ -232,7 +243,9 @@ fn string_table_index( index } -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct DenyListFilterData { pub stopwords: BTreeSet, pub allow_list: BTreeSet, @@ -249,7 +262,9 @@ pub struct DenyListFilterData { pub signing_place_guards: Vec, } -#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct SigningPlaceGuardData { pub prefix_phrases: BTreeSet, pub suffix_phrases: BTreeSet, diff --git a/crates/anonymize-core/src/resolution/types.rs b/crates/anonymize-core/src/resolution/types.rs index 606b9683..903d33e9 100644 --- a/crates/anonymize-core/src/resolution/types.rs +++ b/crates/anonymize-core/src/resolution/types.rs @@ -24,7 +24,9 @@ impl DetectionSource { } } -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive( + Clone, Copy, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub enum SourceDetail { CustomDenyList, CustomRegex, diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs index 
28a8acff..6fa87ea0 100644 --- a/crates/anonymize-core/src/search.rs +++ b/crates/anonymize-core/src/search.rs @@ -6,7 +6,7 @@ use crate::types::{Error, Result, SearchEngine, SearchMatch}; const SEARCH_INDEX_ARTIFACTS_HEADER: [u8; 8] = *b"ANONIDX1"; const SEARCH_INDEX_ARTIFACTS_VERSION: u32 = 1; -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] pub enum SearchPattern { Literal(String), LiteralWithOptions { @@ -28,25 +28,56 @@ pub enum SearchPattern { }, } -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, + Copy, + Debug, + Default, + Eq, + PartialEq, + serde::Deserialize, + serde::Serialize, +)] pub struct SearchOptions { pub literal: LiteralSearchOptions, pub regex: RegexSearchOptions, pub fuzzy: FuzzySearchOptions, } -#[derive(Clone, Copy, Debug, Default, Eq, Ord, PartialEq, PartialOrd)] +#[derive( + Clone, + Copy, + Debug, + Default, + Eq, + Ord, + PartialEq, + PartialOrd, + serde::Deserialize, + serde::Serialize, +)] pub struct LiteralSearchOptions { pub case_insensitive: bool, pub whole_words: bool, } -#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +#[derive( + Clone, + Copy, + Debug, + Default, + Eq, + PartialEq, + serde::Deserialize, + serde::Serialize, +)] pub struct RegexSearchOptions { pub whole_words: bool, } -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive( + Clone, Copy, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] pub struct FuzzySearchOptions { pub case_insensitive: bool, pub whole_words: bool, @@ -201,6 +232,10 @@ impl SearchIndex { options: SearchOptions, artifacts: &SearchIndexArtifacts, ) -> Result { + if patterns.is_empty() && !artifacts.slots.is_empty() { + return Self::new_all_literal_with_artifacts(options, artifacts); + } + let parts = partition_patterns(patterns)?; let mut cursor = SearchIndexArtifactCursor::new(&artifacts.slots); let search = build_search_index(parts, options, Some(&mut cursor))?; @@ -208,6 +243,30 
@@ impl SearchIndex { Ok(search) } + fn new_all_literal_with_artifacts( + options: SearchOptions, + artifacts: &SearchIndexArtifacts, + ) -> Result { + let mut cursor = SearchIndexArtifactCursor::new(&artifacts.slots); + let slot_artifacts = cursor.next()?; + let search = text_search::TextSearch::with_prepared_all_literal_artifacts( + literal_options(options.literal), + slot_artifacts, + ) + .map_err(|error| search_error(&error))?; + cursor.finish()?; + let pattern_indexes = (0..search.len()) + .map(pattern_index) + .collect::>>()?; + Ok(Self { + slots: vec![SearchSlot { + engine: SlotEngine::Literal, + search, + pattern_indexes, + }], + }) + } + pub fn find_iter(&self, haystack: &str) -> Result> { let mut matches = Vec::new(); for slot in &self.slots { @@ -261,6 +320,23 @@ impl SearchIndex { Ok(false) } + + #[must_use] + pub fn len(&self) -> usize { + self + .slots + .iter() + .map(|slot| slot.pattern_indexes.len()) + .fold(0usize, usize::saturating_add) + } + + #[must_use] + pub fn is_empty(&self) -> bool { + self + .slots + .iter() + .all(|slot| slot.pattern_indexes.is_empty()) + } } fn partition_patterns( diff --git a/crates/anonymize-core/src/triggers.rs b/crates/anonymize-core/src/triggers.rs index ca3151a1..e9b84aaf 100644 --- a/crates/anonymize-core/src/triggers.rs +++ b/crates/anonymize-core/src/triggers.rs @@ -15,7 +15,7 @@ const TRIGGER_LOOKAHEAD_MARGIN: usize = 128; const LINE_TRIGGER_LOOKAHEAD: usize = 2_048; const MATCH_PATTERN_LOOKAHEAD: usize = 512; -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] pub struct TriggerData { pub rules: Vec, pub address_stop_keywords: Vec, @@ -23,7 +23,7 @@ pub struct TriggerData { pub legal_form_suffixes: Vec, } -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] pub struct TriggerRule { pub trigger: String, pub label: String, @@ -32,7 +32,7 @@ pub struct TriggerRule { pub 
include_trigger: bool, } -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] pub enum TriggerStrategy { ToNextComma { stop_words: Vec, @@ -52,7 +52,7 @@ pub enum TriggerStrategy { }, } -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] pub enum TriggerValidation { StartsUppercase, MinLength(u32), diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index b2efe6c8..1387e4bc 100644 --- a/crates/anonymize-napi/src/lib.rs +++ b/crates/anonymize-napi/src/lib.rs @@ -10,15 +10,16 @@ use stella_anonymize_adapter_contract::{ BindingOperatorConfig, BindingOperatorEntry, BindingPreparedSearchConfig, BindingRedactionResult, BindingStaticRedactionResult, ContractError, operator_config_from_binding, prepared_search_config_from_binding, - prepared_search_package_digest, prepared_search_package_from_bytes, - prepared_search_package_to_bytes, - prepared_search_package_to_compressed_bytes, + prepared_search_core_package_to_bytes, + prepared_search_core_package_to_compressed_bytes, + prepared_search_core_package_view_from_bytes, prepared_search_package_digest, + prepared_search_package_from_bytes, prepared_search_package_has_core_payload, static_redaction_diagnostic_result_to_binding, static_redaction_diagnostics_to_binding, static_redaction_result_to_binding, }; use stella_anonymize_core::{ DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, PreparedSearch, - PreparedSearchArtifacts, StaticRedactionDiagnostics, + PreparedSearchArtifacts, PreparedSearchConfig, StaticRedactionDiagnostics, }; const PREPARED_SEARCH_CACHE_LIMIT: usize = 8; @@ -331,15 +332,15 @@ fn prepare_static_search_package_bytes_with( let binding_config = serde_json::from_slice::(config_json) .map_err(|error| to_napi_serde_error(&error))?; - let core_config = prepared_search_config_from_binding(binding_config.clone()) + let core_config = 
prepared_search_config_from_binding(binding_config) .map_err(|error| to_napi_contract_error(&error))?; - let artifacts = PreparedSearch::prepare_artifacts(core_config) + let artifacts = PreparedSearch::prepare_artifacts(core_config.clone()) .and_then(|artifacts| artifacts.to_bytes()) .map_err(|error| to_napi_core_error(&error))?; let package = if compressed { - prepared_search_package_to_compressed_bytes(&binding_config, &artifacts) + prepared_search_core_package_to_compressed_bytes(&core_config, &artifacts) } else { - prepared_search_package_to_bytes(&binding_config, &artifacts) + prepared_search_core_package_to_bytes(&core_config, &artifacts) }; package .map(Buffer::from) @@ -352,6 +353,15 @@ pub struct NativePreparedSearch { prepare_diagnostics: StaticRedactionDiagnostics, } +#[derive(Clone, Copy)] +struct PrepareContext { + input_bytes_len: usize, + cache_key: [u8; 32], + cache_elapsed: u64, + parse_elapsed: u64, + parse_stage: DiagnosticStage, +} + #[napi] impl NativePreparedSearch { #[napi(constructor)] @@ -415,10 +425,13 @@ impl NativePreparedSearch { Self::from_binding_config( config, artifact_bytes, - input_bytes_len, - cache_key, - cache_elapsed, - parse_elapsed, + PrepareContext { + input_bytes_len, + cache_key, + cache_elapsed, + parse_elapsed, + parse_stage: DiagnosticStage::PrepareBindingParse, + }, ) } @@ -442,36 +455,67 @@ impl NativePreparedSearch { let cache_elapsed = elapsed_us(cache_start); let parse_start = Instant::now(); + if prepared_search_package_has_core_payload(package_bytes) { + let package = prepared_search_core_package_view_from_bytes(package_bytes) + .map_err(|error| to_napi_contract_error(&error))?; + let parse_elapsed = elapsed_us(parse_start); + let config = package.config; + return Self::from_core_config( + config, + Some(package.artifacts.as_ref()), + PrepareContext { + input_bytes_len, + cache_key, + cache_elapsed, + parse_elapsed, + parse_stage: DiagnosticStage::PreparePackageDecode, + }, + None, + ); + } + let package = 
prepared_search_package_from_bytes(package_bytes) .map_err(|error| to_napi_contract_error(&error))?; let parse_elapsed = elapsed_us(parse_start); + let config = package.config; + let artifacts = package.artifacts; Self::from_binding_config( - package.config, - Some(&package.artifacts), - input_bytes_len, - cache_key, - cache_elapsed, - parse_elapsed, + config, + Some(&artifacts), + PrepareContext { + input_bytes_len, + cache_key, + cache_elapsed, + parse_elapsed, + parse_stage: DiagnosticStage::PreparePackageDecode, + }, ) } fn from_binding_config( config: BindingPreparedSearchConfig, artifact_bytes: Option<&[u8]>, - input_bytes_len: usize, - cache_key: [u8; 32], - cache_elapsed: u64, - parse_elapsed: u64, + context: PrepareContext, ) -> Result { let convert_start = Instant::now(); let config = prepared_search_config_from_binding(config) .map_err(|error| to_napi_contract_error(&error))?; - let pattern_count = config - .regex_patterns - .len() - .saturating_add(config.custom_regex_patterns.len()) - .saturating_add(config.literal_patterns.len()); + let pattern_count = prepared_search_pattern_count(&config); let convert_elapsed = elapsed_us(convert_start); + Self::from_core_config( + config, + artifact_bytes, + context, + Some((pattern_count, convert_elapsed)), + ) + } + + fn from_core_config( + config: PreparedSearchConfig, + artifact_bytes: Option<&[u8]>, + context: PrepareContext, + binding_convert: Option<(usize, u64)>, + ) -> Result { let artifact_decode_start = Instant::now(); let artifacts = artifact_bytes .map(PreparedSearchArtifacts::from_bytes) @@ -491,23 +535,25 @@ impl NativePreparedSearch { stage_event( DiagnosticStage::PrepareCacheMiss, Some(0), - Some(cache_elapsed), - Some(input_bytes_len), - ), - stage_event( - DiagnosticStage::PrepareBindingParse, - None, - Some(parse_elapsed), - Some(input_bytes_len), + Some(context.cache_elapsed), + Some(context.input_bytes_len), ), stage_event( - DiagnosticStage::PrepareBindingConvert, - Some(pattern_count), - 
Some(convert_elapsed), + context.parse_stage, None, + Some(context.parse_elapsed), + Some(context.input_bytes_len), ), ], }; + if let Some((pattern_count, convert_elapsed)) = binding_convert { + diagnostics.events.push(stage_event( + DiagnosticStage::PrepareBindingConvert, + Some(pattern_count), + Some(convert_elapsed), + None, + )); + } if let (Some(elapsed), Some(bytes)) = (artifact_decode_elapsed, artifact_bytes.map(<[u8]>::len)) { @@ -519,7 +565,7 @@ impl NativePreparedSearch { )); } diagnostics.extend(result.diagnostics); - prepared_search_cache_insert(cache_key, Arc::clone(&inner)); + prepared_search_cache_insert(context.cache_key, Arc::clone(&inner)); Ok(Self { inner, prepare_diagnostics: diagnostics, @@ -573,6 +619,14 @@ impl NativePreparedSearch { } } +const fn prepared_search_pattern_count(config: &PreparedSearchConfig) -> usize { + config + .regex_patterns + .len() + .saturating_add(config.custom_regex_patterns.len()) + .saturating_add(config.literal_patterns.len()) +} + fn prepared_search_cache_get(key: &[u8; 32]) -> Option> { with_prepared_search_cache(|cache| cache.get(key)) } diff --git a/crates/anonymize-py/src/lib.rs b/crates/anonymize-py/src/lib.rs index e0c8ad81..e967afc4 100644 --- a/crates/anonymize-py/src/lib.rs +++ b/crates/anonymize-py/src/lib.rs @@ -5,9 +5,10 @@ use stella_anonymize_adapter_contract::{ BindingOperatorConfig, BindingOperatorEntry, BindingPipelineEntity, BindingPreparedSearchConfig, BindingRedactionEntry, BindingRedactionResult, BindingStaticRedactionResult, ContractError, operator_config_from_binding, - prepared_search_config_from_binding, prepared_search_package_from_bytes, - prepared_search_package_to_bytes, - prepared_search_package_to_compressed_bytes, + prepared_search_config_from_binding, prepared_search_core_package_to_bytes, + prepared_search_core_package_to_compressed_bytes, + prepared_search_core_package_view_from_bytes, + prepared_search_package_from_bytes, prepared_search_package_has_core_payload, 
static_redaction_diagnostic_result_to_binding, static_redaction_diagnostics_to_binding, static_redaction_result_to_binding, }; @@ -96,6 +97,23 @@ impl PyPreparedSearch { #[staticmethod] fn from_prepared_package_bytes(package_bytes: &[u8]) -> PyResult { + if prepared_search_package_has_core_payload(package_bytes) { + let package = prepared_search_core_package_view_from_bytes(package_bytes) + .map_err(|error| to_py_contract_error(&error))?; + let artifacts = + PreparedSearchArtifacts::from_bytes(package.artifacts.as_ref()) + .map_err(|error| to_py_core_error(&error))?; + let result = CorePreparedSearch::new_with_artifacts_diagnostics( + package.config, + &artifacts, + ) + .map_err(|error| to_py_core_error(&error))?; + return Ok(Self { + inner: result.prepared, + prepare_diagnostics: result.diagnostics, + }); + } + let package = prepared_search_package_from_bytes(package_bytes) .map_err(|error| to_py_contract_error(&error))?; let config = prepared_search_config_from_binding(package.config) @@ -214,15 +232,15 @@ fn prepare_static_search_package_bytes_with<'py>( compressed: bool, ) -> PyResult> { let binding_config = parse_prepared_search_config(config_json)?; - let core_config = prepared_search_config_from_binding(binding_config.clone()) + let core_config = prepared_search_config_from_binding(binding_config) .map_err(|error| to_py_contract_error(&error))?; - let artifacts = CorePreparedSearch::prepare_artifacts(core_config) + let artifacts = CorePreparedSearch::prepare_artifacts(core_config.clone()) .and_then(|artifacts| artifacts.to_bytes()) .map_err(|error| to_py_core_error(&error))?; let package = if compressed { - prepared_search_package_to_compressed_bytes(&binding_config, &artifacts) + prepared_search_core_package_to_compressed_bytes(&core_config, &artifacts) } else { - prepared_search_package_to_bytes(&binding_config, &artifacts) + prepared_search_core_package_to_bytes(&core_config, &artifacts) }; let bytes = package.map_err(|error| 
to_py_contract_error(&error))?; Ok(PyBytes::new(py, &bytes)) diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 30566c7d..4af160f7 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -1966,7 +1966,8 @@ function prepareNativePackageBytes(native, configBytes) { } function isCompressedNativePackage(packageBytes) { - return packageBytes.subarray(0, 8).toString("ascii") === "ANONPKZ1"; + const header = packageBytes.subarray(0, 8).toString("ascii"); + return header === "ANONPKZ1" || header === "ANONCPZ1"; } function nativeLibraryPath(name) { From 061bbeaf7103470cbdfdb3b2390c5cb09e639af0 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 10:52:53 +0200 Subject: [PATCH 028/130] chore: update prepared search core pin --- Cargo.lock | 4 ++-- crates/anonymize-core/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab343050..b8b123b0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -573,7 +573,7 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "stella-aho-corasick-core" version = "1.0.4" -source = "git+https://github.com/stella/aho-corasick?rev=28226295ca5df514cd915e7c26af6fd605348b81#28226295ca5df514cd915e7c26af6fd605348b81" +source = "git+https://github.com/stella/aho-corasick?rev=38bdcbf11bfbe389c8f2b7b40eb03ac50371e1e1#38bdcbf11bfbe389c8f2b7b40eb03ac50371e1e1" dependencies = [ "daachorse", "unicode-case-mapping", @@ -650,7 +650,7 @@ dependencies = [ [[package]] name = "stella-text-search-core" version = "1.0.6" -source = "git+https://github.com/stella/text-search?rev=8a42c28a8e7c5a32c838ae9dd443c21deab391ed#8a42c28a8e7c5a32c838ae9dd443c21deab391ed" +source = "git+https://github.com/stella/text-search?rev=a5d6e11f5c832be50cba42882d2844394adb9403#a5d6e11f5c832be50cba42882d2844394adb9403" dependencies = [ 
"stella-aho-corasick-core", "stella-fuzzy-search-core", diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index 95073902..e3af912c 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -13,7 +13,7 @@ categories = ["text-processing"] fancy-regex = "0.16" regex = "1" serde = { version = "1", features = ["derive"] } -stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "8a42c28a8e7c5a32c838ae9dd443c21deab391ed" } +stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "a5d6e11f5c832be50cba42882d2844394adb9403" } [lints] workspace = true From ddf93b5d97164062ffcc092560b2738410327f5e Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 10:56:56 +0200 Subject: [PATCH 029/130] fix: satisfy fixture perf lint --- .../scripts/migration-fixture-perf.mjs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 4af160f7..8d5ebce0 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -281,14 +281,17 @@ async function runWorker() { ); } } - const nativeRewrite = usePrebuiltNativePackage - ? describeNativeRewriteFromNativePackage(runtime) - : usePrebuiltNativeConfig && search.nativeStaticConfig - ? 
describeNativeRewriteFromNativeConfig( - search.nativeStaticConfig, - runtime, - ) - : describeNativeRewrite(config, search, runtime); + let nativeRewrite; + if (usePrebuiltNativePackage) { + nativeRewrite = describeNativeRewriteFromNativePackage(runtime); + } else if (usePrebuiltNativeConfig && search.nativeStaticConfig) { + nativeRewrite = describeNativeRewriteFromNativeConfig( + search.nativeStaticConfig, + runtime, + ); + } else { + nativeRewrite = describeNativeRewrite(config, search, runtime); + } let runtimeRunner = null; if (runtime === "native-static" && nativePackageBuffer !== null) { From aba881d780f1795a9ac8d49ba56ad1a1a907764b Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 11:06:18 +0200 Subject: [PATCH 030/130] chore: update dependencies --- Cargo.lock | 15 +-- bun.lock | 150 ++++++++++++++------------- crates/anonymize-core/Cargo.toml | 2 +- package.json | 2 +- packages/anonymize/package.json | 6 +- packages/anonymize/wasm/package.json | 2 +- packages/cli/package.json | 4 +- packages/corpus/package.json | 2 +- packages/data/package.json | 2 +- 9 files changed, 90 insertions(+), 95 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b8b123b0..12cb1397 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -132,17 +132,6 @@ version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99251f238b74cd219a86fe6ea9328308ebb223fcbb5b8eb5aa400b847a41dded" -[[package]] -name = "fancy-regex" -version = "0.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f" -dependencies = [ - "bit-set", - "regex-automata", - "regex-syntax", -] - [[package]] name = "fancy-regex" version = "0.18.0" @@ -595,7 +584,7 @@ dependencies = [ name = "stella-anonymize-core" version = "1.5.0" dependencies = [ - "fancy-regex 0.16.2", + "fancy-regex", "regex", "serde", "stella-text-search-core", @@ -640,7 +629,7 @@ name = "stella-regex-set-core" 
version = "1.0.5" source = "git+https://github.com/stella/regex-set?rev=8b80241a5a54cef8fdc6b6b34119981db0c6f597#8b80241a5a54cef8fdc6b6b34119981db0c6f597" dependencies = [ - "fancy-regex 0.18.0", + "fancy-regex", "regex", "regex-automata", "regex-syntax", diff --git a/bun.lock b/bun.lock index 89a3f51a..1f9a749e 100644 --- a/bun.lock +++ b/bun.lock @@ -10,7 +10,7 @@ "@stll/typescript-config": "^0.3.0", "lefthook": "^2.1.9", "oxfmt": "^0.54.0", - "oxlint": "^1.69.0", + "oxlint": "^1.70.0", "oxlint-tsgolint": "^0.23.0", "turbo": "^2.9.18", }, @@ -21,14 +21,14 @@ "dependencies": { "@huggingface/tokenizers": "^0.1.3", "@stll/stdnum": "^2.1.1", - "@stll/text-search": "^1.0.6", + "@stll/text-search": "^1.0.7", }, "devDependencies": { "@stll/anonymize-data": "workspace:*", - "@stll/text-search-wasm": "^1.0.6", + "@stll/text-search-wasm": "^1.0.7", "bun-types": "^1.3.14", "fast-check": "^4.8.0", - "tsdown": "^0.22.2", + "tsdown": "^0.22.3", "typescript": "^6.0.3", "vite": "^8.0.16", }, @@ -45,7 +45,7 @@ "dependencies": { "@huggingface/tokenizers": "^0.1.3", "@stll/stdnum": "^2.1.1", - "@stll/text-search-wasm": "^1.0.5", + "@stll/text-search-wasm": "^1.0.7", }, "peerDependencies": { "@stll/anonymize-data": "^0.0.6", @@ -68,9 +68,9 @@ }, "devDependencies": { "@stll/anonymize-wasm": "workspace:*", - "@types/node": "^25.9.3", + "@types/node": "^25.9.4", "bun-types": "^1.3.14", - "tsdown": "^0.22.2", + "tsdown": "^0.22.3", "typescript": "^6.0.3", }, }, @@ -82,7 +82,7 @@ "@stll/anonymize-data": "^0.0.6", }, "devDependencies": { - "@types/node": "^25.9.3", + "@types/node": "^25.9.4", "bun-types": "^1.3.14", "typescript": "^6.0.3", }, @@ -92,21 +92,21 @@ "version": "0.0.6", "devDependencies": { "stopwords-iso": "1.1.0", - "tsdown": "^0.22.2", + "tsdown": "^0.22.3", "typescript": "^6.0.3", }, }, }, "packages": { - "@babel/generator": ["@babel/generator@8.0.0-rc.6", "", { "dependencies": { "@babel/parser": "^8.0.0-rc.6", "@babel/types": "^8.0.0-rc.6", "@jridgewell/gen-mapping": 
"^0.3.12", "@jridgewell/trace-mapping": "^0.3.28", "@types/jsesc": "^2.5.0", "jsesc": "^3.0.2" } }, "sha512-6mIzgVK8DgEzvIapoQwhXTMnnkuE4STQmVv9H03i/tZ2ml8oev3TRvZJgTenK2Bsq0YWNtzOrFdTyNzCMFtjJQ=="], + "@babel/generator": ["@babel/generator@8.0.0", "", { "dependencies": { "@babel/parser": "^8.0.0", "@babel/types": "^8.0.0", "@jridgewell/gen-mapping": "^0.3.12", "@jridgewell/trace-mapping": "^0.3.28", "@types/jsesc": "^2.5.0", "jsesc": "^3.0.2" } }, "sha512-NT9NrVwJsbSV6Y2FSstWa71EETOnzrjkL5/wX3D2mYHtKM+qvqB1DvR4D0Setb/gDBsHzRICifwEWMO8CnTF6g=="], - "@babel/helper-string-parser": ["@babel/helper-string-parser@8.0.0-rc.6", "", {}, "sha512-BCkFy+zN6kXQed3YOT7aJl93NfDSzQc3pBfsvTVPs9gU9X3V0aefEF5kwBT0E+mDWH9QgKaZstYUQN9VdQZT4g=="], + "@babel/helper-string-parser": ["@babel/helper-string-parser@8.0.0", "", {}, "sha512-6mJgmFFFIIO82vvoLt9XtRC7/TkzXfts1t/SpRX4IHSzMgqoPYCWesVu1udUPUWioAE/2fcG6WuI8zrkE1gwrg=="], - "@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@8.0.0-rc.6", "", {}, "sha512-nVJ+1JcCgntv8d78rRo++o2wuODT0Irknx2BF8Np4Ft2CRgjLqIs4qzSZ8b66yGbBdMWGmZBO9WEZv1hhNiSpg=="], + "@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@8.0.2", "", {}, "sha512-9Fr9QeyCAyi1BR1jKZ6uYQ24EIhQUx5ReHfQU7drOE+TPOb+w11/dsqLkMOT2U29OdCT71XajrOT8xDc1C7orA=="], - "@babel/parser": ["@babel/parser@8.0.0-rc.6", "", { "dependencies": { "@babel/types": "^8.0.0-rc.6" }, "bin": "./bin/babel-parser.js" }, "sha512-rOS8IpdO7mQELkTPlCsTgPejO0bFuZdEDCGQJouYbYf9e1FLTym7Fei2pEjq8q7MWbX0ravcd7QQYKs1TxOuog=="], + "@babel/parser": ["@babel/parser@8.0.0", "", { "dependencies": { "@babel/types": "^8.0.0" }, "bin": "./bin/babel-parser.js" }, "sha512-aLxAE+imI9bCcyaPrUDjBv3uSkWieifjLe0kuFOZF0zli0L6GCsTmsePnTr55adbIAgYz2zhN1vnFimCBUYcRQ=="], - "@babel/types": ["@babel/types@8.0.0-rc.6", "", { "dependencies": { "@babel/helper-string-parser": "^8.0.0-rc.6", "@babel/helper-validator-identifier": "^8.0.0-rc.6" } }, 
"sha512-p7/ABylAYlexb31wtRdIfH9L9A0Z2T/9H6zAqzqndkY2PLkvNNc580wGhp/gGKN4Sp9sQvSkhc6Oga8/O+wTyw=="], + "@babel/types": ["@babel/types@8.0.0", "", { "dependencies": { "@babel/helper-string-parser": "^8.0.0", "@babel/helper-validator-identifier": "^8.0.0" } }, "sha512-K8ponJDxBwDHigkeFqaqT5wLGl4bTlwMafR8k7b5CPxr6Ww+UG9ls8Yx6Tcpboxu97eeGVEEyKcHmEyOwN1vSw=="], "@emnapi/core": ["@emnapi/core@1.10.0", "", { "dependencies": { "@emnapi/wasi-threads": "1.2.1", "tslib": "^2.4.0" } }, "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw=="], @@ -126,7 +126,7 @@ "@napi-rs/wasm-runtime": ["@napi-rs/wasm-runtime@1.1.4", "", { "dependencies": { "@tybys/wasm-util": "^0.10.1" }, "peerDependencies": { "@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1" } }, "sha512-3NQNNgA1YSlJb/kMH1ildASP9HW7/7kYnRI2szWJaofaS1hWmbGI4H+d3+22aGzXXN9IJ+n+GiFVcGipJP18ow=="], - "@oxc-project/types": ["@oxc-project/types@0.134.0", "", {}, "sha512-T0xuRRKrQFmocH8y+jGfpmSkGcheaJExY9lEihmR1Gm2aH+75B8CzgU2rABRQSzzDxLjZ15Sc0bRVLj5lVeNXQ=="], + "@oxc-project/types": ["@oxc-project/types@0.137.0", "", {}, "sha512-WT+Gb24i8hmvo85AIv2oEYouEXkRlKAlT9WaCa3TfLgNCN+GhrJOGZuIlMouAh38Qe4QOx26eUOVsq70qXrywA=="], "@oxfmt/binding-android-arm-eabi": ["@oxfmt/binding-android-arm-eabi@0.54.0", "", { "os": "android", "cpu": "arm" }, "sha512-NAtpl/SiaeU103e7/OmZw0MvUnsUUopW7hEm/ecegJg7YM0skQaA0IXEZoyTV6NUdiNPupdIUreRqUZTShbn/g=="], @@ -178,75 +178,75 @@ "@oxlint-tsgolint/win32-x64": ["@oxlint-tsgolint/win32-x64@0.23.0", "", { "os": "win32", "cpu": "x64" }, "sha512-5MyjFuqf+g8OUPJBSGWHJtmoWnzFJYyOg4To9WMQshZYEWig/vtu7JtJ03VWnzHv9LJkAUeApY0gVCOywFR/iQ=="], - "@oxlint/binding-android-arm-eabi": ["@oxlint/binding-android-arm-eabi@1.69.0", "", { "os": "android", "cpu": "arm" }, "sha512-DKQQbD5cZ/MYfDgDI7YGyGD9FSxABlsBsYFo5p26lloob543tP9+4N3guwdXIYJN+7HSZxLe8YJuwcOWw5qnHg=="], + "@oxlint/binding-android-arm-eabi": ["@oxlint/binding-android-arm-eabi@1.70.0", "", { "os": "android", "cpu": 
"arm" }, "sha512-zFh0P4cswmRvw6nkyb89dr18rRanuaCPAsEXsFDoQY8WdaquI8Pt4NWFjaMJg6L23cy5NeN8J9cBnREbWzZhaw=="], - "@oxlint/binding-android-arm64": ["@oxlint/binding-android-arm64@1.69.0", "", { "os": "android", "cpu": "arm64" }, "sha512-lEhb+I5pr4inux+JFwfCa1HRq3Os7NirEFQ0H1I35SVEHPm6byX0Ah47xmRha3qi6LAkxUcxViL8o/9PivjzBg=="], + "@oxlint/binding-android-arm64": ["@oxlint/binding-android-arm64@1.70.0", "", { "os": "android", "cpu": "arm64" }, "sha512-qI8o4HZjeGiBrWv+pJv4lH0Yi2Gl/JSp/EumBUApezJprIKa5PS4nU0lQsQngtky8k+SplQIOjv6hwu0SSxeyg=="], - "@oxlint/binding-darwin-arm64": ["@oxlint/binding-darwin-arm64@1.69.0", "", { "os": "darwin", "cpu": "arm64" }, "sha512-GY2YE8lOZW59BW1Ia1y+1gR0XyjrZRvVWHAr8LGeGhYHE0OQJ/7cRKXTkx1P+E9/6awEc3SX8a68SFTjh/E//A=="], + "@oxlint/binding-darwin-arm64": ["@oxlint/binding-darwin-arm64@1.70.0", "", { "os": "darwin", "cpu": "arm64" }, "sha512-8KjgVVHI5F9nVwHCRwwA78Ty7zNKP4Wd9OeN5PSv3iu/F/u1RVXoOCgLhWqust6HmwQG6xc8c+RCyaWENy24+w=="], - "@oxlint/binding-darwin-x64": ["@oxlint/binding-darwin-x64@1.69.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-ax1oZnOjHX3LB7myQyHEaQkDwfLb6str3/nSP6O7EVUviQGNkEGzGV0EqcBJWK+Ufwx0l4xPgyYayurvhAdl2Q=="], + "@oxlint/binding-darwin-x64": ["@oxlint/binding-darwin-x64@1.70.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-WVydssv5PSUBXFJTdNBWlmGkbNmvPGaFt/2SUT/EZRB6bq6bEOHmMlbnupZD5jmlEvi9+mZJHi8TCw15lyfSfQ=="], - "@oxlint/binding-freebsd-x64": ["@oxlint/binding-freebsd-x64@1.69.0", "", { "os": "freebsd", "cpu": "x64" }, "sha512-kHWeHv4g2h8NY+mpCxzCtY4uerMJWTN/TSnNj1CPbakFpHEJ6cTya2wWV0pDSYWOJ2+0UiEbhn3AtXxHtsnKjg=="], + "@oxlint/binding-freebsd-x64": ["@oxlint/binding-freebsd-x64@1.70.0", "", { "os": "freebsd", "cpu": "x64" }, "sha512-hJucmUf8OlinHNb1R7fI4Fw6WsAstOz7i8nmkWQfiHoZXtbufNm+MxiDTIMk1ggh2Ro4vLzgQ+bKvRY54MZoRA=="], - "@oxlint/binding-linux-arm-gnueabihf": ["@oxlint/binding-linux-arm-gnueabihf@1.69.0", "", { "os": "linux", "cpu": "arm" }, 
"sha512-gq84vM1a1oEehXo27YCDzGVcxPsZDI1yswZwz2Da1/cbnWtrL16XZZnz0G/+gIU8edtHpfjxq5c+vWEHqJfWoQ=="], + "@oxlint/binding-linux-arm-gnueabihf": ["@oxlint/binding-linux-arm-gnueabihf@1.70.0", "", { "os": "linux", "cpu": "arm" }, "sha512-1BnS7wbCYDSXwWzJJ+mc3NURoha6m6m6RT5c6vgAY3oz7C3OVXP+S0awo2mRq97arrJkVvO3qRQfyAHL+76xtQ=="], - "@oxlint/binding-linux-arm-musleabihf": ["@oxlint/binding-linux-arm-musleabihf@1.69.0", "", { "os": "linux", "cpu": "arm" }, "sha512-kIqEa98JQ0VRyrcncxA417m2AzasqTlD+FyVT1AksjvjkqQcvm7pBWYvoW3/mpyOP2XYvi5nSCCTIe6De1yu5g=="], + "@oxlint/binding-linux-arm-musleabihf": ["@oxlint/binding-linux-arm-musleabihf@1.70.0", "", { "os": "linux", "cpu": "arm" }, "sha512-yKy/UdbR55+M2yEcuiV5DCNC/gdQAjr/GioUy50QwBzSrKm8ueWADqyRLS9Xk+qjNeCYGg6A8FvUBds56ttfqg=="], - "@oxlint/binding-linux-arm64-gnu": ["@oxlint/binding-linux-arm64-gnu@1.69.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-j+xYiXozxGWx2cpjCrwwGR4awTxPFsRv3JZrv23RCogEPMc4R7UqjHW47p/RG0aRlbWiROCJ8coUfCwy0dvzHA=="], + "@oxlint/binding-linux-arm64-gnu": ["@oxlint/binding-linux-arm64-gnu@1.70.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-0A5XJ4alvmqFUFP/4oYSyaO+qLto/HrKEWTSaegiVl+HOufFngK2BjYw9x4RbwBt/du5QG6l5q1zeWiJYYG5yg=="], - "@oxlint/binding-linux-arm64-musl": ["@oxlint/binding-linux-arm64-musl@1.69.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-xEPpNppTfN1l/nM7gYSf9iocscu/as+p/7vxkLeLEKnYU+09Dm+5V6IhDYDh+Uz6FajEupWwCLt5SOG0y1PCKg=="], + "@oxlint/binding-linux-arm64-musl": ["@oxlint/binding-linux-arm64-musl@1.70.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-JiylyurlB0CLSedNtx1gzv3FvfWPF1h/2Y3BJszPLNt5XQFlBsH5ke0Jle3iJb3uqu5m2e7A/DwzpuCAHdiU+A=="], - "@oxlint/binding-linux-ppc64-gnu": ["@oxlint/binding-linux-ppc64-gnu@1.69.0", "", { "os": "linux", "cpu": "ppc64" }, "sha512-Ug0+eU7HJBlek+SjklYH62IlOMirEJsdxpihH0kSqX0XdrDD4NdHpQc10fK1JC35yn6KrrcN+uYzlHD38XAf8Q=="], + "@oxlint/binding-linux-ppc64-gnu": ["@oxlint/binding-linux-ppc64-gnu@1.70.0", "", { "os": "linux", "cpu": "ppc64" }, 
"sha512-J8VPG7I3/HmgaU4u8pNU2kFx2+0U+vPLS1dXFxXOaR/2TQ0f8AC7DRz0SRGRI1bfphnX2hVYTTtLuhL4nYKL+Q=="], - "@oxlint/binding-linux-riscv64-gnu": ["@oxlint/binding-linux-riscv64-gnu@1.69.0", "", { "os": "linux", "cpu": "none" }, "sha512-iEyI3GIg0l/s3G4qy2TlaaWKdzj4PJJStwtlocpDTC00PY9hZueotf6OKUj9+yfQh0lrpBW/pLMgTztbAHKJEg=="], + "@oxlint/binding-linux-riscv64-gnu": ["@oxlint/binding-linux-riscv64-gnu@1.70.0", "", { "os": "linux", "cpu": "none" }, "sha512-N2+4lV2KLN+oXTIIIwmWDhwkrnvqf5oX7Hw0zPjk+RuIVgiBQSOlJWF7uQoFx2siEYX0ZQ5cfSbEAHm+J3t7Wg=="], - "@oxlint/binding-linux-riscv64-musl": ["@oxlint/binding-linux-riscv64-musl@1.69.0", "", { "os": "linux", "cpu": "none" }, "sha512-NjHjpiI4WIKSMwuoJSZi5VToPeoYOS1FR52HLIDG6lidMdqquusgtODb4iLk0+lb1q3Z0nv2/aPRcC/olmpQGg=="], + "@oxlint/binding-linux-riscv64-musl": ["@oxlint/binding-linux-riscv64-musl@1.70.0", "", { "os": "linux", "cpu": "none" }, "sha512-1e2L7cFCvx9QDzq6NPP+0tABKb5z6nWHyddWTNKprEsjO9xNrAtPowuCGpjNXxkTdsMiZ4jc8YQ5SstZd4XK6g=="], - "@oxlint/binding-linux-s390x-gnu": ["@oxlint/binding-linux-s390x-gnu@1.69.0", "", { "os": "linux", "cpu": "s390x" }, "sha512-Ai/prDewoItkDXbp38gwGZi41DycZbUTZJ3UidwoHgQC0/DaqC2TGdtBTQLJ6hSD+SAxASzh8+/eSBPmxfOacA=="], + "@oxlint/binding-linux-s390x-gnu": ["@oxlint/binding-linux-s390x-gnu@1.70.0", "", { "os": "linux", "cpu": "s390x" }, "sha512-Kwu/l/8GcYibCWA9m9N5pRXMIKVSsL/YbgpLzYkqDhWTiqdRfnNJ/+nqIKRKQiFbHWsdlHEhzMwruJK+qcEruA=="], - "@oxlint/binding-linux-x64-gnu": ["@oxlint/binding-linux-x64-gnu@1.69.0", "", { "os": "linux", "cpu": "x64" }, "sha512-Gt3KHgp46mRKz4sJeaASmKvD8ayXookRw07RMf+NowhEztGGDZ7VrXpoW96XuKJLjFukWizOFVNjmYb/u7caNQ=="], + "@oxlint/binding-linux-x64-gnu": ["@oxlint/binding-linux-x64-gnu@1.70.0", "", { "os": "linux", "cpu": "x64" }, "sha512-tap04CsHYOl0nSAQJfPNIuBxqEPB2HnhQqwaOXLg1jnp2XfRo8Fa814dA4QC4zpvTWXCjAAaCY1W5LOORkEQuQ=="], - "@oxlint/binding-linux-x64-musl": ["@oxlint/binding-linux-x64-musl@1.69.0", "", { "os": "linux", "cpu": "x64" }, 
"sha512-7tQhJ2+p/oHv1zcfnjYI7YVzC/7iBaVOfIvFYtxdJ5F45mWgEdrCyXZXZGfiLey5t/5JhOhsaMnnv1kAzckd7g=="], + "@oxlint/binding-linux-x64-musl": ["@oxlint/binding-linux-x64-musl@1.70.0", "", { "os": "linux", "cpu": "x64" }, "sha512-hzJa/WgvtJpbBD9rgfy0qe+MjbxOXNUT0bfR1S6EQQzfTtBFA9xg5q8KSwRrQ2QfSS+TaP4j+4mVPQrfNc6UNg=="], - "@oxlint/binding-openharmony-arm64": ["@oxlint/binding-openharmony-arm64@1.69.0", "", { "os": "none", "cpu": "arm64" }, "sha512-vmWz6TKp/3hfA4lksR0zHBv/6xuX1jhym6eqOjdH2DXsDDHZWcp2f0KG0VCAnlVbIrjk29G4wAWMXb/Hn1YobA=="], + "@oxlint/binding-openharmony-arm64": ["@oxlint/binding-openharmony-arm64@1.70.0", "", { "os": "none", "cpu": "arm64" }, "sha512-xbsaNSNzVSnaJACCUYr1HQMyY/Q/Q1LkePmHG3UvZPvGCYGNxrsZp9OmtA6ick8xH47ltRRbRrPCM1YXYcyC+A=="], - "@oxlint/binding-win32-arm64-msvc": ["@oxlint/binding-win32-arm64-msvc@1.69.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-9RExaLgmaw6IoIkU9cTpT71mLfI0xZ86iZH8x518LVsOkjquJMYqb9P7KpC8lgd1t0Dxs41p2pxynq4XR3Ttzw=="], + "@oxlint/binding-win32-arm64-msvc": ["@oxlint/binding-win32-arm64-msvc@1.70.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-icAEsUI7JbW1TMRdEXV83mVAInhRVQYuuAlPpxdGwJ95chNdnCzjloRW8GglT0WvzOEZSio6fnYSk2DJ2Hv7LQ=="], - "@oxlint/binding-win32-ia32-msvc": ["@oxlint/binding-win32-ia32-msvc@1.69.0", "", { "os": "win32", "cpu": "ia32" }, "sha512-1907kRPF8/PrcIw1E7LMs9JbVrpgnt/MvFdss3an8oDkYNAACXzTntV3t3869ZZhMZxb2AzRGbz1pA/jdFatXA=="], + "@oxlint/binding-win32-ia32-msvc": ["@oxlint/binding-win32-ia32-msvc@1.70.0", "", { "os": "win32", "cpu": "ia32" }, "sha512-FHMSWbVsPVs/f+Jcl04ws4JJ2wUnauyTzlpxWRG/lSO/8GpX08Fo2gQZqdA6CrRFI+zvkxl+N/KwJGWfUwYVZA=="], - "@oxlint/binding-win32-x64-msvc": ["@oxlint/binding-win32-x64-msvc@1.69.0", "", { "os": "win32", "cpu": "x64" }, "sha512-w8SOXv3mT9Fi6jY8OXdXCfnvX/3KNLXGNr4HEz2TA7S4Mv/PYAOmpB8y/ge40mxvBMgGNaSaaDwZpAsQn7HtWA=="], + "@oxlint/binding-win32-x64-msvc": ["@oxlint/binding-win32-x64-msvc@1.70.0", "", { "os": "win32", "cpu": "x64" }, 
"sha512-ptOlKwCz7n4AKs5VweMqG6DAg677FmKOK+vBkkL9DMNgFATIQ+upqUYBTOEwRQyRAx1ncGlPlXleV2hIcm3z4g=="], "@quansync/fs": ["@quansync/fs@1.0.0", "", { "dependencies": { "quansync": "^1.0.0" } }, "sha512-4TJ3DFtlf1L5LDMaM6CanJ/0lckGNtJcMjQ1NAV6zDmA0tEHKZtxNKin8EgPaVX1YzljbxckyT2tJrpQKAtngQ=="], - "@rolldown/binding-android-arm64": ["@rolldown/binding-android-arm64@1.1.0", "", { "os": "android", "cpu": "arm64" }, "sha512-gCYzGOSkYY6Z034suzd20euvds7lPzMEEla62DJGE/ZAlR4OMBnNbvnBSsIGUCAr52gaWMsloGxP4tVGtN5aCA=="], + "@rolldown/binding-android-arm64": ["@rolldown/binding-android-arm64@1.1.2", "", { "os": "android", "cpu": "arm64" }, "sha512-2cZ+7xRS+DBcuJBJKnfzsbleumJhBqSlJVpuzHC0nTqfd3QQ7Vx2/x5YR/D7cBamKSeWplwo82Fn9lqYUDEMfA=="], - "@rolldown/binding-darwin-arm64": ["@rolldown/binding-darwin-arm64@1.1.0", "", { "os": "darwin", "cpu": "arm64" }, "sha512-JQBD77MNgu+4Z6RAyg69acugdrhhVoWesr3l47zohYZ2YV2fwkWMArkN/2p4l6Ei+Sno7W5q+UsKdVWq5Ens0w=="], + "@rolldown/binding-darwin-arm64": ["@rolldown/binding-darwin-arm64@1.1.2", "", { "os": "darwin", "cpu": "arm64" }, "sha512-RkPMJnygxsgOYdkfqgpwY0/Fzm8d0VQe6HGU2/B00Xa9eqdLbrII+DOKAodbJAn3ZL1AJxGHkZRPYazgGY6Ljw=="], - "@rolldown/binding-darwin-x64": ["@rolldown/binding-darwin-x64@1.1.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-p/8cXUTK4Sob604e+xxPhVSbDFf29E6J0l/xESM9rdCfn3aDai3nEs6TnMHUsdD5aNlFz0+gDbiGlozLKGa2YA=="], + "@rolldown/binding-darwin-x64": ["@rolldown/binding-darwin-x64@1.1.2", "", { "os": "darwin", "cpu": "x64" }, "sha512-Uiczh6vFhwyfd7WNe7Q7mCA4KxAiLdz7jPE/WGizfRpIieoyFuNVMmM8HqZ9HwudTkY6/AeMQwlNJ9NJijguWw=="], - "@rolldown/binding-freebsd-x64": ["@rolldown/binding-freebsd-x64@1.1.0", "", { "os": "freebsd", "cpu": "x64" }, "sha512-KbtOSlVv6fElujiZWMcC3aQYhEwLVVf073RcwlSmpGQvIsKZFUqc0ef4sjUuurRwfbiI6JJXji9DQn+86hawmQ=="], + "@rolldown/binding-freebsd-x64": ["@rolldown/binding-freebsd-x64@1.1.2", "", { "os": "freebsd", "cpu": "x64" }, 
"sha512-+TpdtTRgHiJFjCVFbw311SuLk3KfytPOQQn+VlAEv+gBxYPtL7E6JS9e/tk+8CwxhIZvemJKo4rTKgfWNsKkkA=="], - "@rolldown/binding-linux-arm-gnueabihf": ["@rolldown/binding-linux-arm-gnueabihf@1.1.0", "", { "os": "linux", "cpu": "arm" }, "sha512-9fZ9i0o0/MQaw7om6Z6TsT7tfCk0jtbEFtC+aPqZL5RNsGWNcHvn6EHgL3dAprjq+AZzPTAQjg2JtpJaMt+6pg=="], + "@rolldown/binding-linux-arm-gnueabihf": ["@rolldown/binding-linux-arm-gnueabihf@1.1.2", "", { "os": "linux", "cpu": "arm" }, "sha512-4lv1/tkmi7ueIVHnyreaOeUpiZP26BH9rRy6hoYfR9310A2B9nUEVRDvBx69vx64Nr3eTPPRkyciqJJs+j9Jmw=="], - "@rolldown/binding-linux-arm64-gnu": ["@rolldown/binding-linux-arm64-gnu@1.1.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-+tog7T66i+yFyIuuAnjL6xmW182W/qTBOUt6BtQ6lBIM1Eikh/fSMz4HGgvuCp5uU0zuIVWng7kDYthjCMOHcg=="], + "@rolldown/binding-linux-arm64-gnu": ["@rolldown/binding-linux-arm64-gnu@1.1.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-gBSUVO0eaWgw1JMjK3gB8BMlX2Mk148s2lTiVT3e9vjVxbl7UDfMWWY8CfIaaqiXuM9fVTMxIpUz6CAo/B6Vlw=="], - "@rolldown/binding-linux-arm64-musl": ["@rolldown/binding-linux-arm64-musl@1.1.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-4b7yruLIIj/oZ3GpcLOvxcLCLDMraohn3IhQfN2hBP4w9UekG0DTIajWguJosRGfySf/+h/NwRUiMKoCpxCrqQ=="], + "@rolldown/binding-linux-arm64-musl": ["@rolldown/binding-linux-arm64-musl@1.1.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-LjQP/iZLBu8o8PjIfk4x3At0/mT6h282pvz8Z5LAyhGbu/kDezyO7ea62rF5uoqmgnIYqbN/MqJ3Si3Aymi7xQ=="], - "@rolldown/binding-linux-ppc64-gnu": ["@rolldown/binding-linux-ppc64-gnu@1.1.0", "", { "os": "linux", "cpu": "ppc64" }, "sha512-QRDOVZd0bhQ5jLsUsCC3dUxDWdTSVY9WMznowZgCGOrZfLLgctWpelhUASEiBwsXfat/JwYnVd1EaxMhqyT+UQ=="], + "@rolldown/binding-linux-ppc64-gnu": ["@rolldown/binding-linux-ppc64-gnu@1.1.2", "", { "os": "linux", "cpu": "ppc64" }, "sha512-X/7bVLWelEsbyWDUSXt7zVsTniLLPIY2n1rH58qr78l9i7MNbbxBWD8gI2vRfBWf4NUXJCUuQnfZDsp32LqsfQ=="], - "@rolldown/binding-linux-s390x-gnu": ["@rolldown/binding-linux-s390x-gnu@1.1.0", "", { "os": "linux", 
"cpu": "s390x" }, "sha512-ypxT+Hq76NFG7woFbNbySnGEajFuYuIXeKz/jfCU+lXUoxfi3zLE6OG/ZQNeK3RpZSYJlAe2bokpsQ046CaieQ=="], + "@rolldown/binding-linux-s390x-gnu": ["@rolldown/binding-linux-s390x-gnu@1.1.2", "", { "os": "linux", "cpu": "s390x" }, "sha512-gb6dYKW/1KDorGXyy48glEBJs/sxVSC5pcVrox/pFGV4mvwSFeg2sK5L2tRkVsVlh7kueqOgg4GEcuipJcGuKg=="], - "@rolldown/binding-linux-x64-gnu": ["@rolldown/binding-linux-x64-gnu@1.1.0", "", { "os": "linux", "cpu": "x64" }, "sha512-IdovCmfROFmpTLahdecTDFL74aLERVYN68F/mLZjfVh6LfoplPfI6deyHNMTcVujbokDV5k05XrFO22zfv+qjg=="], + "@rolldown/binding-linux-x64-gnu": ["@rolldown/binding-linux-x64-gnu@1.1.2", "", { "os": "linux", "cpu": "x64" }, "sha512-JY4w85pU3iAiJVMh5nuk4/Mh9GjMsupe8MrIN53rwxAZW64GKrWeJBuN6SxQg9QTU5uB1cxyhDzW8jqRn1EABw=="], - "@rolldown/binding-linux-x64-musl": ["@rolldown/binding-linux-x64-musl@1.1.0", "", { "os": "linux", "cpu": "x64" }, "sha512-pcA8xlFp2tyk9T2R6Fi/rPe3bQ1MA+sSMDNUU5Ogu80GHOatkE4P8YCreGAvZErm5Ho2YRXnyvNrWiRncfVysQ=="], + "@rolldown/binding-linux-x64-musl": ["@rolldown/binding-linux-x64-musl@1.1.2", "", { "os": "linux", "cpu": "x64" }, "sha512-xvpA7o5KCYLB0Rwscmuylb1/zHHSUx4g4xilm4prC5jP76pEUlzBmMbgpbh7bVDbId4NcfT96gN5i6mE6UDaiw=="], - "@rolldown/binding-openharmony-arm64": ["@rolldown/binding-openharmony-arm64@1.1.0", "", { "os": "none", "cpu": "arm64" }, "sha512-4+fexHayrLCWpriPh4c6dNvL4an34DEZCG7zOM/FD5QNF6h8DT+bDXzyB/kfC8lDJbaFb7jKShtnjDQFXVQEjg=="], + "@rolldown/binding-openharmony-arm64": ["@rolldown/binding-openharmony-arm64@1.1.2", "", { "os": "none", "cpu": "arm64" }, "sha512-p/ts6KBLjuk49Bp21XH77poQGt02iNz7ChgHep7tudPOaLinR/De/RHdxF8w8Yj4r/bF/bqXwH6PZrB2sA+Nvw=="], - "@rolldown/binding-wasm32-wasi": ["@rolldown/binding-wasm32-wasi@1.1.0", "", { "dependencies": { "@emnapi/core": "1.10.0", "@emnapi/runtime": "1.10.0", "@napi-rs/wasm-runtime": "^1.1.4" }, "cpu": "none" }, "sha512-SbL++MNmOw6QamrwIGDMSSfM4ceTzFr+RjbOExJSLLBinScU4WI5OdA413h1qwPw2yH7lVF1+H4svQ+6mSXKTQ=="], + 
"@rolldown/binding-wasm32-wasi": ["@rolldown/binding-wasm32-wasi@1.1.2", "", { "dependencies": { "@emnapi/core": "1.11.1", "@emnapi/runtime": "1.11.1", "@napi-rs/wasm-runtime": "^1.1.5" }, "cpu": "none" }, "sha512-VMu/wmrZ9hJzYlRhbw7jK5PODlugyKZ5mOdX78+lS8OvuFkWNQdz1pFLrI2p3P0pjXOmUZ7B48o5VnMH9QOGtg=="], - "@rolldown/binding-win32-arm64-msvc": ["@rolldown/binding-win32-arm64-msvc@1.1.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-+xTE6XC7wBgk0VKRXGG+QAnyW5S9b8vfsFpiMjf0waQTmSQSU8onsH/beyZ8X4aXVveJnotiy7VDjLOaW8bTrg=="], + "@rolldown/binding-win32-arm64-msvc": ["@rolldown/binding-win32-arm64-msvc@1.1.2", "", { "os": "win32", "cpu": "arm64" }, "sha512-xtUJqs8qEkuSviS0n1tsohaPuz3a1SPhZywOji4Oo+sgrJs8daEDMZ0QtqL0OS7dx8PoVpg2J/ZZycPY5I2+Zg=="], - "@rolldown/binding-win32-x64-msvc": ["@rolldown/binding-win32-x64-msvc@1.1.0", "", { "os": "win32", "cpu": "x64" }, "sha512-Ogji1TQNqH3ACLnYr+1Ns1nyrJ0CO2P585u9Hsh02pXvtFiFpgtgT2b3P4PnCOU86VVCvqtAeCN4OftMT8KU4w=="], + "@rolldown/binding-win32-x64-msvc": ["@rolldown/binding-win32-x64-msvc@1.1.2", "", { "os": "win32", "cpu": "x64" }, "sha512-85YiLQqjUKgSO/Zjnf9e0XIn5Ymrh1fLDWBeAkZqpuBR/3R8TpfoHXuyblqyQrftSSgWO9qpcHN8mkyKsLraoA=="], "@rolldown/pluginutils": ["@rolldown/pluginutils@1.0.0", "", {}, "sha512-aKs/3GSWyV0mrhNmt/96/Z3yczC3yvrzYATCiCXQebBsGyYzjNdUphRVLeJQ67ySKVXRfMxt2lm12pmXvbPFQQ=="], @@ -276,21 +276,21 @@ "@stll/anonymize-wasm": ["@stll/anonymize-wasm@workspace:packages/anonymize/wasm"], - "@stll/fuzzy-search": ["@stll/fuzzy-search@1.1.2", "", { "optionalDependencies": { "@stll/fuzzy-search-darwin-arm64": "1.1.2", "@stll/fuzzy-search-darwin-x64": "1.1.2", "@stll/fuzzy-search-linux-arm64-gnu": "1.1.2", "@stll/fuzzy-search-linux-x64-gnu": "1.1.2", "@stll/fuzzy-search-wasm32-wasi": "1.1.2", "@stll/fuzzy-search-win32-x64-msvc": "1.1.2" } }, "sha512-0KtL+cnvZebyvo8orkR1Rb4zgUKUDdGB68a4J3lLzMKk9RTOPPOPrmQbVijHZNMD13ZA975pnSnn5ZfzqpubAw=="], + "@stll/fuzzy-search": ["@stll/fuzzy-search@1.1.3", "", { "optionalDependencies": 
{ "@stll/fuzzy-search-darwin-arm64": "1.1.3", "@stll/fuzzy-search-darwin-x64": "1.1.3", "@stll/fuzzy-search-linux-arm64-gnu": "1.1.3", "@stll/fuzzy-search-linux-x64-gnu": "1.1.3", "@stll/fuzzy-search-wasm32-wasi": "1.1.3", "@stll/fuzzy-search-win32-x64-msvc": "1.1.3" } }, "sha512-OAZPMRT2UIrxoEc3Vv9tdzpKOHHLkMaVhqGygQQE6rz5uvbBNs/EWNrD+bEIpYu/AU053LUUltj1dZmymLz19w=="], - "@stll/fuzzy-search-darwin-arm64": ["@stll/fuzzy-search-darwin-arm64@1.1.2", "", { "os": "darwin", "cpu": "arm64" }, "sha512-DyEeolxz2Hnq9DCqsSmBgDivZHPJ/+M80SkEYaqmMfl9rllLlHuNJ7nsjnZOJ47o7Zg1JEzR86wwQRmQbY0o5w=="], + "@stll/fuzzy-search-darwin-arm64": ["@stll/fuzzy-search-darwin-arm64@1.1.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-xe5qWIRQAgo6GJqS7CQhBw3B496NazlBpQbHUMEt/0qjF1O8fzTkJT7IAYoMyHTu/A/14Cv1MVyZ/WuKSAsC7Q=="], - "@stll/fuzzy-search-darwin-x64": ["@stll/fuzzy-search-darwin-x64@1.1.2", "", { "os": "darwin", "cpu": "x64" }, "sha512-p2YtioZhzzoIM2Ua+rxkfzu1SCp4TCILnWu6YqSX3Lk0QjOXIn0yd3qQVhU+XVrM8f/8Nn4PwWly44iNdYpyuw=="], + "@stll/fuzzy-search-darwin-x64": ["@stll/fuzzy-search-darwin-x64@1.1.3", "", { "os": "darwin", "cpu": "x64" }, "sha512-WrBE0MSoi52bLPiiSjplelVQoPon5QLthEIFBYZE/FKNL237/1SckWgxyHlXcbYHGkdvkV3NzVUTWBhcc+XqKw=="], - "@stll/fuzzy-search-linux-arm64-gnu": ["@stll/fuzzy-search-linux-arm64-gnu@1.1.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-d1ZaTgk/7ys1jwOY7zd85/zDoHbfOYjXDCz/aTRVjvj0c3S0cNwF40TDa29QtJPzCNad7KNeQPPOpTsEiDUZgA=="], + "@stll/fuzzy-search-linux-arm64-gnu": ["@stll/fuzzy-search-linux-arm64-gnu@1.1.3", "", { "os": "linux", "cpu": "arm64" }, "sha512-mLCGuw4uUSFsYyS8+v7rL/aUMII8bMd1XN31wjHsqJeDIpUGD7jmvIVaktssdlbkHtkeAe+G95VYdhfF8cUbhA=="], - "@stll/fuzzy-search-linux-x64-gnu": ["@stll/fuzzy-search-linux-x64-gnu@1.1.2", "", { "os": "linux", "cpu": "x64" }, "sha512-YJcwBqr1HSjt5rQ6v3iZ3SnmF+J1yFNAIG3maZbX5TRmUWYZ/M2adpLP7PO8x5GO6VgfqM+9F0oPqM/8u4UUoQ=="], + "@stll/fuzzy-search-linux-x64-gnu": ["@stll/fuzzy-search-linux-x64-gnu@1.1.3", "", { "os": 
"linux", "cpu": "x64" }, "sha512-LSeWf0jnVV+g9uGG3b8kTL4PdOz7/XwBnBZIzuzrLW6gkG1uhAnaxYvyGKbaCyy1Bqsu9rxmDhE1RcPSFkkyxQ=="], "@stll/fuzzy-search-wasm": ["@stll/fuzzy-search-wasm@1.1.2", "", { "dependencies": { "@napi-rs/wasm-runtime": "^1.1.3" }, "peerDependencies": { "vite": ">=5 <10" }, "optionalPeers": ["vite"] }, "sha512-fRaiksdcoVgZObYj1Df068y8X3fYrBYaDTuvhV8dWVBZOk+f9NP1jyZY98uv+wakW3O0WYlRvBblbBDou+B6VA=="], - "@stll/fuzzy-search-wasm32-wasi": ["@stll/fuzzy-search-wasm32-wasi@1.1.2", "", { "dependencies": { "@napi-rs/wasm-runtime": "^1.1.3" }, "cpu": "none" }, "sha512-vLHOLFl08IyN+hHEypVvT2XHP8dZAYOXRXnJ3LwajHR9HOKoWr7AYZqkc0jkrUousOUozdDfbd2oAL8EPxs44g=="], + "@stll/fuzzy-search-wasm32-wasi": ["@stll/fuzzy-search-wasm32-wasi@1.1.3", "", { "dependencies": { "@napi-rs/wasm-runtime": "^1.1.3" }, "cpu": "none" }, "sha512-Sx4P9K4W1jcqlj07Zd5s9zj0kUWp+GE8V0/5W+hHw89gTMHiDnO172A/dnj3K1BB6F+BFBgioLGChdZy5hH+Vw=="], - "@stll/fuzzy-search-win32-x64-msvc": ["@stll/fuzzy-search-win32-x64-msvc@1.1.2", "", { "os": "win32", "cpu": "x64" }, "sha512-oIyp/GPgIbUaz/AGgPvcCkikO1Po80GAqt5zARhAXs489G0kwrVXoxnF3fPZd8kMvXuOVbQkSFFy257DPDbxjA=="], + "@stll/fuzzy-search-win32-x64-msvc": ["@stll/fuzzy-search-win32-x64-msvc@1.1.3", "", { "os": "win32", "cpu": "x64" }, "sha512-fP7e4B0I2/h3MKIx1Hp7+e/QLnmv4feodEVGKGmfu9P+KbpX8ZzWdBxIEeKnPSd1fGd6F9dmwQ15YC9W+1gtzQ=="], "@stll/oxlint-config": ["@stll/oxlint-config@0.3.0", "", { "peerDependencies": { "oxlint": ">=1.66.0", "oxlint-tsgolint": ">=0.23.0" } }, "sha512-kT4jS/0mgMejp5LUrQ7joHljyphqQr2kk2zGyuJGiJqFz7pHnp9lPJw5WmDyoOqEC34+jkHh0vPeNcZIEiXhog=="], @@ -312,9 +312,9 @@ "@stll/stdnum": ["@stll/stdnum@2.1.1", "", {}, "sha512-VV+9w+u3tLYjos2Z0idJBsl+iCmE171u4rhUNEh/QDqljPBjKETKyLkf81Z1sR0QeaAcn3rg+0Y4vauPVU566w=="], - "@stll/text-search": ["@stll/text-search@1.0.6", "", { "dependencies": { "@stll/aho-corasick": "^1.0.4", "@stll/fuzzy-search": "^1.1.2", "@stll/regex-set": "^1.0.5" } }, 
"sha512-gjBAD7rssDe7SKMoouRfzourSfI+ssWv/HdiSAUXAJe6SvZgpqI2ePbeCnGzaGj1wij4QI+QgpZpqPWlXDNM+Q=="], + "@stll/text-search": ["@stll/text-search@1.0.7", "", { "dependencies": { "@stll/aho-corasick": "^1.0.4", "@stll/fuzzy-search": "^1.1.3", "@stll/regex-set": "^1.0.5" } }, "sha512-lvAwLKzLUhIToAnmjR0noS7Oa3d+2OFuTxl4BTqX1X6Z1JYphYkVB/hbyQpfGWrjl+LhFJDhOXBPkvTA7Yw39w=="], - "@stll/text-search-wasm": ["@stll/text-search-wasm@1.0.6", "", { "dependencies": { "@stll/aho-corasick-wasm": "^1.0.4", "@stll/fuzzy-search-wasm": "^1.1.2", "@stll/regex-set-wasm": "^1.0.5" }, "peerDependencies": { "vite": ">=5 <10" }, "optionalPeers": ["vite"] }, "sha512-LRDS557o0U08k4OR3m/lZMuWu8shPtb+yXNKZ7cbgF0wXe4n4ECEqAkjymcGw7gp0fk/At35C0LRphm2cj6eyQ=="], + "@stll/text-search-wasm": ["@stll/text-search-wasm@1.0.7", "", { "dependencies": { "@stll/aho-corasick-wasm": "^1.0.4", "@stll/fuzzy-search-wasm": "^1.1.2", "@stll/regex-set-wasm": "^1.0.5" }, "peerDependencies": { "vite": ">=5 <10" }, "optionalPeers": ["vite"] }, "sha512-b7oUNQ1lhS21DxIpt56j+7hPGFdrBJgTR4yygLGdEXjLjksvf0lXOu1Qgn+NwOSXaqn8O9Rw+R8bfKO0xhs1rg=="], "@stll/typescript-config": ["@stll/typescript-config@0.3.0", "", {}, "sha512-l0dj2IirCUKbSRSJ9Xwfu4vg8SjtfRlg5MEYGrx6lo/q2GLxS6MhFY+5hPOk36bJB99grqgmUtJql9Qg0WxIhw=="], @@ -336,11 +336,11 @@ "@types/jsesc": ["@types/jsesc@2.5.1", "", {}, "sha512-9VN+6yxLOPLOav+7PwjZbxiID2bVaeq0ED4qSQmdQTdjnXJSaCVKTR58t15oqH1H5t8Ng2ZX1SabJVoN9Q34bw=="], - "@types/node": ["@types/node@25.9.3", "", { "dependencies": { "undici-types": ">=7.24.0 <7.24.7" } }, "sha512-603BddQMv3pUcr4U2dhujk83N2tTDVr/34wII2B6bJy6g+8WD6yUb11jszNs0gdi4PesVWl7ABt8nYMVpnLUcg=="], + "@types/node": ["@types/node@25.9.4", "", { "dependencies": { "undici-types": ">=7.24.0 <7.24.7" } }, "sha512-dszCsrKb5U7ZsVZBWiHFklTloVl0mSEnWH/iZXfZUlI4rzCUnsvGmgqfuVRHL54ugE7/wRuxEIXRa2iMZ+BG6g=="], "ansis": ["ansis@4.3.1", "", {}, "sha512-BJ8/l4R5LRE7hW9WdSuGYrLSHi2ynxeFpDFbH0K/CgNeY/tyhk+vO6TYxXC5r5CpUhNVX310xzPsN/H9lCdfOA=="], - "ast-kit": 
["ast-kit@3.0.0-beta.1", "", { "dependencies": { "@babel/parser": "^8.0.0-beta.4", "estree-walker": "^3.0.3", "pathe": "^2.0.3" } }, "sha512-trmleAnZ2PxN/loHWVhhx1qeOHSRXq4TDsBBxq3GqeJitfk3+jTQ+v/C1km/KYq9M7wKqCewMh+/NAvVH7m+bw=="], + "ast-kit": ["ast-kit@3.0.0", "", { "dependencies": { "@babel/parser": "^8.0.0", "estree-walker": "^3.0.3", "pathe": "^2.0.3" } }, "sha512-8OG92q3R35qjC/4i6BLBMg8IB+fClWu/1PEwg2Z9Rn+BuNaiEgJzpzn+pxWOdHJWDCAwu2JP0wCDTozAM4QirQ=="], "birpc": ["birpc@4.0.0", "", {}, "sha512-LShSxJP0KTmd101b6DRyGBj57LZxSDYWKitQNW/mi8GRMvZb078Uf9+pveax1DrVL89vm7mWe+TovdI/UDOuPw=="], @@ -420,11 +420,11 @@ "nanoid": ["nanoid@3.3.12", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-ZB9RH/39qpq5Vu6Y+NmUaFhQR6pp+M2Xt76XBnEwDaGcVAqhlvxrl3B2bKS5D3NH3QR76v3aSrKaF/Kiy7lEtQ=="], - "obug": ["obug@2.1.1", "", {}, "sha512-uTqF9MuPraAQ+IsnPf366RG4cP9RtUi7MLO1N3KEc+wb0a6yKpeL0lmk2IB1jY5KHPAlTc6T/JRdC/YqxHNwkQ=="], + "obug": ["obug@2.1.3", "", {}, "sha512-9miFgM2OFba7hB+pRgvtV84pYTBaoTHohvmIgiRt6dRIzbwEOIaNaP+dIlGs2fNFoB0SeISs0Jz5WFVRid6Xyg=="], "oxfmt": ["oxfmt@0.54.0", "", { "dependencies": { "tinypool": "2.1.0" }, "optionalDependencies": { "@oxfmt/binding-android-arm-eabi": "0.54.0", "@oxfmt/binding-android-arm64": "0.54.0", "@oxfmt/binding-darwin-arm64": "0.54.0", "@oxfmt/binding-darwin-x64": "0.54.0", "@oxfmt/binding-freebsd-x64": "0.54.0", "@oxfmt/binding-linux-arm-gnueabihf": "0.54.0", "@oxfmt/binding-linux-arm-musleabihf": "0.54.0", "@oxfmt/binding-linux-arm64-gnu": "0.54.0", "@oxfmt/binding-linux-arm64-musl": "0.54.0", "@oxfmt/binding-linux-ppc64-gnu": "0.54.0", "@oxfmt/binding-linux-riscv64-gnu": "0.54.0", "@oxfmt/binding-linux-riscv64-musl": "0.54.0", "@oxfmt/binding-linux-s390x-gnu": "0.54.0", "@oxfmt/binding-linux-x64-gnu": "0.54.0", "@oxfmt/binding-linux-x64-musl": "0.54.0", "@oxfmt/binding-openharmony-arm64": "0.54.0", "@oxfmt/binding-win32-arm64-msvc": "0.54.0", "@oxfmt/binding-win32-ia32-msvc": "0.54.0", "@oxfmt/binding-win32-x64-msvc": "0.54.0" }, 
"peerDependencies": { "svelte": "^5.0.0", "vite-plus": "*" }, "optionalPeers": ["svelte", "vite-plus"], "bin": { "oxfmt": "bin/oxfmt" } }, "sha512-DjnMwn7smSLF+Mc2+pRItnuPftm/dkUFpY/d4+33y9TfKrsHZo8GLhmUg9BrOIUEy94Rlom1Q11N6vuhE+e0oQ=="], - "oxlint": ["oxlint@1.69.0", "", { "optionalDependencies": { "@oxlint/binding-android-arm-eabi": "1.69.0", "@oxlint/binding-android-arm64": "1.69.0", "@oxlint/binding-darwin-arm64": "1.69.0", "@oxlint/binding-darwin-x64": "1.69.0", "@oxlint/binding-freebsd-x64": "1.69.0", "@oxlint/binding-linux-arm-gnueabihf": "1.69.0", "@oxlint/binding-linux-arm-musleabihf": "1.69.0", "@oxlint/binding-linux-arm64-gnu": "1.69.0", "@oxlint/binding-linux-arm64-musl": "1.69.0", "@oxlint/binding-linux-ppc64-gnu": "1.69.0", "@oxlint/binding-linux-riscv64-gnu": "1.69.0", "@oxlint/binding-linux-riscv64-musl": "1.69.0", "@oxlint/binding-linux-s390x-gnu": "1.69.0", "@oxlint/binding-linux-x64-gnu": "1.69.0", "@oxlint/binding-linux-x64-musl": "1.69.0", "@oxlint/binding-openharmony-arm64": "1.69.0", "@oxlint/binding-win32-arm64-msvc": "1.69.0", "@oxlint/binding-win32-ia32-msvc": "1.69.0", "@oxlint/binding-win32-x64-msvc": "1.69.0" }, "peerDependencies": { "oxlint-tsgolint": ">=0.22.1", "vite-plus": "*" }, "optionalPeers": ["oxlint-tsgolint", "vite-plus"], "bin": { "oxlint": "bin/oxlint" } }, "sha512-ypZkK/aDc5NQV8zIR6s2H2Tl3aNW8FmJ1m9+2qsaYuRenl8vgnHNCGwTHviWJdUQzglOlHFchgopdtGhSy17Rw=="], + "oxlint": ["oxlint@1.70.0", "", { "optionalDependencies": { "@oxlint/binding-android-arm-eabi": "1.70.0", "@oxlint/binding-android-arm64": "1.70.0", "@oxlint/binding-darwin-arm64": "1.70.0", "@oxlint/binding-darwin-x64": "1.70.0", "@oxlint/binding-freebsd-x64": "1.70.0", "@oxlint/binding-linux-arm-gnueabihf": "1.70.0", "@oxlint/binding-linux-arm-musleabihf": "1.70.0", "@oxlint/binding-linux-arm64-gnu": "1.70.0", "@oxlint/binding-linux-arm64-musl": "1.70.0", "@oxlint/binding-linux-ppc64-gnu": "1.70.0", "@oxlint/binding-linux-riscv64-gnu": "1.70.0", 
"@oxlint/binding-linux-riscv64-musl": "1.70.0", "@oxlint/binding-linux-s390x-gnu": "1.70.0", "@oxlint/binding-linux-x64-gnu": "1.70.0", "@oxlint/binding-linux-x64-musl": "1.70.0", "@oxlint/binding-openharmony-arm64": "1.70.0", "@oxlint/binding-win32-arm64-msvc": "1.70.0", "@oxlint/binding-win32-ia32-msvc": "1.70.0", "@oxlint/binding-win32-x64-msvc": "1.70.0" }, "peerDependencies": { "oxlint-tsgolint": ">=0.22.1", "vite-plus": "*" }, "optionalPeers": ["oxlint-tsgolint", "vite-plus"], "bin": { "oxlint": "bin/oxlint" } }, "sha512-D6JgHtzkhRwvEC+A0Nw5AEc5bk8x5i1pHzvZIEf/a0C4hOzmAACNGtkDGPyFaxxX3ZVGxCPeig3P3rMM8XU3/g=="], "oxlint-tsgolint": ["oxlint-tsgolint@0.23.0", "", { "optionalDependencies": { "@oxlint-tsgolint/darwin-arm64": "0.23.0", "@oxlint-tsgolint/darwin-x64": "0.23.0", "@oxlint-tsgolint/linux-arm64": "0.23.0", "@oxlint-tsgolint/linux-x64": "0.23.0", "@oxlint-tsgolint/win32-arm64": "0.23.0", "@oxlint-tsgolint/win32-x64": "0.23.0" }, "bin": { "tsgolint": "bin/tsgolint.js" } }, "sha512-3mBv3CoPbh8dFbzfDGIWa2ytZjn2v+3EX4aKRXjIhsoGFzG8GCjfRirz3rwZf1wYbZzsNLTSgpw8VjQuWdp/jA=="], @@ -442,11 +442,11 @@ "resolve-pkg-maps": ["resolve-pkg-maps@1.0.0", "", {}, "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw=="], - "rolldown": ["rolldown@1.1.0", "", { "dependencies": { "@oxc-project/types": "=0.134.0", "@rolldown/pluginutils": "^1.0.0" }, "optionalDependencies": { "@rolldown/binding-android-arm64": "1.1.0", "@rolldown/binding-darwin-arm64": "1.1.0", "@rolldown/binding-darwin-x64": "1.1.0", "@rolldown/binding-freebsd-x64": "1.1.0", "@rolldown/binding-linux-arm-gnueabihf": "1.1.0", "@rolldown/binding-linux-arm64-gnu": "1.1.0", "@rolldown/binding-linux-arm64-musl": "1.1.0", "@rolldown/binding-linux-ppc64-gnu": "1.1.0", "@rolldown/binding-linux-s390x-gnu": "1.1.0", "@rolldown/binding-linux-x64-gnu": "1.1.0", "@rolldown/binding-linux-x64-musl": "1.1.0", "@rolldown/binding-openharmony-arm64": "1.1.0", 
"@rolldown/binding-wasm32-wasi": "1.1.0", "@rolldown/binding-win32-arm64-msvc": "1.1.0", "@rolldown/binding-win32-x64-msvc": "1.1.0" }, "bin": { "rolldown": "./bin/cli.mjs" } }, "sha512-zpMvlJhs5PkXRTtKc0CaLBVI9AR/VDiJFpM+kx//hgToEca7FgMlGjaRIisXBcb19T76LswgmKECSQ96hjWr5A=="], + "rolldown": ["rolldown@1.1.2", "", { "dependencies": { "@oxc-project/types": "=0.137.0", "@rolldown/pluginutils": "^1.0.0" }, "optionalDependencies": { "@rolldown/binding-android-arm64": "1.1.2", "@rolldown/binding-darwin-arm64": "1.1.2", "@rolldown/binding-darwin-x64": "1.1.2", "@rolldown/binding-freebsd-x64": "1.1.2", "@rolldown/binding-linux-arm-gnueabihf": "1.1.2", "@rolldown/binding-linux-arm64-gnu": "1.1.2", "@rolldown/binding-linux-arm64-musl": "1.1.2", "@rolldown/binding-linux-ppc64-gnu": "1.1.2", "@rolldown/binding-linux-s390x-gnu": "1.1.2", "@rolldown/binding-linux-x64-gnu": "1.1.2", "@rolldown/binding-linux-x64-musl": "1.1.2", "@rolldown/binding-openharmony-arm64": "1.1.2", "@rolldown/binding-wasm32-wasi": "1.1.2", "@rolldown/binding-win32-arm64-msvc": "1.1.2", "@rolldown/binding-win32-x64-msvc": "1.1.2" }, "bin": { "rolldown": "./bin/cli.mjs" } }, "sha512-x0CrQQqCXWGeI8dTvFfN/Dnv3yMKT9hv5jFjlOreKAx9wqLq9wz7VvLLHyaAXC90/CpggTu9SisSbsJJTPSjNQ=="], - "rolldown-plugin-dts": ["rolldown-plugin-dts@0.25.2", "", { "dependencies": { "@babel/generator": "8.0.0-rc.6", "@babel/helper-validator-identifier": "8.0.0-rc.6", "@babel/parser": "8.0.0-rc.6", "ast-kit": "^3.0.0-beta.1", "birpc": "^4.0.0", "dts-resolver": "^3.0.0", "get-tsconfig": "5.0.0-beta.5", "obug": "^2.1.1" }, "peerDependencies": { "@ts-macro/tsc": "^0.3.6", "@typescript/native-preview": ">=7.0.0-dev.20260325.1", "rolldown": "^1.0.0", "typescript": "^5.0.0 || ^6.0.0", "vue-tsc": "~3.2.0" }, "optionalPeers": ["@ts-macro/tsc", "@typescript/native-preview", "typescript", "vue-tsc"] }, "sha512-nMhN/R+vmR8GM45ZW1FWMSjRTSDDn/6w4GTf8RNrEFCBdl8B1kySWrU1ixPtbwzXoRlcO+R/S88VgXuJQwfdDg=="], + "rolldown-plugin-dts": 
["rolldown-plugin-dts@0.26.0", "", { "dependencies": { "@babel/generator": "^8.0.0", "@babel/helper-validator-identifier": "^8.0.0", "@babel/parser": "^8.0.0", "ast-kit": "^3.0.0", "birpc": "^4.0.0", "dts-resolver": "^3.0.0", "get-tsconfig": "5.0.0-beta.5", "obug": "^2.1.3" }, "peerDependencies": { "@ts-macro/tsc": "^0.3.6", "@typescript/native-preview": ">=7.0.0-dev.20260325.1", "rolldown": "^1.0.0", "typescript": "^5.0.0 || ^6.0.0", "vue-tsc": "~3.2.0 || ~3.3.0" }, "optionalPeers": ["@ts-macro/tsc", "@typescript/native-preview", "typescript", "vue-tsc"] }, "sha512-e+kEPtUiDES0htk5iqkSeF4EzAV7R+vugGB44iPDuw1Kw9E+WyL1VG7PaV0IIjGHLiacztMBcMTyrr8ON9CT1Q=="], - "semver": ["semver@7.8.1", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-rkVq3IXh+4FDGch+KwzX3aV9W3kO54GyEgpvBzSyctDA6Xtd7RJQV1xmXbeQp5v7+VzLOfVqiutSE6GICgPFvg=="], + "semver": ["semver@7.8.5", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-Y7/KDsb8LjooZpwaqGyulO6DQlksgCncchHGk+sZIY4SBvUocMBEFH5Ur1fI4dV+Jvl0w6cjvucaIi40puRioA=="], "source-map-js": ["source-map-js@1.2.1", "", {}, "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA=="], @@ -460,7 +460,7 @@ "tree-kill": ["tree-kill@1.2.2", "", { "bin": { "tree-kill": "cli.js" } }, "sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A=="], - "tsdown": ["tsdown@0.22.2", "", { "dependencies": { "ansis": "^4.3.1", "cac": "^7.0.0", "defu": "^6.1.7", "empathic": "^2.0.1", "hookable": "^6.1.1", "import-without-cache": "^0.4.0", "obug": "^2.1.1", "picomatch": "^4.0.4", "rolldown": "~1.1.0", "rolldown-plugin-dts": "^0.25.2", "semver": "^7.8.1", "tinyexec": "^1.2.4", "tinyglobby": "^0.2.17", "tree-kill": "^1.2.2", "unconfig-core": "^7.5.0" }, "peerDependencies": { "@arethetypeswrong/core": "^0.18.1", "@tsdown/css": "0.22.2", "@tsdown/exe": "0.22.2", "@vitejs/devtools": "*", "publint": "^0.3.8", "tsx": "*", "typescript": "^5.0.0 || ^6.0.0", "unplugin-unused": "^0.5.0", "unrun": 
"*" }, "optionalPeers": ["@arethetypeswrong/core", "@tsdown/css", "@tsdown/exe", "@vitejs/devtools", "publint", "tsx", "typescript", "unplugin-unused", "unrun"], "bin": { "tsdown": "./dist/run.mjs" } }, "sha512-VX9gsyKXsTnBZjnIM4jsHl9aRv+GfgkE/k1hQslilaBfZMlaw3JuGR+6yhiU0QxWBtOCDnTjwOSoXzgB7Rr50g=="], + "tsdown": ["tsdown@0.22.3", "", { "dependencies": { "ansis": "^4.3.1", "cac": "^7.0.0", "defu": "^6.1.7", "empathic": "^2.0.1", "hookable": "^6.1.1", "import-without-cache": "^0.4.0", "obug": "^2.1.3", "picomatch": "^4.0.4", "rolldown": "~1.1.1", "rolldown-plugin-dts": "^0.26.0", "semver": "^7.8.4", "tinyexec": "^1.2.4", "tinyglobby": "^0.2.17", "tree-kill": "^1.2.2", "unconfig-core": "^7.5.0" }, "peerDependencies": { "@arethetypeswrong/core": "^0.18.1", "@tsdown/css": "0.22.3", "@tsdown/exe": "0.22.3", "@vitejs/devtools": "*", "publint": "^0.3.8", "tsx": "*", "typescript": "^5.0.0 || ^6.0.0", "unplugin-unused": "^0.5.0", "unrun": "*" }, "optionalPeers": ["@arethetypeswrong/core", "@tsdown/css", "@tsdown/exe", "@vitejs/devtools", "publint", "tsx", "typescript", "unplugin-unused", "unrun"], "bin": { "tsdown": "./dist/run.mjs" } }, "sha512-louqbfA8Qf//B9jTTL0FPtXTNpjCWv1VPkbcmQMph2pTpzs+LnB1tbe4tDDRVpo2BjF5SgUXaTZe45SxB8pWHg=="], "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], @@ -474,13 +474,23 @@ "vite": ["vite@8.0.16", "", { "dependencies": { "lightningcss": "^1.32.0", "picomatch": "^4.0.4", "postcss": "^8.5.15", "rolldown": "1.0.3", "tinyglobby": "^0.2.17" }, "optionalDependencies": { "fsevents": "~2.3.3" }, "peerDependencies": { "@types/node": "^20.19.0 || >=22.12.0", "@vitejs/devtools": "^0.1.18", "esbuild": "^0.27.0 || ^0.28.0", "jiti": ">=1.21.0", "less": "^4.0.0", "sass": "^1.70.0", "sass-embedded": "^1.70.0", "stylus": ">=0.54.8", "sugarss": "^5.0.0", "terser": "^5.16.0", "tsx": "^4.8.1", "yaml": "^2.4.2" }, "optionalPeers": ["@types/node", "@vitejs/devtools", "esbuild", 
"jiti", "less", "sass", "sass-embedded", "stylus", "sugarss", "terser", "tsx", "yaml"], "bin": { "vite": "bin/vite.js" } }, "sha512-h9bXPmJichP5fLmVQo3PyaGSDE2n3aPuomeAlVRm0JLmt4rY6zmPKd59HYI4LNW8oTK7tlTsuC7l/m7awx9Jcw=="], - "ast-kit/@babel/parser": ["@babel/parser@8.0.0-rc.3", "", { "dependencies": { "@babel/types": "^8.0.0-rc.3" }, "bin": "./bin/babel-parser.js" }, "sha512-B20dvP3MfNc/XS5KKCHy/oyWl5IA6Cn9YjXRdDlCjNmUFrjvLXMNUfQq/QUy9fnG2gYkKKcrto2YaF9B32ToOQ=="], + "@rolldown/binding-wasm32-wasi/@emnapi/core": ["@emnapi/core@1.11.1", "", { "dependencies": { "@emnapi/wasi-threads": "1.2.2", "tslib": "^2.4.0" } }, "sha512-RSvbQmHzdKzNsLYa/wHrbc3KN4sYLKAdPZxqiM2HATqv/SBk2/ENSHpvXGaLOMcsAyz0poEGqkmmKYG3OWiJEQ=="], + + "@rolldown/binding-wasm32-wasi/@emnapi/runtime": ["@emnapi/runtime@1.11.1", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-vgj7R3y3Wgx24IQaGPA/R6YFXLHVMOZ0uVEyIQPaWs+rd1AzfEMXlAC22FYwO1XkKR6NPsq7mUandH8oIRdZFw=="], + + "@rolldown/binding-wasm32-wasi/@napi-rs/wasm-runtime": ["@napi-rs/wasm-runtime@1.1.5", "", { "dependencies": { "@tybys/wasm-util": "^0.10.2" }, "peerDependencies": { "@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1" } }, "sha512-AWPoBRJ9tsnVhor4sjO7rkni+7p+2IAEFj6cx06UgP10jkQHqay/36uRV/bFkgrh18D9vb4cr8Q0Pthskgzy+Q=="], + + "@stll/fuzzy-search-wasm32-wasi/@napi-rs/wasm-runtime": ["@napi-rs/wasm-runtime@1.1.5", "", { "dependencies": { "@tybys/wasm-util": "^0.10.2" }, "peerDependencies": { "@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1" } }, "sha512-AWPoBRJ9tsnVhor4sjO7rkni+7p+2IAEFj6cx06UgP10jkQHqay/36uRV/bFkgrh18D9vb4cr8Q0Pthskgzy+Q=="], "bun-types/@types/node": ["@types/node@25.5.0", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw=="], "vite/rolldown": ["rolldown@1.0.3", "", { "dependencies": { "@oxc-project/types": "=0.133.0", "@rolldown/pluginutils": "^1.0.0" }, "optionalDependencies": { 
"@rolldown/binding-android-arm64": "1.0.3", "@rolldown/binding-darwin-arm64": "1.0.3", "@rolldown/binding-darwin-x64": "1.0.3", "@rolldown/binding-freebsd-x64": "1.0.3", "@rolldown/binding-linux-arm-gnueabihf": "1.0.3", "@rolldown/binding-linux-arm64-gnu": "1.0.3", "@rolldown/binding-linux-arm64-musl": "1.0.3", "@rolldown/binding-linux-ppc64-gnu": "1.0.3", "@rolldown/binding-linux-s390x-gnu": "1.0.3", "@rolldown/binding-linux-x64-gnu": "1.0.3", "@rolldown/binding-linux-x64-musl": "1.0.3", "@rolldown/binding-openharmony-arm64": "1.0.3", "@rolldown/binding-wasm32-wasi": "1.0.3", "@rolldown/binding-win32-arm64-msvc": "1.0.3", "@rolldown/binding-win32-x64-msvc": "1.0.3" }, "bin": { "rolldown": "./bin/cli.mjs" } }, "sha512-i00lAJ2ks1BYr7rjNjKC7BcqAS7nVfiT3QX1SI5aY+AFHblCmaUf9OE9dbdzDvW6dJxbi2ZCZiy9v3CcwOiX3g=="], - "ast-kit/@babel/parser/@babel/types": ["@babel/types@8.0.0-rc.3", "", { "dependencies": { "@babel/helper-string-parser": "^8.0.0-rc.3", "@babel/helper-validator-identifier": "^8.0.0-rc.3" } }, "sha512-mOm5ZrYmphGfqVWoH5YYMTITb3cDXsFgmvFlvkvWDMsR9X8RFnt7a0Wb6yNIdoFsiMO9WjYLq+U/FMtqIYAF8Q=="], + "@rolldown/binding-wasm32-wasi/@emnapi/core/@emnapi/wasi-threads": ["@emnapi/wasi-threads@1.2.2", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-c95qOXkHdydNKhscBTebqEC1CVAZpyqOfVfBzQ1qgzyl3gfeldUjIggDbIZgDKsHLgnsM+igH7TJ/eAasaVuMA=="], + + "@rolldown/binding-wasm32-wasi/@napi-rs/wasm-runtime/@tybys/wasm-util": ["@tybys/wasm-util@0.10.2", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-RoBvJ2X0wuKlWFIjrwffGw1IqZHKQqzIchKaadZZfnNpsAYp2mM0h36JtPCjNDAHGgYez/15uMBpfGwchhiMgg=="], + + "@stll/fuzzy-search-wasm32-wasi/@napi-rs/wasm-runtime/@tybys/wasm-util": ["@tybys/wasm-util@0.10.2", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-RoBvJ2X0wuKlWFIjrwffGw1IqZHKQqzIchKaadZZfnNpsAYp2mM0h36JtPCjNDAHGgYez/15uMBpfGwchhiMgg=="], "bun-types/@types/node/undici-types": ["undici-types@7.18.2", "", {}, 
"sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="], @@ -515,9 +525,5 @@ "vite/rolldown/@rolldown/binding-win32-arm64-msvc": ["@rolldown/binding-win32-arm64-msvc@1.0.3", "", { "os": "win32", "cpu": "arm64" }, "sha512-gEdFFEN70A/jxb2svrWsN3aDL7OUtmvlOy+6fa2jxG8K0wQ1ZbdeLGnidov6Yu5/733dI5ySfzFlQ/cb0bSz1g=="], "vite/rolldown/@rolldown/binding-win32-x64-msvc": ["@rolldown/binding-win32-x64-msvc@1.0.3", "", { "os": "win32", "cpu": "x64" }, "sha512-eXB7CHuaQdqmJcc3koCNtNPmT/bj2gc999kUFgBxG8Ac0NdgXc4rkCHhqrgrhN3zddvvvrgzj1e90SuSfmyIXA=="], - - "ast-kit/@babel/parser/@babel/types/@babel/helper-string-parser": ["@babel/helper-string-parser@8.0.0-rc.3", "", {}, "sha512-AmwWFx1m8G/a5cXkxLxTiWl+YEoWuoFLUCwqMlNuWO1tqAYITQAbCRPUkyBHv1VOFgfjVOqEj6L3u15J5ZCzTA=="], - - "ast-kit/@babel/parser/@babel/types/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@8.0.0-rc.3", "", {}, "sha512-8AWCJ2VJJyDFlGBep5GpaaQ9AAaE/FjAcrqI7jyssYhtL7WGV0DOKpJsQqM037xDbpRLHXsY8TwU7zDma7coOw=="], } } diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index e3af912c..3148368f 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -10,7 +10,7 @@ keywords = ["anonymization", "pii", "redaction", "text"] categories = ["text-processing"] [dependencies] -fancy-regex = "0.16" +fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "a5d6e11f5c832be50cba42882d2844394adb9403" } diff --git a/package.json b/package.json index 11a11068..11458b04 100644 --- a/package.json +++ b/package.json @@ -36,7 +36,7 @@ "@stll/typescript-config": "^0.3.0", "lefthook": "^2.1.9", "oxfmt": "^0.54.0", - "oxlint": "^1.69.0", + "oxlint": "^1.70.0", "oxlint-tsgolint": "^0.23.0", "turbo": "^2.9.18" } diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index 
2eb16dd4..b4334eee 100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -42,7 +42,7 @@ "dependencies": { "@huggingface/tokenizers": "^0.1.3", "@stll/stdnum": "^2.1.1", - "@stll/text-search": "^1.0.6" + "@stll/text-search": "^1.0.7" }, "peerDependencies": { "@stll/anonymize-data": "^0.0.6" @@ -54,10 +54,10 @@ }, "devDependencies": { "@stll/anonymize-data": "workspace:*", - "@stll/text-search-wasm": "^1.0.6", + "@stll/text-search-wasm": "^1.0.7", "bun-types": "^1.3.14", "fast-check": "^4.8.0", - "tsdown": "^0.22.2", + "tsdown": "^0.22.3", "typescript": "^6.0.3", "vite": "^8.0.16" } diff --git a/packages/anonymize/wasm/package.json b/packages/anonymize/wasm/package.json index 5a9d0543..7129124a 100644 --- a/packages/anonymize/wasm/package.json +++ b/packages/anonymize/wasm/package.json @@ -34,7 +34,7 @@ "dependencies": { "@huggingface/tokenizers": "^0.1.3", "@stll/stdnum": "^2.1.1", - "@stll/text-search-wasm": "^1.0.5" + "@stll/text-search-wasm": "^1.0.7" }, "peerDependencies": { "@stll/anonymize-data": "^0.0.6", diff --git a/packages/cli/package.json b/packages/cli/package.json index f638c473..8ff88b86 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -31,9 +31,9 @@ }, "devDependencies": { "@stll/anonymize-wasm": "workspace:*", - "@types/node": "^25.9.3", + "@types/node": "^25.9.4", "bun-types": "^1.3.14", - "tsdown": "^0.22.2", + "tsdown": "^0.22.3", "typescript": "^6.0.3" } } diff --git a/packages/corpus/package.json b/packages/corpus/package.json index 5578a675..a367a9ff 100644 --- a/packages/corpus/package.json +++ b/packages/corpus/package.json @@ -15,7 +15,7 @@ "@stll/anonymize-data": "^0.0.6" }, "devDependencies": { - "@types/node": "^25.9.3", + "@types/node": "^25.9.4", "bun-types": "^1.3.14", "typescript": "^6.0.3" } diff --git a/packages/data/package.json b/packages/data/package.json index f992d4c8..cb6265b4 100644 --- a/packages/data/package.json +++ b/packages/data/package.json @@ -52,7 +52,7 @@ }, 
"devDependencies": { "stopwords-iso": "1.1.0", - "tsdown": "^0.22.2", + "tsdown": "^0.22.3", "typescript": "^6.0.3" } } From 0778c82d5facb7deccbece5d1d207f916c0a8123 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 11:16:26 +0200 Subject: [PATCH 031/130] fix: keep migration fixtures behavior-stable --- .../cs/nakit-legal-services-framework.snapshot.json | 6 +++--- .../contracts/cs/sanofi-bonus-agreement.snapshot.json | 8 ++++---- .../de/geschaeftsfuehrer-dienstvertrag.snapshot.json | 4 ++-- .../en/software-license-agreement.snapshot.json | 9 ++++----- .../anonymize/src/__test__/us-bank-routing.test.ts | 10 ---------- packages/anonymize/src/data/address-boundaries.json | 4 ---- packages/anonymize/src/data/triggers.de.json | 2 +- packages/anonymize/src/data/triggers.en.json | 11 ----------- packages/anonymize/src/detectors/regex.ts | 8 ++++---- packages/data/config/address-boundaries.json | 4 ---- packages/data/config/triggers.de.json | 2 +- packages/data/config/triggers.en.json | 11 ----------- 12 files changed, 19 insertions(+), 60 deletions(-) diff --git a/packages/anonymize/src/__test__/fixtures/contracts/cs/nakit-legal-services-framework.snapshot.json b/packages/anonymize/src/__test__/fixtures/contracts/cs/nakit-legal-services-framework.snapshot.json index 4b2e3172..938969d7 100644 --- a/packages/anonymize/src/__test__/fixtures/contracts/cs/nakit-legal-services-framework.snapshot.json +++ b/packages/anonymize/src/__test__/fixtures/contracts/cs/nakit-legal-services-framework.snapshot.json @@ -111,9 +111,9 @@ }, { "start": 12008, - "end": 12029, + "end": 12077, "label": "address", - "text": "Kodaňská 46, Praha 10", + "text": "Kodaňská 46, Praha 10, nebude-li v konkrétním případě dohodnuto jinak", "source": "regex" }, { @@ -229,5 +229,5 @@ "source": "trigger" } ], - "redactedText": "RÁMCOVÁ DOHODA NA POSKYTOVÁNÍ PRÁVNÍCH SLUŽEB\n\nČíslo 2026/051 NAKIT\n\n\n\nSmluvní strany\n\n\n\n[ORGANIZATION_1]\n\nse sídlem \t[ADDRESS_1]\n\nIČO: 
\t[REGISTRATION_NUMBER_1] \n\nDIČ: \t [TAX_IDENTIFICATION_NUMBER_1]\n\nzastoupen: \txxx\n\nzapsán v obchodním rejstříku vedeném [ORGANIZATION_2], [REGISTRATION_NUMBER_2]\n\nbankovní spojení \txxx\n\n\tč. ú. xxx\n\n(dále jen „Objednatel“)\n\n\n\na\n\n[PERSON_1]\n\nse sídlem [ADDRESS_2]\n\nIČO: [REGISTRATION_NUMBER_3]\n\nDIČ: [TAX_IDENTIFICATION_NUMBER_2] \n\nbankovní spojení xxx\n\n č. ú. xxx\n\n \n\n (dále jen „Poskytovatel“)\n\n\n\n(Objednatel a Poskytovatel budou v této rámcové dohodě na poskytování právních služeb označováni jednotlivě jako „Smluvní strana“ a společně jako „Smluvní strany“ a tato rámcová dohoda jako „Smlouva“),\n\n\n\nuzavírají v souladu s ustanovením § 1746 odst. 2 zákona č. 89/2012 Sb., občanský zákoník, v platném znění (dále jen „Občanský zákoník“) a v souladu s ustanovením § 29 písm. k) bod 1. a 2. zákona č. 134/2016 Sb., o zadávání veřejných zakázek, ve znění pozdějších předpisů (dále jen „Zákon o zadávání veřejných zakázek“), jakož i v souladu se zákonem č. 85/1996 Sb., o advokacii, ve znění pozdějších předpisů (dále jen „Zákon o advokacii“) tuto Smlouvu. \n\n\n\n\n\n\n\n\n\nÚčel a předmět Smlouvy\n\nÚčelem této Smlouvy je stanovení podmínek a právního rámce pro uzavírání Dílčích smluv (jak je tento pojem definován níže v odst. 1.8 Smlouvy) mezi Objednatelem a Poskytovatelem na poskytování právních služeb, a to na základě písemných Objednávek Objednatele.\n\nPředmětem této Smlouvy je stanovení práv a povinností Smluvních stran pro postup při uzavírání Dílčích smluv a následném poskytování právních služeb Poskytovatelem Objednateli, přičemž poskytováním právních služeb se pro účely této Smlouvy rozumí poskytování právních služeb ve smyslu § 29 odst. 1 písm. k) bod 1. a 2. Zákona o zadávání veřejných zakázek (dále jen „Služby“).\n\nPoskytovatel se zavazuje poskytnout Objednateli Služby na základě Dílčí smlouvy. Služby poskytované Poskytovatelem Objednateli na základě konkrétní Dílčí smlouvy budou dále nazývány jako „Plnění“. 
Dílčí smlouvy budou uzavírány níže uvedeným postupem, na základě písemné Objednávky Objednatele doručené Poskytovateli (dále jen „Objednávka“). Objednávka musí obsahovat minimálně tyto náležitosti:\n\nidentifikační údaje Poskytovatele a Objednatele;\n\nčíslo a datum vystavení Objednávky;\n\nčíslo Smlouvy;\n\nrámcové vymezení Plnění;\n\nmaximální rozsah a maximální cenu Plnění; a\n\npodpis oprávněné osoby Objednatele.\n\nObjednatel je oprávněn, avšak nikoli povinen, vystavovat dle svého uvážení Objednávky ode dne nabytí účinnosti této Smlouvy. Každá takto vystavená Objednávka se považuje za návrh na uzavření Dílčí smlouvy za podmínek stanovených touto Smlouvou. Poskytovatel je povinen písemně potvrdit Objednávku ve lhůtě dvou (2) pracovních dnů od jejího doručení Poskytovateli.\n\nPotvrzení Objednávky musí obsahovat minimálně tyto náležitosti: \n\nidentifikační údaje Objednatele a Poskytovatele; \n\nčíslo Objednávky, která je potvrzována; a \n\npodpis oprávněné osoby Poskytovatele.\n\nV případě, že Objednávka nebude splňovat uvedené minimální náležitosti, má Poskytovatel povinnost na tuto skutečnost neprodleně upozornit Objednatele. Objednatel je poté povinen vystavit novou Objednávku a Poskytovatel je povinen ji ve lhůtě dvou (2) pracovních dnů od jejího doručení písemně potvrdit. Není-li v článku 4 Smlouvy stanoveno jinak, běží lhůta pro poskytnutí Plnění dle příslušné Dílčí smlouvy od okamžiku doručení této nové Objednávky. \n\nPotvrzení Objednávky, které obsahuje dodatky, výhrady, omezení nebo jiné změny se považuje za odmítnutí Objednávky a tvoří nový návrh Poskytovatele na uzavření Dílčí smlouvy, a to i v případě takového dodatku, výhrady, omezení nebo jiné změny, které podstatně nemění podmínky Objednávky. Dílčí smlouva je v takovém případě uzavřena pouze tehdy, pokud tento nový návrh Objednatel písemně potvrdí a doručí zpět Poskytovateli. 
\n\nDoručením potvrzení Objednávky Objednateli dojde k uzavření smlouvy o poskytnutí služeb, přičemž práva a povinnosti Smluvních stran dle této smlouvy o poskytnutí služeb odpovídají v celém rozsahu právům a povinnostem Objednatele a Poskytovatele stanovených touto Smlouvou (dále jen „Dílčí smlouva“).\n\nPočet Objednávek vystavených Objednatelem není omezený. Současně platí, že Objednatel není povinen Objednávku vystavit.\n\nPoskytovatel se zavazuje poskytnout Objednateli Plnění za podmínek uvedených v této Smlouvě a v Dílčí smlouvě ve sjednaném rozsahu, jakosti a čase. \n\nObjednatel se zavazuje zaplatit za Plnění poskytnuté v souladu s touto Smlouvou a Dílčí smlouvou Cenu dle článku 2 této Smlouvy.\n\nObjednatel při uzavírání této Smlouvy negarantuje žádný minimální objem plnění, který bude zadán v průběhu její platnosti. Objednatel uzpůsobuje rozsah poptávaného plnění svým aktuálním potřebám, které jsou v čase proměnlivé. Poskytovatel se přes výše uvedené zavazuje být připraven poskytnout plnění v rozsahu poptávaném Objednatelem dle podmínek této Smlouvy. \n\nSmluvní strany sjednávají, že k poskytnutí konkrétního Plnění (resp. jeho relevantní části) na základě Dílčí smlouvy je Poskytovatel povinen na základě, v rozsahu a v souladu s požadavky a/nebo pokyny Objednatele, které budou činěny prostřednictvím e-mailové komunikace kontaktní osobou Objednatele uvedenou v čl. 13 odst. 13.11 písm. a) Smlouvy nebo jí pověřenou osobou. V e-mailu podle přechozí věty Objednatel uvede specifikaci konkrétního požadavku (včetně případného požadavku na výstup) a/nebo pokynu. Hovoří-li se v této Smlouvě o Plnění, rozumí se jím i jeho relevantní část, poskytnutá Objednateli na základě konkrétního požadavku a/nebo pokynu dle tohoto odstavce Smlouvy.\n\nKaždá Dílčí smlouva nabývá platnosti dnem uzavření. Dílčí smlouva nabývá účinnosti dnem uzavření, nevztahuje-li se na ni povinnost zveřejnění v registru smluv podle zákona č. 
340/2015 Sb., o zvláštních podmínkách účinnosti některých smluv, uveřejňování těchto smluv a o registru smluv (zákon o registru smluv) ve znění pozdějších předpisů (dále jen „Zákon o registru smluv“). Vztahuje-li se na příslušnou Dílčí smlouvu povinnost jejího zveřejnění v registru smluv, nabývá Dílčí smlouva účinnosti dnem zveřejnění v registru smluv, přičemž zveřejnění Dílčí smlouvy v registru smluv zajistí Objednatel. V Dílčí smlouvě může být výslovně uvedeno pozdější datum nabytí účinnosti než dnem jejího uzavření/zveřejnění v registru smluv (dle relevance).\n\nCena\n\nCena za poskytnutí Plnění Poskytovatelem odpovídá součinu skutečného časového rozsahu poskytnutého Plnění a hodinové sazby dle Přílohy č. 1 této Smlouvy na základě konkrétní Dílčí smlouvy (dále jen „Cena“). Nejnižší časová jednotka odpracovaného času, za kterou náleží Poskytovateli odměna za poskytnuté Plnění, je jedna (1) hodina.\n\nSkutečný časový rozsah Plnění je limitován odhadovaným maximálním časovým rozsahem Plnění uvedeným v Dílčí smlouvě. Skutečný časový rozsah Plnění bude Poskytovatelem Objednateli dokladován v rámci akceptační procedury dle článku 6 Smlouvy, jejíž průběh bude stvrzen Smluvními stranami podpisem Akceptačního protokolu, jehož vzor tvoří Přílohu č. 2 této Smlouvy a je její nedílnou součástí. \n\nObjednatel si vyhrazuje právo uznat v rámci fakturace pouze takový časový rozsah Plnění, který byl na poskytování Plnění účelně vynaložen. \n\nCena každé jednotlivé složky Plnění zahrnuje veškeré náklady Poskytovatele spojené s plněním Smlouvy, Dílčí smlouvy a poskytnutím Plnění Objednateli, vyjma pravomocně přiznané odměny za zastupování v soudním řízení, která připadá Poskytovateli. Tato Cena je cenou konečnou.\n\nCelková cena Plnění poskytnutého na základě této Smlouvy a Dílčích smluv nesmí převýšit částku [MONETARY_AMOUNT_1] bez DPH. 
DPH bude připočítána k ceně v souladu s platnými právními předpisy ke dni uskutečnění zdanitelného plnění.\n\nPlatební podmínky\n\nDaňové doklady za poskytování Plnění budou Poskytovatelem vystavovány vždy k poslednímu dni příslušného kalendářního měsíce, ve kterém bylo Plnění poskytováno, a bude v nich vyúčtováno Plnění poskytnuté Objednateli bez jakýchkoli vad v příslušném kalendářním měsíci. Za den uskutečnění zdanitelného plnění se považuje den podpisu Akceptačního protokolu Objednatelem.\n\nDaňový doklad (faktura) musí obsahovat náležitosti řádného daňového dokladu podle příslušných právních předpisů, zejména podle § 29 zákona č. 235/2004 Sb., o dani z přidané hodnoty, ve znění pozdějších předpisů (dále jen „Zákon o DPH“), dle zákona č. 563/1991 Sb., o účetnictví, ve znění pozdějších předpisů, dle § 435 Občanského zákoníku a níže uvedené údaje:\n\nčíslo Smlouvy a Dílčí smlouvy (Objednávky),\n\nplatební podmínky v souladu se Smlouvou a Dílčí smlouvou,\n\nmísto a datum předání a převzetí Plnění,\n\npopis fakturovaného Plnění, rozsah, jednotkovou a celkovou cenu,\n\npřílohou je kopie Akceptačního protokolu s výrokem „Akceptováno“, odsouhlaseného a potvrzeného Objednatelem.\n\nSplatnost daňového dokladu (faktury) vystaveného Poskytovatelem je třicet (30) kalendářních dní ode dne jeho doručení Objednateli. \n\nPoskytovatel zašle daňový doklad spolu s veškerými požadovanými dokumenty Objednateli nejpozději do pěti (5) kalendářních dnů od podpisu Akceptačního protokolu, jedním z následujících způsobů: \n\nbuď v elektronické podobě na adresu:\n\nxxx\n\nnebo doporučeným dopisem na následující adresu: \n\n[ORGANIZATION_1]\n\n[ADDRESS_1].\n\nV případě, že faktura nebude obsahovat stanovené náležitosti, přílohy nebo nebude vystavena v souladu s touto Smlouvou, je Objednatel oprávněn vrátit ji ve lhůtě splatnosti Poskytovateli k doplnění či opravě, aniž se tím dostane do prodlení. 
Lhůta splatnosti v délce třicet (30) kalendářních dní počíná běžet znovu ode dne doručení náležitě doplněné či opravené faktury Objednateli.\n\nPlatba bude provedena v české měně formou bankovního převodu na účet Poskytovatele uvedený v záhlaví této Smlouvy. Cena se považuje za uhrazenou dnem odepsání fakturované částky z účtu Objednatele ve prospěch účtu Poskytovatele.\n\nObjednatel neposkytuje Poskytovateli jakékoliv zálohy na cenu za Služby / Plnění.\n\nSmluvní strany se dohodly, že pokud bude v okamžiku uskutečnění zdanitelného plnění správcem daně zveřejněna způsobem umožňujícím dálkový přístup skutečnost, že poskytovatel zdanitelného plnění (Poskytovatel) je nespolehlivým plátcem ve smyslu ust. § 106a Zákona o DPH nebo že úplata za toto plnění má být poskytnuta zcela nebo zčásti bezhotovostním převodem na jiný účet než účet Poskytovatele, který je správcem daně zveřejněn způsobem umožňujícím dálkový přístup ve smyslu ust. § 96 Zákona o DPH, je příjemce zdanitelného plnění (Objednatel) oprávněn část ceny odpovídající dani z přidané hodnoty zaplatit přímo na bankovní účet správce daně ve smyslu ust. § 109a Zákona o DPH. Na bankovní účet Poskytovatele bude v tomto případě uhrazena část ceny odpovídající výši základu daně z přidané hodnoty. Úhrada ceny plnění (základu daně) provedená Objednatelem v souladu s ustanovením tohoto odstavce bude považována za řádnou úhradu ceny plnění poskytnutého dle Smlouvy.\n\nDoba, místo a podmínky plnění\n\nPoskytovatel je povinen poskytnout Objednateli Plnění a předat Objednateli výstup/y nejdéle do pěti (5) kalendářních dnů ode dne doručení požadavku a/nebo pokynu ve smyslu čl. 1 odst. 1.13 Smlouvy, nedohodnou-li se Smluvní strany písemně (např. e-mailem) na jiném termínu poskytnutí Plnění, nebo nevyplývá-li jiný čas poskytnutí Plnění z platných právních předpisů nebo z požadavku či výzvy příslušného orgánu. \n\nPoskytovatel se zavazuje poskytovat Služby dle této Smlouvy na celém území [COUNTRY_1]. 
Místem předání veškerých výstupů dle této Smlouvy je [ADDRESS_3], nebude-li v konkrétním případě dohodnuto jinak. \n\nDalší práva a povinnosti Smluvních stran\n\nPoskytovatel je povinen postupovat při poskytování Služeb / Plnění s odbornou péčí podle svých nejlepších odborných znalostí a schopností, v souladu s právním řádem [COUNTRY_1] a se Smlouvou, přičemž je při své činnosti povinen sledovat a chránit zájmy a dobré jméno Objednatele a postupovat v souladu s jeho aktuálními potřebami a pokyny. V případě nevhodných pokynů Objednatele je Poskytovatel povinen na nevhodnost těchto pokynů Objednatele písemně upozornit, v opačném případě nese Poskytovatel zejména odpovědnost za vady a za škodu, které v důsledku nevhodných pokynů Objednatele Poskytovateli a/nebo třetím osobám vznikly.\n\nPoskytovatel je dále povinen bezodkladně oznámit Objednateli všechny okolnosti, o kterých se při poskytování Služeb / Plnění dozví, a které by mohly mít vliv na změnu pokynů Objednatele nebo na poskytování Služeb / Plnění dle této Smlouvy a Dílčí smlouvy. \n\nPoskytovatel je povinen informovat Objednatele na jeho žádost o průběhu plnění předmětu Smlouvy, resp. Dílčí smlouvy, a akceptovat jeho doplňující pokyny a připomínky k poskytovanému Plnění. V případě, že Objednatel zjistí v průběhu plnění předmětu Smlouvy, resp. Dílčí smlouvy, nedostatky, Poskytovatel je povinen na písemnou výzvu Objednatele tyto nedostatky odstranit bez nároku na navýšení ceny poskytovaného Plnění bezodkladně, nejdéle však do pěti (5) pracovních dnů ode dne obdržení výzvy.\n\nObjednatel poskytne Poskytovateli k plnění požadovaného Plnění:\n\nveškerou jemu dostupnou dokumentaci;\n\nb)\tpravdivé a včasné informace potřebné k řádnému poskytování Plnění;\n\nc)\tveškerou součinnost nezbytnou pro řádné poskytování Plnění.\n\nPoskytovatel je povinen řádně pečovat o věci a dokumenty, které od Objednatele k poskytnutí požadovaného Plnění obdrží. 
\n\nSmluvní strany se zavazují vzájemně se písemně informovat o případných změnách sídla, právní formy, změně bankovního spojení, zrušení registrace k DPH, a dalších významných skutečností rozhodných pro plnění ze Smlouvy, resp. Dílčí smlouvy, a to bezodkladně po uskutečnění takovéto změny. \n\nPoskytovatel je povinen neprodleně informovat Objednatele o kybernetických bezpečnostních incidentech (dále jen „KBI“) na straně Poskytovatele souvisejících s plněním dle Smlouvy a/nebo Dílčích smluv, které by mohly mít dopad na kybernetickou bezpečnost u Objednatele. KBI je definován v § 2 odst. 2 písm. f) zákona č. 264/2025 Sb., o kybernetické bezpečnosti. \n\nPoskytovatel poskytne Objednateli součinnost při zvládání KBI v souvislosti s poskytovaným plněním dle Smlouvy a/nebo Dílčích smluv, a bude se v této souvislosti řídit pokyny Objednatele.\n\nPoskytovatel prohlašuje, že si je vědom předpisů týkajících se mezinárodních sankcí, zejm. pak čl. 5 k nařízení Rady EU č. 833/2014 o omezujících opatřeních vzhledem k činnostem Ruska destabilizujícím situaci na Ukrajině, ve znění pozdějších předpisů a nařízení Rady EU č. 269/2014 o omezujících opatřeních vzhledem k činnostem narušujícím nebo ohrožujícím územní celistvost, svrchovanost a nezávislost Ukrajiny, ve znění pozdějších předpisů, vč. prováděcího nařízení Rady EU 2022/581 ze dne [DATE_1], ve znění pozdějších předpisů (dále jen „Předpisy o mezinárodních sankcích“). Poskytovatel prohlašuje, že u něho, jakož ani u okruhu sledovaných subjektů dle právních Předpisů o mezinárodních sankcích vztahujícího se k plnění Smlouvy a/nebo Dílčích smluv není dána překážka uzavření či plnění Smlouvy a/nebo Dílčích smluv. Dále výslovně Poskytovatel zvláště prohlašuje, že nezpřístupní žádné finanční prostředky ani hospodářské zdroje sankcionovaným subjektům ve smyslu tohoto odstavce Smlouvy. Pro vyloučení pochybností se stanoví, že: (i) prohlášení musí být v platnosti po celou dobu plnění Smlouvy, resp. 
Dílčích smluv, a (ii) jsou-li do tohoto prohlášení zahrnuti poddodavatelé či jiné třetí osoby, je Poskytovatel povinen zjistit skutečnosti vztahující se k těmto třetím osobám s řádnou péčí, přinejmenším ověřením informace u třetích osob a prověřením veřejných rejstříků a evidencí. Poskytovatel je povinen zajistit smluvně dodržování příslušných povinností a omezovat rizika vyplývajících z okolností vedoucích k mezinárodním sankcím, a zavazuje se zajistit, aby jeho prohlášení dle tohoto odstavce Smlouvy zůstala pravdivá a v platnosti po celou dobu účinnosti Smlouvy a/nebo Dílčích smluv. V případě, že Poskytovatel zjistí, že pravdivost jeho prohlášení je, byť jen ohrožena, je povinen o tom Objednatele bezodkladně písemně vyrozumět.\n\nSmluvní strany se dohodly, že pokud to bude potřebné ke splnění požadavků v oblasti kybernetické bezpečnosti stanovených obecně závaznými právními předpisy, zejména v návaznosti na nový zákon č. 264/2025 Sb., o kybernetické bezpečnosti, který nabyl účinnosti dne [DATE_2], a související prováděcí právní předpisy, uzavřou bez zbytečného odkladu po výzvě Objednatele písemný dodatek k této Smlouvě a/nebo Dílčí smlouvě zohledňující takové požadavky, a to formou úpravy či doplnění ustanovení týkajících se zajištění bezpečnostních požadavků v souladu s novou právní úpravou a implementovaným systémem řízení bezpečnosti informací na straně Objednatele a/nebo koncového zákazníka Objednatele. Náklady na bezpečnost informací v důsledku změny legislativy v oblasti bezpečnosti informací nese Poskytovatel.\n\nSchválení poskytnutého Plnění a převzetí výstupů\n\nPoskytovatel splní svou povinnost řádně poskytnout Plnění dnem, kdy je příslušná činnost řádně vykonána a její výstup v Objednatelem požadované formě (dále jen „výstup“) řádně předán Objednateli. 
Poskytovatel je povinen vypracovat písemnou zprávu, která bude obsahovat zejména údaje o Objednateli a Poskytovateli, číslo této Smlouvy a Dílčí smlouvy, obsah a rozsah poskytnutého Plnění, závěr z poskytnutého Plnění, popř. doporučení Poskytovatele pro další postup Objednatele. Výstup bude Objednateli Poskytovatelem předán v českém jazyce v dohodnutých termínech buď v listinné podobě vytištěné v jednom (1) originálu nebo v elektronické podobě ve formátu požadovaném Objednatelem. \n\nSplnění povinnosti Poskytovatele podle odstavce 6.1 Smlouvy Smluvní strany osvědčí sepsáním protokolu o schválení poskytnutého Plnění a předání a převzetí výstupu, obsahujícího soupis poskytnutého Plnění, včetně rozpisu hodin odpracovaných Poskytovatelem při plnění jednotlivých úkolů, a označení veškerých předávaných výstupů, který bude vyhotoven ve dvou (2) vyhotoveních s platností originálu a bude opatřen podpisem oprávněných osob obou Smluvních stran (dále jen „Akceptační protokol“), přičemž každá ze Smluvních stran obdrží po jednom (1) vyhotovení. Takto vyhotovený Akceptační protokol předá Poskytovatel Objednateli vždy do pěti (5) kalendářních dnů od skončení příslušného měsíce, za který se Akceptační protokol vyhotovuje. \n\nObjednatel je oprávněn odmítnout převzetí výstupu, a tedy podepsat Akceptační protokol s výrokem „Neakceptováno“, pokud Plnění nebylo poskytnuto řádně v souladu s touto Smlouvou a Dílčí smlouvou a/nebo ve sjednané kvalitě a/nebo pokud výstup neobsahoval veškeré údaje požadované Objednatelem a/nebo Objednatel nesouhlasí s počtem hodin poskytnutého Plnění, které budou Objednateli účtovány, přičemž v takovém případě Objednatel důvody odmítnutí převzetí výstupu písemně Poskytovateli sdělí, a to nejpozději do pěti (5) pracovních dnů od předání Akceptačního protokolu. Na následné předání výstupu se použijí výše uvedená ustanovení tohoto článku Smlouvy. 
Pokud Objednatel uplatní písemný nárok na odstranění vad výstupu, zavazuje se Poskytovatel tyto vady odstranit bez zbytečného odkladu, nejpozději však do pěti (5) pracovních dnů, nestanoví-li Objednatel jinak. \n\nV případě zjevných vad poskytnutého Plnění nebo jeho výstupů není Objednatel povinen Plnění schválit a výstupy převzít a do odstranění těchto vad není povinen podepsat Akceptační protokol s výrokem „Akceptováno“ a zaplatit fakturovanou cenu Plnění. \n\nPovinnost mlčenlivosti a zpracování osobních údajů\n\nSmluvní strany sjednávají, že za důvěrné informace považují takové informace, které získají od druhé Smluvní strany, a o kterých vzhledem k povaze takových informací mohly předpokládat, že na zachování jejich důvěrnosti má druhá Smluvní strana oprávněný zájem, nebo které nejsou v obchodních kruzích běžně dostupné, a o kterých vzhledem k povaze takových informací mohly předpokládat, že na zachování jejich důvěrnosti má druhá Smluvní strana oprávněný zájem, zejména pak informace, údaje a skutečnosti o jakýchkoliv obchodních, finančních, technických, právních a jiných skutečnostech, které by s ohledem na dané podmínky mohly být považovány za důvěrné, poskytnuté či jakkoliv zpřístupněné jednou ze Smluvních stran či jejími zástupci druhé Smluvní straně či jejím zástupcům, ať v ústní, písemné, grafické, elektronické či jiné formě, které se Smluvní strany dozvěděly v souvislosti se Smlouvou a/nebo Dílčí smlouvou, a to bez ohledu zda jsou nebo nejsou označené za důvěrné informace (dále jen „Důvěrné informace“). \n\nS těmito Důvěrnými informacemi budou nakládat jako s vlastním obchodním tajemstvím, aniž by bylo nutné takové informace jako Důvěrné vždy jednotlivě označovat. Výše uvedené nevylučuje možnost v jednotlivých případech při zvýšeném zájmu toto označení pro jednotlivé informace použít. 
Smluvní strany berou zároveň na vědomí, že některé z Důvěrných informací jsou také předmětem obchodního tajemství druhé Smluvní strany, chráněným dle příslušných ustanovení Občanského zákoníku. \n\nKaždá ze Smluvních stran se zavazuje vynaložit maximální úsilí, které lze spravedlivě požadovat, aby důvěrnost Důvěrných informací druhé Smluvní strany byla důsledně dodržována jejími pracovníky i osobami, které případně, v souladu s dohodou uzavřenou s druhou Smluvní stranou, k plnění účelu spolupráce použije. Použije-li některá ze Smluvních stran k plnění třetí osoby, je oprávněna zpřístupnit jí Důvěrné informace získané od druhé Smluvní strany pouze s jejím souhlasem a v rozsahu nezbytně nutném pro jí poskytované plnění, a je rovněž povinna zavázat třetí osobu povinností mlčenlivosti v rozsahu dle Smlouvy. Za porušení povinností třetí osobou odpovídá Smluvní strana, která jí Důvěrné informace zpřístupnila.\n\nSmluvní strany se dále zavazují:\n\nzachovat mlčenlivost o Důvěrných informací, a to až do doby, kdy se informace této povahy stanou obecně známými za předpokladu, že se tak nestane porušením povinnosti mlčenlivosti;\n\npoužít informace uvedené povahy pouze pro činnosti související s přípravou a plněním Smlouvy a/nebo Dílčí smlouvy, dále tyto informace nerozšiřovat ani nereprodukovat, nezpracovávat je v systémech umělé inteligence (systémech AI), nezpřístupnit je jiným osobám ani je nevyužít pro sebe či pro jinou osobu;\n\nomezit počet svých pracovníků pro styk s těmito Důvěrnými informacemi a přijmout účinná opatření pro zamezení jejich úniku, případně zabezpečit, aby i tyto osoby považovaly uvedené informace za Důvěrné a zachovávaly o nich mlčenlivost.\n\nPovinnost plnit ustanovení dle čl. 7 odst. 
7.1 až 7.4 této Smlouvy se nevztahuje na informace, které:\n\nje Smluvní strana povinna zveřejnit na základě zákonem stanovené povinnosti;\n\nmohou být zveřejněny bez porušení Smlouvy;\n\nbyly písemným souhlasem obou Smluvních stran zproštěny těchto omezení;\n\njsou známé nebo byly zveřejněny jinak, než následkem zanedbání povinnosti jedné ze Smluvních stran;\n\npříjemce je zná dříve, než je sdělí Smluvní strana;\n\njsou vyžádány soudem, státním zastupitelstvím nebo příslušným správním orgánem na základě zákona;\n\nSmluvní strana je sdělí osobě vázané zákonnou povinností mlčenlivosti (např. advokátovi nebo daňovému poradci) za účelem uplatňování svých práv;\n\nje Objednatel povinen sdělit svému zakladateli.\n\nV případě, že se kterákoliv Smluvní strana hodnověrným způsobem dozví, popřípadě bude mít důvodné podezření, že došlo k prozrazení či zpřístupnění Důvěrné informace neoprávněné osobě, je povinna neprodleně tuto skutečnost druhé Smluvní straně oznámit.\n\nPokud Smluvní strana, která poruší svůj závazek vyplývající z tohoto článku Smlouvy, takto způsobí druhé Smluvní straně škodu nebo ona či jiná třetí osoba získá na základě takové skutečnosti majetkový prospěch, má druhá Smluvní strana vůči porušující Smluvní straně nárok na náhradu veškeré jí vzniklé škody a na zaplacení částky odpovídající majetkovému prospěchu získanému v souvislosti s touto skutečností porušující Smluvní stranou či jinou třetí osobou. 
Nárok na náhradu případné škody není sjednáním ani zaplacením kterékoliv smluvní pokuty dle Smlouvy a/nebo Dílčí smlouvy dotčen.\n\nPovinnost ochrany Důvěrných informací trvá bez ohledu na ukončení platnosti a účinnosti Smlouvy a/nebo Dílčích smluv.\n\nPokud řádné poskytování Služeb vyžaduje zpracování osobních údajů zaměstnanců Objednatele, budou tyto osobní údaje zaměstnanců Objednatele v postavení kontaktních osob zpracovávány Poskytovatelem v rozsahu:\n\njméno, příjmení a titul,\n\ne-mailová adresa,\n\ntelefonní číslo.\n\nZpracováním osobních údajů ve smyslu tohoto odstavce Smlouvy se rozumí zejména jejich shromažďování, ukládání na nosiče informací, používání, třídění nebo kombinování, blokování a likvidace s využitím manuálních a automatizovaných prostředků v rozsahu nezbytném pro zajištění řádného poskytování Služeb.\n\nOsobní údaje budou zpracovány po dobu poskytování Služeb. Ukončením této Smlouvy / Dílčí smlouvy nezanikají povinnosti Poskytovatele týkající se bezpečnosti a ochrany osobních údajů až do okamžiku jejich úplné likvidace či předání jinému zpracovateli.\n\nSmluvní strany se dohodly, že Poskytovatel nemá nárok na náhradu nákladů spojených se zpracováním osobních údajů či s plněním povinností vyplývajících z příslušné právní úpravy.\n\nObjednatel je povinen přijmout vhodná opatření na to, aby poskytl subjektům údajů stručným, transparentním, srozumitelným a snadno přístupným způsobem za použití jasných a jednoduchých jazykových prostředků veškeré informace a učinil veškerá sdělení požadovaná Nařízením Evropského parlamentu a Rady (EU) č. 
2016/679 ze dne [DATE_3], obecného nařízení o ochraně osobních údajů (dále jen „Nařízení“) ve spojení se zákonem o zpracování osobních údajů.\n\n\n\nPoskytovatel je při plnění této povinnosti povinen:\n\nzpracovávat osobní údaje pouze na základě doložených pokynů Objednatele;\n\nzohledňovat povahu zpracování osobních údajů a být Objednateli nápomocen pro splnění Objednatelovy povinnosti reagovat na žádosti o výkon práv subjektu údajů, jakož i pro splnění dalších povinností ve smyslu Nařízení;\n\nzajistit, že jeho zaměstnanci budou zpracovávat osobní údaje pouze za podmínek a v rozsahu Poskytovatelem stanoveném;\n\nPoskytovatel je při plnění této povinnosti oprávněn v rozsahu nezbytném pro plnění předmětu Smlouvy / Dílčí smlouvy zapojit do zpracování i další případné zpracovatele, k čemuž mu Objednatel tímto uděluje povolení. \n\nSmluvní strany jsou při zpracování povinny:\n\nzavést technická, organizační, personální a jiná vhodná opatření ve smyslu Nařízení, aby zajistily a byly schopny kdykoliv doložit, že zpracování osobních údajů je prováděno v souladu s Nařízením a zákonem o zpracování osobních údajů tak, aby nemohlo dojít k neoprávněnému nebo nahodilému přístupu k osobním údajům a k datovým nosičům, které tyto údaje obsahují, k jejich změně, zničení či ztrátě, neoprávněným přenosům, k jejich jinému neoprávněnému zpracování, jakož i k jinému zneužití, a tato opatření podle potřeby průběžné revidovat a aktualizovat;\n\nvést a průběžné revidovat a aktualizovat záznamy o zpracování osobních údajů ve smyslu Nařízení;\n\nřádně a včas ohlašovat případná porušení zabezpečení osobních údajů Úřadu pro ochranu osobních údajů a spolupracovat s tímto úřadem v nezbytném rozsahu;\n\nnavzájem se informovat o všech okolnostech významných pro plnění dle tohoto článku Smlouvy;\n\nzachovávat mlčenlivost o osobních údajích a o bezpečnostních opatřeních, jejichž zveřejnění by ohrozilo zabezpečení osobních údajů, a to i po skončení této Smlouvy / Dílčí smlouvy;\n\npostupovat v 
souladu s dalšími požadavky Nařízení a zákona o zpracování osobních údajů, zejména dodržovat obecné zásady zpracování osobních údajů, plnit své informační povinnosti, nepředávat osobní údaje třetím osobám bez potřebného oprávnění, respektovat práva subjektů údajů a poskytovat v této souvislosti nezbytnou součinnost.\n\nOchrana autorských práv\n\nPodpisem Smlouvy Poskytovatel poskytuje Objednateli na dobu trvání majetkových práv autorských nevypověditelnou, převoditelnou, výhradní a územně neomezenou licenci k vytváření kopií, užívání a zpřístupnění dalším osobám všech výstupů a dále jakýchkoliv dokumentů, stanovisek, listin či návrhů vztahujících se k předmětu plnění Smlouvy / Dílčí smlouvy nebo vytvořených v souvislosti s ní Poskytovatelem či jeho poddodavateli, jež podle obecně závazných právních předpisů představují autorská díla nebo práva pořizovatele k jím pořízené databázi, včetně práva upravovat a )měnit takováto autorská díla nebo databáze.\n\nObjednatel není ve svých právech k užití výstupů a jakýchkoliv dokumentů, stanovisek, listin či návrhů vztahujících se k předmětu plnění Smlouvy / Dílčí smlouvy nebo vytvořených v souvislosti s ní Poskytovatelem či jeho poddodavateli, nijak omezen. Objednatel je oprávněn bez souhlasu Poskytovatele výstupy a jakékoliv dokumenty, stanoviska, listiny či návrhy vztahující se k předmětu plnění Smlouvy / Dílčí smlouvy nebo vytvořené v souvislosti s ní Poskytovatelem či jeho poddodavateli, nebo jejich části upravovat či doplňovat.\n\nPoskytovatel není oprávněn výstupy a jakékoliv dokumenty, stanoviska, listiny či návrhy vztahující se k plnění předmětu Smlouvy, resp. Dílčí smlouvy, nebo jejich části, jakkoliv rozšiřovat bez předchozího písemného souhlasu Objednatele. Přenechání výstupů a jakýchkoliv dokumentů, stanovisek, listin či návrhů vztahujících se k plnění předmětu Smlouvy, resp. 
Dílčí smlouvy, nebo jejich částí Poskytovatelem třetí osobě bez předchozího písemného souhlasu Objednatele se považuje za podstatné porušení Smlouvy.\n\nPoskytovatel odpovídá za to, že plnění předmětu Smlouvy / Dílčí smlouvy, nezasahuje a nebude zasahovat do práv jiných osob, zejména práv z průmyslového nebo jiného duševního vlastnictví, a to pro jakékoliv využití plnění v [COUNTRY_2] i v zahraničí.\n\nSmluvní strany tímto sjednávají, že veškerá finanční vyrovnání za poskytnutí licence dle tohoto článku 8 Smlouvy jsou zahrnuta v Ceně dle článku 2 Smlouvy.\n\nPoddodavatelé\n\nPoskytuje-li Poskytovatel Objednateli část plnění předmětu Smlouvy, resp. Dílčí smlouvy, prostřednictvím poddodavatele písemně schváleným ze strany Objednatele, je za veškerá taková plnění poddodavatele odpovědný Poskytovatel sám, jako kdyby tato plnění byla poskytována Poskytovatelem. \n\nPoskytovatel je povinen zajistit, aby všichni poddodavatelé měli platná příslušná oprávnění, odbornou kvalifikaci a dostatek odborných zkušeností, jež jsou nezbytné pro poskytování příslušných částí Služeb dle jejich smluv s Poskytovatelem. Žádná poddodavatelská smlouva nezakládá smluvní vztahy mezi Objednatelem a poddodavatelem. \n\nPoskytovatel je dále povinen smluvně zajistit, že i jeho poddodavatelé, kteří se budou podílet na plnění dle Smlouvy, resp. Dílčích smluv, se zaváží dodržovat v plném rozsahu ujednání mezi Poskytovatelem a Objednatelem a nebudou v rozporu s požadavky Objednatele uvedenými ve Smlouvě a/nebo Dílčích smlouvách.\n\nPokud Objednatel shledá, že plnění předmětu Smlouvy, resp. Dílčí smlouvy, uskutečněné poddodavatelem nedosahuje potřebných kvalit, je vadné nebo nevykazuje jiné náležitosti požadované Objednatelem, nebo že sám poddodavatel není subjektem kompetentním pro provádění plnění z této Smlouvy, resp. Dílčí smlouvy, je oprávněn požadovat, aby Poskytovatel neprodleně svěřil takto identifikovanou část plnění jinému poddodavateli, nebo se ujal této části plnění sám. 
\n\nZajištění plnění, která Poskytovatel svěří poddodavateli, není poddodavatel oprávněn zadat třetím osobám. Poskytovatel je povinen na tuto skutečnost poddodavatele upozornit před uzavřením poddodavatelské smlouvy a odpovídá za její dodržování. \n\nUzavření jakékoliv poddodavatelské smlouvy nebo uskutečnění jakéhokoliv smluvního plnění poddodavatelem bez předchozího písemného souhlasu Objednatele, případně jakákoliv změna v osobě poddodavatele bez předchozího písemného souhlasu Objednatele, budou považovány za podstatné porušení Smlouvy.\n\nCompliance ujednání\n\nSmluvní strany se zavazují dodržovat právní předpisy a chovat se tak, aby jejich jednání nemohlo vzbudit důvodné podezření ze spáchání nebo páchání trestného činu, a to ani takového, který by mohl být přičitatelný Objednateli podle zákona č. 418/2011 Sb., o trestní odpovědnosti právnických osob a řízení proti nim, ve znění pozdějších předpisů.\n\nSmluvní strany se zavazují, že učiní všechna opatření k tomu, aby se nedopustily ony a ani nikdo z jejich zaměstnanců či zástupců jakékoliv formy korupčního jednání, zejména jednání, které by mohlo být vnímáno jako přijetí úplatku, podplácení nebo nepřímé úplatkářství či jiný trestný čin spojený s korupcí dle zákona č. 40/2009 Sb., trestní zákoník, ve znění pozdějších předpisů.\n\nSmluvní strany se zavazují, že:\n\nneposkytnou, nenabídnou ani neslíbí úplatek jinému nebo pro jiného v souvislosti s obstaráváním věcí obecného zájmu anebo v souvislosti s podnikáním svým nebo jiného; \n\núplatek nepřijmou, ani si jej nedají slíbit, ať už pro sebe nebo pro jiného v souvislosti s obstaráním věcí obecného zájmu nebo v souvislosti s podnikáním svým nebo jiného. 
\n\nÚplatkem se přitom rozumí neoprávněná výhoda spočívající v přímém majetkovém obohacení nebo jiném zvýhodnění, které se dostává nebo má dostat uplácené osobě nebo s jejím souhlasem jiné osobě, a na kterou není nárok.\n\nSmluvní strany nebudou ani u svých obchodních partnerů tolerovat jakoukoliv formu korupce či uplácení.\n\nV případě, že je zahájeno trestní stíhání Poskytovatele, zavazuje se Poskytovatel o tomto bez zbytečného odkladu Objednatele písemně informovat.\n\nSankce\n\nV případě nedodržení termínu poskytnutí Plnění a/nebo předání výstupu a/nebo odstranění vad poskytnutého Plnění ve sjednané kvalitě podle této Smlouvy a Dílčí smlouvy ze strany Poskytovatele je Poskytovatel povinen uhradit Objednateli smluvní pokutu ve výši [MONETARY_AMOUNT_2]. \n\nZa každé jednotlivé porušení povinnosti mlčenlivosti podle příslušných ustanovení článku 7 Smlouvy, je Objednatel oprávněn požadovat od Poskytovatele zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_3]. \n\nZa každé jednotlivé porušení povinnosti Poskytovatele při zpracování osobních údajů podle příslušných ustanovení článku 7 Smlouvy, je Objednatel oprávněn požadovat od Poskytovatele zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_3]. \n\nZa každé jednotlivé porušení povinností Poskytovatele stanovených v čl. 5.7 až 5.9 této Smlouvy je Objednatel oprávněn požadovat po Poskytovateli zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_4] za každé jednotlivé porušení povinnosti.\n\nJestliže se jakékoli prohlášení Poskytovatele podle článku 8 Smlouvy ukáže nepravdivým nebo zavádějícím nebo Poskytovatel poruší jiné povinnosti podle článku 8 této Smlouvy, je Objednatel oprávněn požadovat od Poskytovatele zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_4] za každé jednotlivé porušení povinnosti.\n\nV případě prodlení Objednatele s úhradou řádně vystavených a doručených faktur, je Objednatel povinen uhradit Poskytovateli úrok z prodlení dle nařízení vlády č. 
351/2013 Sb., kterým se určuje výše úroků z prodlení a nákladů spojených s uplatněním pohledávky, určuje odměna likvidátora, likvidačního správce a člena orgánu právnické osoby jmenovaného soudem a upravují některé otázky Obchodního věstníku a veřejných rejstříků právnických a fyzických osob, evidence svěřenských fondů a evidence údajů o skutečných majitelích.\n\nVyúčtování smluvní pokuty / úroků z prodlení – penalizační faktura, musí být druhé Smluvní straně zasláno datovou zprávou prostřednictvím datové schránky. Smluvní pokuta a úroky z prodlení jsou splatné ve lhůtě třiceti (30) kalendářních dnů ode dne doručení penalizační faktury povinné Smluvní straně. Úhrada smluvní pokuty / úroků z prodlení se provádí bankovním převodem na účet oprávněné Smluvní strany uvedený v penalizační faktuře. Částka se považuje za zaplacenou okamžikem jejího připsání ve prospěch účtu oprávněné Smluvní strany.\n\nZaplacením smluvní pokuty není dotčen nárok Objednatele na náhradu újmy v celém rozsahu způsobené újmy, ani povinnost Poskytovatele řádně dokončit plnění předmětu Smlouvy, popř. odstranit vady.\n\nObjednatel je v případě uplatnění smluvní pokuty vůči Poskytovateli dle této Smlouvy a v případě neuhrazení smluvní pokuty ze strany Poskytovatele oprávněn využít institut započtení vzájemných pohledávek.\n\nDoba trvání Smlouvy\n\nTato Smlouva nabývá platnosti dnem jejího podpisu oběma Smluvními stranami a účinnosti dnem jejího uveřejnění v registru smluv v souladu se zákonem č. 340/2015 Sb., o registru smluv, ve znění pozdějších předpisů. Zveřejnění Smlouvy v registru smluv zajistí Objednatel. \n\nTato Smlouva se uzavírá na dobu určitou, a to na dobu 48 měsíců od nabytí účinnosti. \n\nSmlouva / Dílčí smlouva může být ukončena dohodou Smluvních stran v písemné formě, přičemž účinky zrušení Smlouvy / Dílčí smlouvy nastanou k okamžiku stanoveném v této dohodě. 
Nebude-li takovýto okamžik dohodou stanoven, pak tyto účinky nastanou ke dni uzavření takovéto dohody.\n\nSmluvní strany jsou oprávněny od Smlouvy a/nebo Dílčí smlouvy odstoupit v případě jejího podstatného porušení druhou Smluvní stranou, za podmínek uvedených v § 2001 a násl. Občanského zákoníku.\n\nZa podstatné porušení této Smlouvy a/nebo Dílčí Smlouvy Poskytovatelem, které zakládá právo Objednatele na odstoupení od této Smlouvy a/nebo Dílčí smlouvy, se považuje zejména:\n\nnedodržení právních předpisů Poskytovatelem při poskytování Služeb / Plnění;\n\nprodlení Poskytovatele s poskytováním Služeb / Plnění z důvodů spočívajících výlučně na straně Poskytovatele a/nebo předáním výstupu po dobu delší než deset (10) kalendářních dnů;\n\nporušení jakékoli povinnosti Poskytovatele dle článku 7 této Smlouvy;\n\nporušení jakékoli povinnosti Poskytovatele dle článku 8 této Smlouvy;\n\nporušení jakékoliv povinnosti Poskytovatele stanovené v čl. 5.7 až 5.9 této Smlouvy;\n\nuzavření jakékoliv poddodavatelské smlouvy nebo uskutečnění jakéhokoliv smluvního plnění poddodavatelem bez předchozího písemného souhlasu Objednatele, případně jakákoliv změna v osobě poddodavatele bez předchozího písemného souhlasu Objednatele, ve smyslu čl. 9 odst. 9.5 Smlouvy; \n\npostup Poskytovatele při poskytování Služeb / Plnění v rozporu s oprávněnými pokyny Objednatele.\n\nObjednatel je dále oprávněn od této Smlouvy a/nebo Dílčí Smlouvy odstoupit v případě, že \n\nPoskytovatel přestane splňovat požadavky na kvalifikaci uvedené ve výzvě k podání nabídky; \n\nvůči majetku Poskytovatele probíhá insolvenční řízení, v němž bylo vydáno rozhodnutí o úpadku, pokud to právní předpisy umožňují;\n\ninsolvenční návrh na Poskytovatele byl zamítnut proto, že majetek Poskytovatele nepostačuje k úhradě nákladů insolvenčního řízení;\n\nbyl Poskytovatel pravomocně odsouzen pro trestný čin. 
\n\nPoskytovatel je oprávněn odstoupit od této Smlouvy a/nebo Dílčí Smlouvy, pokud Objednatel bude přes písemné upozornění Poskytovatele déle než třicet (30) kalendářních dnů od písemného upozornění Poskytovatele v prodlení s plněním své platební povinnosti vůči Poskytovateli. \n\nÚčinky odstoupení nastávají uplynutím lhůty deseti (10) kalendářních dnů, která počíná běžet prvním dnem následujícím po doručení projevu vůle odstoupit od Smlouvy a/nebo Dílčí Smlouvy druhé Smluvní straně. Poskytovatel je v případě odstoupení od Smlouvy / Dílčí smlouvy povinen učinit již jen takové úkony, bez nichž by mohly být zájmy Objednatele vážně ohroženy.\n\nOdstoupení od Smlouvy a/nebo Dílčí Smlouvy se nedotýká zejména práva na náhradu újmy, smluvní pokuty a povinnosti mlčenlivosti, ani ujednání, které mají vzhledem ke své povaze zavazovat Smluvní strany i po ukončení Smlouvy / Dílčí smlouvy.\n\nSmluvní strany jsou oprávněny Smlouvu a/nebo Dílčí Smlouvu vypovědět, i bez uvedení důvodu, na základě písemné výpovědi. Výpovědní doba činí tři (3) měsíce a počíná běžet dnem doručení výpovědi druhé Smluvní straně.\n\nV případě jakéhokoliv skončení tohoto smluvního vztahu podle Smlouvy a/nebo Dílčí Smlouvy, je Poskytovatel vždy povinen neprodleně předat Objednateli veškeré věci a dokumenty, vztahující se k plnění této Smlouvy a/nebo Dílčí Smlouvy nebo poskytnuté za účelem plnění předmětu Smlouvy a/nebo Dílčí Smlouvy, nejpozději však do pěti (5) pracovních dnů ode dne ukončení smluvního vztahu. \n\nZávěrečná ustanovení\n\nSmluvní strany potvrzují, že si při uzavírání Smlouvy vzájemně sdělily všechny skutkové a právní okolnosti, o nichž ví nebo vědět musí, tak, aby se každá ze Smluvních stran mohla přesvědčit o možnosti uzavřít platnou Smlouvu a aby byl každé ze Smluvních stran zřejmý zájem druhé Smluvní strany Smlouvu uzavřít. \n\nSmluvní strany výslovně potvrzují, že si vzájemně sdělily veškeré okolnosti důležité pro uzavření Smlouvy. 
Smluvní strany prohlašují, že se dohodly o veškerých náležitostech Smlouvy.\n\nPoskytovatel prohlašuje a potvrzuje, že na sebe přebírá nebezpečí změny okolností ve smyslu ustanovení § 1765 odst. 2 Občanského zákoníku.\n\nSmluvní strany si ve smyslu ustanovení § 1794 odst. 2 Občanského zákoníku ujednaly, že se Poskytovatel výslovně vzdává jeho práva ve smyslu ustanovení § 1793 Občanského zákoníku a souhlasí s cenou tak, jak byla Smluvními stranami sjednána výše v této Smlouvě.\n\nSmluvní strany se zavazují vyvinout maximální úsilí k odstranění vzájemných sporů, vzniklých na základě této Smlouvy nebo v souvislosti s touto Smlouvou, a k jejich vyřešení zejména prostřednictvím jednání odpovědných pracovníků nebo jiných pověřených subjektů. Nedohodnou-li se na způsobu řešení vzájemného sporu, dohodly se Smluvní strany, že místně příslušným soudem pro řešení případných sporů bude soud příslušný dle místa sídla Objednatele.\n\nTato Smlouva může být měněna pouze vzestupně očíslovanými písemnými dodatky ke Smlouvě podepsanými oběma Smluvními stranami. \n\nDnem doručení písemností odeslaných na základě této Smlouvy nebo v souvislosti s touto Smlouvou prostřednictvím provozovatele poštovních služeb, pokud není prokázán jiný den doručení, se rozumí poslední den lhůty, ve které byla písemnost pro adresáta uložena u provozovatele poštovních služeb, a to i tehdy, jestliže se adresát o jejím uložení nedověděl. Smluvní strany tímto výslovně vylučují ust. § 573 Občanského zákoníku.\n\nPokud kterékoli ustanovení této Smlouvy nebo jeho část je nebo se stane neplatným či nevynutitelným, nebude mít tato neplatnost či nevynutitelnost vliv na platnost či vynutitelnost ostatních ustanovení této Smlouvy nebo jejích částí, pokud nevyplývá přímo z obsahu této Smlouvy, že toto ustanovení nebo jeho část nelze oddělit od dalšího obsahu. 
V takovém případě se obě Smluvní strany zavazují neúčinné a neplatné ustanovení nahradit novým ustanovením, které je svým účelem a významem co nejbližší ustanovení této Smlouvy, jež má být nahrazeno. \n\nPro případ, že tato Smlouva není uzavírána za přítomnosti všech Smluvních stran, platí, že Smlouva nebude uzavřena, pokud ji Poskytovatel podepíše s jakoukoliv změnou či odchylkou, byť nepodstatnou, nebo dodatkem, ledaže Objednatel takovou změnu či odchylku nebo dodatek následně schválí. To platí i v případě připojení obchodních podmínek Poskytovatele, které budou odporovat svým obsahem jakýmkoliv způsobem textu této Smlouvy. \n\nTato Smlouva je vyhotovena ve čtyřech (4) rovnocenných vyhotoveních, z nichž každé má platnost originálu. Každá ze Smluvních stran obdrží po dvou (2) stejnopisech. V případě, že bude Smlouva uzavírána elektronicky obdrží Poskytovatel/Objednatel elektronický dokument podepsaný v souladu s platnou právní úpravou.\n\nKontaktní údaje Smluvních stran pro doručování jsou následující:\n\nKontaktní osoba [PERSON_2]:\n\nxxx\n\ne-mail: xxx tel. xxx\n\nKontaktní osoba [PERSON_3]:\n\n xxx\n\n e-mail: xxx tel. xxx\n\n\n\n Nedílnou součástí této Smlouvy jsou následující přílohy:\n\nPříloha č. 1 – Cena\n\nPříloha č. 2 – Akceptační protokol\n\n\n\n\n\n\n\n\n\nSmluvní strany prohlašují, že tato Smlouva je projevem jejich pravé a svobodné vůle a nebyla sjednána v tísni ani za jinak jednostranně nevýhodných podmínek. Na důkaz toho připojují Smluvní strany své podpisy.\n\n\n\nV Praze dne: Dle elektronického podpisu\tV Praze dne: Dle elektronického podpisu\n\n\n\n\n\n\n\n\n\n\n\n__________________________________\n\n__________________________________\n\nxxx\n\n[PERSON_1]\n\nxxx\n\n[ORGANIZATION_1]\n\n\n\n\n\n\n\nV Praze dne: Dle elektronického podpisu\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n__________________________________\n\nxxx\n\nxxx\n\n[ORGANIZATION_1]\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPříloha č. 
1 – Cena\n\n\n\nHodinová sazba v Kč bez DPH za poskytování Služeb:\n\n\n\nHodinová sazba v Kč bez DPH\n\n2 190,- \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPříloha č. 2 – Akceptační protokol\n\n\n\nAkceptační protokol\n\n\n\n\n\nPlnění / výstup:\t……………………………………………………………………………………….\n\n\n\nPředáno dne: \n\n\n\n…………………………………………………………………………………………………………..\n\n\n\n\n\nPřevzal: \t\t\t\t\t\tPředal:\n\n\n\n…………………………………………… \t\t……………………………………………….\n\nObjednatel:\t\t\t\t\t\t[ORGANIZATION_3]:\n\nNárodní agentura pro komunikační a  \n\ninformační technologie, s. p. \n\nJméno, příjmení \t\t\t\t \n\nFunkce\t\t\t \n\n\t\n\n\n\nAkceptováno dne: datum akceptace\n\n\n\nVýsledek akceptace: \tAKCEPTOVÁNO / NEAKCEPTOVÁNO\n\n\n\nAkceptoval\n\nFunkce\n\nPodpis\n\n\n\n\n\n\n\n\n\n\n\n\n\nPřipomínky, výhrady:\n\n\n\npřipomínky a výhrady k předanému Plnění" + "redactedText": "RÁMCOVÁ DOHODA NA POSKYTOVÁNÍ PRÁVNÍCH SLUŽEB\n\nČíslo 2026/051 NAKIT\n\n\n\nSmluvní strany\n\n\n\n[ORGANIZATION_1]\n\nse sídlem \t[ADDRESS_1]\n\nIČO: \t[REGISTRATION_NUMBER_1] \n\nDIČ: \t [TAX_IDENTIFICATION_NUMBER_1]\n\nzastoupen: \txxx\n\nzapsán v obchodním rejstříku vedeném [ORGANIZATION_2], [REGISTRATION_NUMBER_2]\n\nbankovní spojení \txxx\n\n\tč. ú. xxx\n\n(dále jen „Objednatel“)\n\n\n\na\n\n[PERSON_1]\n\nse sídlem [ADDRESS_2]\n\nIČO: [REGISTRATION_NUMBER_3]\n\nDIČ: [TAX_IDENTIFICATION_NUMBER_2] \n\nbankovní spojení xxx\n\n č. ú. xxx\n\n \n\n (dále jen „Poskytovatel“)\n\n\n\n(Objednatel a Poskytovatel budou v této rámcové dohodě na poskytování právních služeb označováni jednotlivě jako „Smluvní strana“ a společně jako „Smluvní strany“ a tato rámcová dohoda jako „Smlouva“),\n\n\n\nuzavírají v souladu s ustanovením § 1746 odst. 2 zákona č. 89/2012 Sb., občanský zákoník, v platném znění (dále jen „Občanský zákoník“) a v souladu s ustanovením § 29 písm. k) bod 1. a 2. zákona č. 
134/2016 Sb., o zadávání veřejných zakázek, ve znění pozdějších předpisů (dále jen „Zákon o zadávání veřejných zakázek“), jakož i v souladu se zákonem č. 85/1996 Sb., o advokacii, ve znění pozdějších předpisů (dále jen „Zákon o advokacii“) tuto Smlouvu. \n\n\n\n\n\n\n\n\n\nÚčel a předmět Smlouvy\n\nÚčelem této Smlouvy je stanovení podmínek a právního rámce pro uzavírání Dílčích smluv (jak je tento pojem definován níže v odst. 1.8 Smlouvy) mezi Objednatelem a Poskytovatelem na poskytování právních služeb, a to na základě písemných Objednávek Objednatele.\n\nPředmětem této Smlouvy je stanovení práv a povinností Smluvních stran pro postup při uzavírání Dílčích smluv a následném poskytování právních služeb Poskytovatelem Objednateli, přičemž poskytováním právních služeb se pro účely této Smlouvy rozumí poskytování právních služeb ve smyslu § 29 odst. 1 písm. k) bod 1. a 2. Zákona o zadávání veřejných zakázek (dále jen „Služby“).\n\nPoskytovatel se zavazuje poskytnout Objednateli Služby na základě Dílčí smlouvy. Služby poskytované Poskytovatelem Objednateli na základě konkrétní Dílčí smlouvy budou dále nazývány jako „Plnění“. Dílčí smlouvy budou uzavírány níže uvedeným postupem, na základě písemné Objednávky Objednatele doručené Poskytovateli (dále jen „Objednávka“). Objednávka musí obsahovat minimálně tyto náležitosti:\n\nidentifikační údaje Poskytovatele a Objednatele;\n\nčíslo a datum vystavení Objednávky;\n\nčíslo Smlouvy;\n\nrámcové vymezení Plnění;\n\nmaximální rozsah a maximální cenu Plnění; a\n\npodpis oprávněné osoby Objednatele.\n\nObjednatel je oprávněn, avšak nikoli povinen, vystavovat dle svého uvážení Objednávky ode dne nabytí účinnosti této Smlouvy. Každá takto vystavená Objednávka se považuje za návrh na uzavření Dílčí smlouvy za podmínek stanovených touto Smlouvou. 
Poskytovatel je povinen písemně potvrdit Objednávku ve lhůtě dvou (2) pracovních dnů od jejího doručení Poskytovateli.\n\nPotvrzení Objednávky musí obsahovat minimálně tyto náležitosti: \n\nidentifikační údaje Objednatele a Poskytovatele; \n\nčíslo Objednávky, která je potvrzována; a \n\npodpis oprávněné osoby Poskytovatele.\n\nV případě, že Objednávka nebude splňovat uvedené minimální náležitosti, má Poskytovatel povinnost na tuto skutečnost neprodleně upozornit Objednatele. Objednatel je poté povinen vystavit novou Objednávku a Poskytovatel je povinen ji ve lhůtě dvou (2) pracovních dnů od jejího doručení písemně potvrdit. Není-li v článku 4 Smlouvy stanoveno jinak, běží lhůta pro poskytnutí Plnění dle příslušné Dílčí smlouvy od okamžiku doručení této nové Objednávky. \n\nPotvrzení Objednávky, které obsahuje dodatky, výhrady, omezení nebo jiné změny se považuje za odmítnutí Objednávky a tvoří nový návrh Poskytovatele na uzavření Dílčí smlouvy, a to i v případě takového dodatku, výhrady, omezení nebo jiné změny, které podstatně nemění podmínky Objednávky. Dílčí smlouva je v takovém případě uzavřena pouze tehdy, pokud tento nový návrh Objednatel písemně potvrdí a doručí zpět Poskytovateli. \n\nDoručením potvrzení Objednávky Objednateli dojde k uzavření smlouvy o poskytnutí služeb, přičemž práva a povinnosti Smluvních stran dle této smlouvy o poskytnutí služeb odpovídají v celém rozsahu právům a povinnostem Objednatele a Poskytovatele stanovených touto Smlouvou (dále jen „Dílčí smlouva“).\n\nPočet Objednávek vystavených Objednatelem není omezený. Současně platí, že Objednatel není povinen Objednávku vystavit.\n\nPoskytovatel se zavazuje poskytnout Objednateli Plnění za podmínek uvedených v této Smlouvě a v Dílčí smlouvě ve sjednaném rozsahu, jakosti a čase. 
\n\nObjednatel se zavazuje zaplatit za Plnění poskytnuté v souladu s touto Smlouvou a Dílčí smlouvou Cenu dle článku 2 této Smlouvy.\n\nObjednatel při uzavírání této Smlouvy negarantuje žádný minimální objem plnění, který bude zadán v průběhu její platnosti. Objednatel uzpůsobuje rozsah poptávaného plnění svým aktuálním potřebám, které jsou v čase proměnlivé. Poskytovatel se přes výše uvedené zavazuje být připraven poskytnout plnění v rozsahu poptávaném Objednatelem dle podmínek této Smlouvy. \n\nSmluvní strany sjednávají, že k poskytnutí konkrétního Plnění (resp. jeho relevantní části) na základě Dílčí smlouvy je Poskytovatel povinen na základě, v rozsahu a v souladu s požadavky a/nebo pokyny Objednatele, které budou činěny prostřednictvím e-mailové komunikace kontaktní osobou Objednatele uvedenou v čl. 13 odst. 13.11 písm. a) Smlouvy nebo jí pověřenou osobou. V e-mailu podle přechozí věty Objednatel uvede specifikaci konkrétního požadavku (včetně případného požadavku na výstup) a/nebo pokynu. Hovoří-li se v této Smlouvě o Plnění, rozumí se jím i jeho relevantní část, poskytnutá Objednateli na základě konkrétního požadavku a/nebo pokynu dle tohoto odstavce Smlouvy.\n\nKaždá Dílčí smlouva nabývá platnosti dnem uzavření. Dílčí smlouva nabývá účinnosti dnem uzavření, nevztahuje-li se na ni povinnost zveřejnění v registru smluv podle zákona č. 340/2015 Sb., o zvláštních podmínkách účinnosti některých smluv, uveřejňování těchto smluv a o registru smluv (zákon o registru smluv) ve znění pozdějších předpisů (dále jen „Zákon o registru smluv“). Vztahuje-li se na příslušnou Dílčí smlouvu povinnost jejího zveřejnění v registru smluv, nabývá Dílčí smlouva účinnosti dnem zveřejnění v registru smluv, přičemž zveřejnění Dílčí smlouvy v registru smluv zajistí Objednatel. 
V Dílčí smlouvě může být výslovně uvedeno pozdější datum nabytí účinnosti než dnem jejího uzavření/zveřejnění v registru smluv (dle relevance).\n\nCena\n\nCena za poskytnutí Plnění Poskytovatelem odpovídá součinu skutečného časového rozsahu poskytnutého Plnění a hodinové sazby dle Přílohy č. 1 této Smlouvy na základě konkrétní Dílčí smlouvy (dále jen „Cena“). Nejnižší časová jednotka odpracovaného času, za kterou náleží Poskytovateli odměna za poskytnuté Plnění, je jedna (1) hodina.\n\nSkutečný časový rozsah Plnění je limitován odhadovaným maximálním časovým rozsahem Plnění uvedeným v Dílčí smlouvě. Skutečný časový rozsah Plnění bude Poskytovatelem Objednateli dokladován v rámci akceptační procedury dle článku 6 Smlouvy, jejíž průběh bude stvrzen Smluvními stranami podpisem Akceptačního protokolu, jehož vzor tvoří Přílohu č. 2 této Smlouvy a je její nedílnou součástí. \n\nObjednatel si vyhrazuje právo uznat v rámci fakturace pouze takový časový rozsah Plnění, který byl na poskytování Plnění účelně vynaložen. \n\nCena každé jednotlivé složky Plnění zahrnuje veškeré náklady Poskytovatele spojené s plněním Smlouvy, Dílčí smlouvy a poskytnutím Plnění Objednateli, vyjma pravomocně přiznané odměny za zastupování v soudním řízení, která připadá Poskytovateli. Tato Cena je cenou konečnou.\n\nCelková cena Plnění poskytnutého na základě této Smlouvy a Dílčích smluv nesmí převýšit částku [MONETARY_AMOUNT_1] bez DPH. DPH bude připočítána k ceně v souladu s platnými právními předpisy ke dni uskutečnění zdanitelného plnění.\n\nPlatební podmínky\n\nDaňové doklady za poskytování Plnění budou Poskytovatelem vystavovány vždy k poslednímu dni příslušného kalendářního měsíce, ve kterém bylo Plnění poskytováno, a bude v nich vyúčtováno Plnění poskytnuté Objednateli bez jakýchkoli vad v příslušném kalendářním měsíci. 
Za den uskutečnění zdanitelného plnění se považuje den podpisu Akceptačního protokolu Objednatelem.\n\nDaňový doklad (faktura) musí obsahovat náležitosti řádného daňového dokladu podle příslušných právních předpisů, zejména podle § 29 zákona č. 235/2004 Sb., o dani z přidané hodnoty, ve znění pozdějších předpisů (dále jen „Zákon o DPH“), dle zákona č. 563/1991 Sb., o účetnictví, ve znění pozdějších předpisů, dle § 435 Občanského zákoníku a níže uvedené údaje:\n\nčíslo Smlouvy a Dílčí smlouvy (Objednávky),\n\nplatební podmínky v souladu se Smlouvou a Dílčí smlouvou,\n\nmísto a datum předání a převzetí Plnění,\n\npopis fakturovaného Plnění, rozsah, jednotkovou a celkovou cenu,\n\npřílohou je kopie Akceptačního protokolu s výrokem „Akceptováno“, odsouhlaseného a potvrzeného Objednatelem.\n\nSplatnost daňového dokladu (faktury) vystaveného Poskytovatelem je třicet (30) kalendářních dní ode dne jeho doručení Objednateli. \n\nPoskytovatel zašle daňový doklad spolu s veškerými požadovanými dokumenty Objednateli nejpozději do pěti (5) kalendářních dnů od podpisu Akceptačního protokolu, jedním z následujících způsobů: \n\nbuď v elektronické podobě na adresu:\n\nxxx\n\nnebo doporučeným dopisem na následující adresu: \n\n[ORGANIZATION_1]\n\n[ADDRESS_1].\n\nV případě, že faktura nebude obsahovat stanovené náležitosti, přílohy nebo nebude vystavena v souladu s touto Smlouvou, je Objednatel oprávněn vrátit ji ve lhůtě splatnosti Poskytovateli k doplnění či opravě, aniž se tím dostane do prodlení. Lhůta splatnosti v délce třicet (30) kalendářních dní počíná běžet znovu ode dne doručení náležitě doplněné či opravené faktury Objednateli.\n\nPlatba bude provedena v české měně formou bankovního převodu na účet Poskytovatele uvedený v záhlaví této Smlouvy. 
Cena se považuje za uhrazenou dnem odepsání fakturované částky z účtu Objednatele ve prospěch účtu Poskytovatele.\n\nObjednatel neposkytuje Poskytovateli jakékoliv zálohy na cenu za Služby / Plnění.\n\nSmluvní strany se dohodly, že pokud bude v okamžiku uskutečnění zdanitelného plnění správcem daně zveřejněna způsobem umožňujícím dálkový přístup skutečnost, že poskytovatel zdanitelného plnění (Poskytovatel) je nespolehlivým plátcem ve smyslu ust. § 106a Zákona o DPH nebo že úplata za toto plnění má být poskytnuta zcela nebo zčásti bezhotovostním převodem na jiný účet než účet Poskytovatele, který je správcem daně zveřejněn způsobem umožňujícím dálkový přístup ve smyslu ust. § 96 Zákona o DPH, je příjemce zdanitelného plnění (Objednatel) oprávněn část ceny odpovídající dani z přidané hodnoty zaplatit přímo na bankovní účet správce daně ve smyslu ust. § 109a Zákona o DPH. Na bankovní účet Poskytovatele bude v tomto případě uhrazena část ceny odpovídající výši základu daně z přidané hodnoty. Úhrada ceny plnění (základu daně) provedená Objednatelem v souladu s ustanovením tohoto odstavce bude považována za řádnou úhradu ceny plnění poskytnutého dle Smlouvy.\n\nDoba, místo a podmínky plnění\n\nPoskytovatel je povinen poskytnout Objednateli Plnění a předat Objednateli výstup/y nejdéle do pěti (5) kalendářních dnů ode dne doručení požadavku a/nebo pokynu ve smyslu čl. 1 odst. 1.13 Smlouvy, nedohodnou-li se Smluvní strany písemně (např. e-mailem) na jiném termínu poskytnutí Plnění, nebo nevyplývá-li jiný čas poskytnutí Plnění z platných právních předpisů nebo z požadavku či výzvy příslušného orgánu. \n\nPoskytovatel se zavazuje poskytovat Služby dle této Smlouvy na celém území [COUNTRY_1]. Místem předání veškerých výstupů dle této Smlouvy je [ADDRESS_3]. 
\n\nDalší práva a povinnosti Smluvních stran\n\nPoskytovatel je povinen postupovat při poskytování Služeb / Plnění s odbornou péčí podle svých nejlepších odborných znalostí a schopností, v souladu s právním řádem [COUNTRY_1] a se Smlouvou, přičemž je při své činnosti povinen sledovat a chránit zájmy a dobré jméno Objednatele a postupovat v souladu s jeho aktuálními potřebami a pokyny. V případě nevhodných pokynů Objednatele je Poskytovatel povinen na nevhodnost těchto pokynů Objednatele písemně upozornit, v opačném případě nese Poskytovatel zejména odpovědnost za vady a za škodu, které v důsledku nevhodných pokynů Objednatele Poskytovateli a/nebo třetím osobám vznikly.\n\nPoskytovatel je dále povinen bezodkladně oznámit Objednateli všechny okolnosti, o kterých se při poskytování Služeb / Plnění dozví, a které by mohly mít vliv na změnu pokynů Objednatele nebo na poskytování Služeb / Plnění dle této Smlouvy a Dílčí smlouvy. \n\nPoskytovatel je povinen informovat Objednatele na jeho žádost o průběhu plnění předmětu Smlouvy, resp. Dílčí smlouvy, a akceptovat jeho doplňující pokyny a připomínky k poskytovanému Plnění. V případě, že Objednatel zjistí v průběhu plnění předmětu Smlouvy, resp. Dílčí smlouvy, nedostatky, Poskytovatel je povinen na písemnou výzvu Objednatele tyto nedostatky odstranit bez nároku na navýšení ceny poskytovaného Plnění bezodkladně, nejdéle však do pěti (5) pracovních dnů ode dne obdržení výzvy.\n\nObjednatel poskytne Poskytovateli k plnění požadovaného Plnění:\n\nveškerou jemu dostupnou dokumentaci;\n\nb)\tpravdivé a včasné informace potřebné k řádnému poskytování Plnění;\n\nc)\tveškerou součinnost nezbytnou pro řádné poskytování Plnění.\n\nPoskytovatel je povinen řádně pečovat o věci a dokumenty, které od Objednatele k poskytnutí požadovaného Plnění obdrží. 
\n\nSmluvní strany se zavazují vzájemně se písemně informovat o případných změnách sídla, právní formy, změně bankovního spojení, zrušení registrace k DPH, a dalších významných skutečností rozhodných pro plnění ze Smlouvy, resp. Dílčí smlouvy, a to bezodkladně po uskutečnění takovéto změny. \n\nPoskytovatel je povinen neprodleně informovat Objednatele o kybernetických bezpečnostních incidentech (dále jen „KBI“) na straně Poskytovatele souvisejících s plněním dle Smlouvy a/nebo Dílčích smluv, které by mohly mít dopad na kybernetickou bezpečnost u Objednatele. KBI je definován v § 2 odst. 2 písm. f) zákona č. 264/2025 Sb., o kybernetické bezpečnosti. \n\nPoskytovatel poskytne Objednateli součinnost při zvládání KBI v souvislosti s poskytovaným plněním dle Smlouvy a/nebo Dílčích smluv, a bude se v této souvislosti řídit pokyny Objednatele.\n\nPoskytovatel prohlašuje, že si je vědom předpisů týkajících se mezinárodních sankcí, zejm. pak čl. 5 k nařízení Rady EU č. 833/2014 o omezujících opatřeních vzhledem k činnostem Ruska destabilizujícím situaci na Ukrajině, ve znění pozdějších předpisů a nařízení Rady EU č. 269/2014 o omezujících opatřeních vzhledem k činnostem narušujícím nebo ohrožujícím územní celistvost, svrchovanost a nezávislost Ukrajiny, ve znění pozdějších předpisů, vč. prováděcího nařízení Rady EU 2022/581 ze dne [DATE_1], ve znění pozdějších předpisů (dále jen „Předpisy o mezinárodních sankcích“). Poskytovatel prohlašuje, že u něho, jakož ani u okruhu sledovaných subjektů dle právních Předpisů o mezinárodních sankcích vztahujícího se k plnění Smlouvy a/nebo Dílčích smluv není dána překážka uzavření či plnění Smlouvy a/nebo Dílčích smluv. Dále výslovně Poskytovatel zvláště prohlašuje, že nezpřístupní žádné finanční prostředky ani hospodářské zdroje sankcionovaným subjektům ve smyslu tohoto odstavce Smlouvy. Pro vyloučení pochybností se stanoví, že: (i) prohlášení musí být v platnosti po celou dobu plnění Smlouvy, resp. 
Dílčích smluv, a (ii) jsou-li do tohoto prohlášení zahrnuti poddodavatelé či jiné třetí osoby, je Poskytovatel povinen zjistit skutečnosti vztahující se k těmto třetím osobám s řádnou péčí, přinejmenším ověřením informace u třetích osob a prověřením veřejných rejstříků a evidencí. Poskytovatel je povinen zajistit smluvně dodržování příslušných povinností a omezovat rizika vyplývajících z okolností vedoucích k mezinárodním sankcím, a zavazuje se zajistit, aby jeho prohlášení dle tohoto odstavce Smlouvy zůstala pravdivá a v platnosti po celou dobu účinnosti Smlouvy a/nebo Dílčích smluv. V případě, že Poskytovatel zjistí, že pravdivost jeho prohlášení je, byť jen ohrožena, je povinen o tom Objednatele bezodkladně písemně vyrozumět.\n\nSmluvní strany se dohodly, že pokud to bude potřebné ke splnění požadavků v oblasti kybernetické bezpečnosti stanovených obecně závaznými právními předpisy, zejména v návaznosti na nový zákon č. 264/2025 Sb., o kybernetické bezpečnosti, který nabyl účinnosti dne [DATE_2], a související prováděcí právní předpisy, uzavřou bez zbytečného odkladu po výzvě Objednatele písemný dodatek k této Smlouvě a/nebo Dílčí smlouvě zohledňující takové požadavky, a to formou úpravy či doplnění ustanovení týkajících se zajištění bezpečnostních požadavků v souladu s novou právní úpravou a implementovaným systémem řízení bezpečnosti informací na straně Objednatele a/nebo koncového zákazníka Objednatele. Náklady na bezpečnost informací v důsledku změny legislativy v oblasti bezpečnosti informací nese Poskytovatel.\n\nSchválení poskytnutého Plnění a převzetí výstupů\n\nPoskytovatel splní svou povinnost řádně poskytnout Plnění dnem, kdy je příslušná činnost řádně vykonána a její výstup v Objednatelem požadované formě (dále jen „výstup“) řádně předán Objednateli. 
Poskytovatel je povinen vypracovat písemnou zprávu, která bude obsahovat zejména údaje o Objednateli a Poskytovateli, číslo této Smlouvy a Dílčí smlouvy, obsah a rozsah poskytnutého Plnění, závěr z poskytnutého Plnění, popř. doporučení Poskytovatele pro další postup Objednatele. Výstup bude Objednateli Poskytovatelem předán v českém jazyce v dohodnutých termínech buď v listinné podobě vytištěné v jednom (1) originálu nebo v elektronické podobě ve formátu požadovaném Objednatelem. \n\nSplnění povinnosti Poskytovatele podle odstavce 6.1 Smlouvy Smluvní strany osvědčí sepsáním protokolu o schválení poskytnutého Plnění a předání a převzetí výstupu, obsahujícího soupis poskytnutého Plnění, včetně rozpisu hodin odpracovaných Poskytovatelem při plnění jednotlivých úkolů, a označení veškerých předávaných výstupů, který bude vyhotoven ve dvou (2) vyhotoveních s platností originálu a bude opatřen podpisem oprávněných osob obou Smluvních stran (dále jen „Akceptační protokol“), přičemž každá ze Smluvních stran obdrží po jednom (1) vyhotovení. Takto vyhotovený Akceptační protokol předá Poskytovatel Objednateli vždy do pěti (5) kalendářních dnů od skončení příslušného měsíce, za který se Akceptační protokol vyhotovuje. \n\nObjednatel je oprávněn odmítnout převzetí výstupu, a tedy podepsat Akceptační protokol s výrokem „Neakceptováno“, pokud Plnění nebylo poskytnuto řádně v souladu s touto Smlouvou a Dílčí smlouvou a/nebo ve sjednané kvalitě a/nebo pokud výstup neobsahoval veškeré údaje požadované Objednatelem a/nebo Objednatel nesouhlasí s počtem hodin poskytnutého Plnění, které budou Objednateli účtovány, přičemž v takovém případě Objednatel důvody odmítnutí převzetí výstupu písemně Poskytovateli sdělí, a to nejpozději do pěti (5) pracovních dnů od předání Akceptačního protokolu. Na následné předání výstupu se použijí výše uvedená ustanovení tohoto článku Smlouvy. 
Pokud Objednatel uplatní písemný nárok na odstranění vad výstupu, zavazuje se Poskytovatel tyto vady odstranit bez zbytečného odkladu, nejpozději však do pěti (5) pracovních dnů, nestanoví-li Objednatel jinak. \n\nV případě zjevných vad poskytnutého Plnění nebo jeho výstupů není Objednatel povinen Plnění schválit a výstupy převzít a do odstranění těchto vad není povinen podepsat Akceptační protokol s výrokem „Akceptováno“ a zaplatit fakturovanou cenu Plnění. \n\nPovinnost mlčenlivosti a zpracování osobních údajů\n\nSmluvní strany sjednávají, že za důvěrné informace považují takové informace, které získají od druhé Smluvní strany, a o kterých vzhledem k povaze takových informací mohly předpokládat, že na zachování jejich důvěrnosti má druhá Smluvní strana oprávněný zájem, nebo které nejsou v obchodních kruzích běžně dostupné, a o kterých vzhledem k povaze takových informací mohly předpokládat, že na zachování jejich důvěrnosti má druhá Smluvní strana oprávněný zájem, zejména pak informace, údaje a skutečnosti o jakýchkoliv obchodních, finančních, technických, právních a jiných skutečnostech, které by s ohledem na dané podmínky mohly být považovány za důvěrné, poskytnuté či jakkoliv zpřístupněné jednou ze Smluvních stran či jejími zástupci druhé Smluvní straně či jejím zástupcům, ať v ústní, písemné, grafické, elektronické či jiné formě, které se Smluvní strany dozvěděly v souvislosti se Smlouvou a/nebo Dílčí smlouvou, a to bez ohledu zda jsou nebo nejsou označené za důvěrné informace (dále jen „Důvěrné informace“). \n\nS těmito Důvěrnými informacemi budou nakládat jako s vlastním obchodním tajemstvím, aniž by bylo nutné takové informace jako Důvěrné vždy jednotlivě označovat. Výše uvedené nevylučuje možnost v jednotlivých případech při zvýšeném zájmu toto označení pro jednotlivé informace použít. 
Smluvní strany berou zároveň na vědomí, že některé z Důvěrných informací jsou také předmětem obchodního tajemství druhé Smluvní strany, chráněným dle příslušných ustanovení Občanského zákoníku. \n\nKaždá ze Smluvních stran se zavazuje vynaložit maximální úsilí, které lze spravedlivě požadovat, aby důvěrnost Důvěrných informací druhé Smluvní strany byla důsledně dodržována jejími pracovníky i osobami, které případně, v souladu s dohodou uzavřenou s druhou Smluvní stranou, k plnění účelu spolupráce použije. Použije-li některá ze Smluvních stran k plnění třetí osoby, je oprávněna zpřístupnit jí Důvěrné informace získané od druhé Smluvní strany pouze s jejím souhlasem a v rozsahu nezbytně nutném pro jí poskytované plnění, a je rovněž povinna zavázat třetí osobu povinností mlčenlivosti v rozsahu dle Smlouvy. Za porušení povinností třetí osobou odpovídá Smluvní strana, která jí Důvěrné informace zpřístupnila.\n\nSmluvní strany se dále zavazují:\n\nzachovat mlčenlivost o Důvěrných informací, a to až do doby, kdy se informace této povahy stanou obecně známými za předpokladu, že se tak nestane porušením povinnosti mlčenlivosti;\n\npoužít informace uvedené povahy pouze pro činnosti související s přípravou a plněním Smlouvy a/nebo Dílčí smlouvy, dále tyto informace nerozšiřovat ani nereprodukovat, nezpracovávat je v systémech umělé inteligence (systémech AI), nezpřístupnit je jiným osobám ani je nevyužít pro sebe či pro jinou osobu;\n\nomezit počet svých pracovníků pro styk s těmito Důvěrnými informacemi a přijmout účinná opatření pro zamezení jejich úniku, případně zabezpečit, aby i tyto osoby považovaly uvedené informace za Důvěrné a zachovávaly o nich mlčenlivost.\n\nPovinnost plnit ustanovení dle čl. 7 odst. 
7.1 až 7.4 této Smlouvy se nevztahuje na informace, které:\n\nje Smluvní strana povinna zveřejnit na základě zákonem stanovené povinnosti;\n\nmohou být zveřejněny bez porušení Smlouvy;\n\nbyly písemným souhlasem obou Smluvních stran zproštěny těchto omezení;\n\njsou známé nebo byly zveřejněny jinak, než následkem zanedbání povinnosti jedné ze Smluvních stran;\n\npříjemce je zná dříve, než je sdělí Smluvní strana;\n\njsou vyžádány soudem, státním zastupitelstvím nebo příslušným správním orgánem na základě zákona;\n\nSmluvní strana je sdělí osobě vázané zákonnou povinností mlčenlivosti (např. advokátovi nebo daňovému poradci) za účelem uplatňování svých práv;\n\nje Objednatel povinen sdělit svému zakladateli.\n\nV případě, že se kterákoliv Smluvní strana hodnověrným způsobem dozví, popřípadě bude mít důvodné podezření, že došlo k prozrazení či zpřístupnění Důvěrné informace neoprávněné osobě, je povinna neprodleně tuto skutečnost druhé Smluvní straně oznámit.\n\nPokud Smluvní strana, která poruší svůj závazek vyplývající z tohoto článku Smlouvy, takto způsobí druhé Smluvní straně škodu nebo ona či jiná třetí osoba získá na základě takové skutečnosti majetkový prospěch, má druhá Smluvní strana vůči porušující Smluvní straně nárok na náhradu veškeré jí vzniklé škody a na zaplacení částky odpovídající majetkovému prospěchu získanému v souvislosti s touto skutečností porušující Smluvní stranou či jinou třetí osobou. 
Nárok na náhradu případné škody není sjednáním ani zaplacením kterékoliv smluvní pokuty dle Smlouvy a/nebo Dílčí smlouvy dotčen.\n\nPovinnost ochrany Důvěrných informací trvá bez ohledu na ukončení platnosti a účinnosti Smlouvy a/nebo Dílčích smluv.\n\nPokud řádné poskytování Služeb vyžaduje zpracování osobních údajů zaměstnanců Objednatele, budou tyto osobní údaje zaměstnanců Objednatele v postavení kontaktních osob zpracovávány Poskytovatelem v rozsahu:\n\njméno, příjmení a titul,\n\ne-mailová adresa,\n\ntelefonní číslo.\n\nZpracováním osobních údajů ve smyslu tohoto odstavce Smlouvy se rozumí zejména jejich shromažďování, ukládání na nosiče informací, používání, třídění nebo kombinování, blokování a likvidace s využitím manuálních a automatizovaných prostředků v rozsahu nezbytném pro zajištění řádného poskytování Služeb.\n\nOsobní údaje budou zpracovány po dobu poskytování Služeb. Ukončením této Smlouvy / Dílčí smlouvy nezanikají povinnosti Poskytovatele týkající se bezpečnosti a ochrany osobních údajů až do okamžiku jejich úplné likvidace či předání jinému zpracovateli.\n\nSmluvní strany se dohodly, že Poskytovatel nemá nárok na náhradu nákladů spojených se zpracováním osobních údajů či s plněním povinností vyplývajících z příslušné právní úpravy.\n\nObjednatel je povinen přijmout vhodná opatření na to, aby poskytl subjektům údajů stručným, transparentním, srozumitelným a snadno přístupným způsobem za použití jasných a jednoduchých jazykových prostředků veškeré informace a učinil veškerá sdělení požadovaná Nařízením Evropského parlamentu a Rady (EU) č. 
2016/679 ze dne [DATE_3], obecného nařízení o ochraně osobních údajů (dále jen „Nařízení“) ve spojení se zákonem o zpracování osobních údajů.\n\n\n\nPoskytovatel je při plnění této povinnosti povinen:\n\nzpracovávat osobní údaje pouze na základě doložených pokynů Objednatele;\n\nzohledňovat povahu zpracování osobních údajů a být Objednateli nápomocen pro splnění Objednatelovy povinnosti reagovat na žádosti o výkon práv subjektu údajů, jakož i pro splnění dalších povinností ve smyslu Nařízení;\n\nzajistit, že jeho zaměstnanci budou zpracovávat osobní údaje pouze za podmínek a v rozsahu Poskytovatelem stanoveném;\n\nPoskytovatel je při plnění této povinnosti oprávněn v rozsahu nezbytném pro plnění předmětu Smlouvy / Dílčí smlouvy zapojit do zpracování i další případné zpracovatele, k čemuž mu Objednatel tímto uděluje povolení. \n\nSmluvní strany jsou při zpracování povinny:\n\nzavést technická, organizační, personální a jiná vhodná opatření ve smyslu Nařízení, aby zajistily a byly schopny kdykoliv doložit, že zpracování osobních údajů je prováděno v souladu s Nařízením a zákonem o zpracování osobních údajů tak, aby nemohlo dojít k neoprávněnému nebo nahodilému přístupu k osobním údajům a k datovým nosičům, které tyto údaje obsahují, k jejich změně, zničení či ztrátě, neoprávněným přenosům, k jejich jinému neoprávněnému zpracování, jakož i k jinému zneužití, a tato opatření podle potřeby průběžné revidovat a aktualizovat;\n\nvést a průběžné revidovat a aktualizovat záznamy o zpracování osobních údajů ve smyslu Nařízení;\n\nřádně a včas ohlašovat případná porušení zabezpečení osobních údajů Úřadu pro ochranu osobních údajů a spolupracovat s tímto úřadem v nezbytném rozsahu;\n\nnavzájem se informovat o všech okolnostech významných pro plnění dle tohoto článku Smlouvy;\n\nzachovávat mlčenlivost o osobních údajích a o bezpečnostních opatřeních, jejichž zveřejnění by ohrozilo zabezpečení osobních údajů, a to i po skončení této Smlouvy / Dílčí smlouvy;\n\npostupovat v 
souladu s dalšími požadavky Nařízení a zákona o zpracování osobních údajů, zejména dodržovat obecné zásady zpracování osobních údajů, plnit své informační povinnosti, nepředávat osobní údaje třetím osobám bez potřebného oprávnění, respektovat práva subjektů údajů a poskytovat v této souvislosti nezbytnou součinnost.\n\nOchrana autorských práv\n\nPodpisem Smlouvy Poskytovatel poskytuje Objednateli na dobu trvání majetkových práv autorských nevypověditelnou, převoditelnou, výhradní a územně neomezenou licenci k vytváření kopií, užívání a zpřístupnění dalším osobám všech výstupů a dále jakýchkoliv dokumentů, stanovisek, listin či návrhů vztahujících se k předmětu plnění Smlouvy / Dílčí smlouvy nebo vytvořených v souvislosti s ní Poskytovatelem či jeho poddodavateli, jež podle obecně závazných právních předpisů představují autorská díla nebo práva pořizovatele k jím pořízené databázi, včetně práva upravovat a )měnit takováto autorská díla nebo databáze.\n\nObjednatel není ve svých právech k užití výstupů a jakýchkoliv dokumentů, stanovisek, listin či návrhů vztahujících se k předmětu plnění Smlouvy / Dílčí smlouvy nebo vytvořených v souvislosti s ní Poskytovatelem či jeho poddodavateli, nijak omezen. Objednatel je oprávněn bez souhlasu Poskytovatele výstupy a jakékoliv dokumenty, stanoviska, listiny či návrhy vztahující se k předmětu plnění Smlouvy / Dílčí smlouvy nebo vytvořené v souvislosti s ní Poskytovatelem či jeho poddodavateli, nebo jejich části upravovat či doplňovat.\n\nPoskytovatel není oprávněn výstupy a jakékoliv dokumenty, stanoviska, listiny či návrhy vztahující se k plnění předmětu Smlouvy, resp. Dílčí smlouvy, nebo jejich části, jakkoliv rozšiřovat bez předchozího písemného souhlasu Objednatele. Přenechání výstupů a jakýchkoliv dokumentů, stanovisek, listin či návrhů vztahujících se k plnění předmětu Smlouvy, resp. 
Dílčí smlouvy, nebo jejich částí Poskytovatelem třetí osobě bez předchozího písemného souhlasu Objednatele se považuje za podstatné porušení Smlouvy.\n\nPoskytovatel odpovídá za to, že plnění předmětu Smlouvy / Dílčí smlouvy, nezasahuje a nebude zasahovat do práv jiných osob, zejména práv z průmyslového nebo jiného duševního vlastnictví, a to pro jakékoliv využití plnění v [COUNTRY_2] i v zahraničí.\n\nSmluvní strany tímto sjednávají, že veškerá finanční vyrovnání za poskytnutí licence dle tohoto článku 8 Smlouvy jsou zahrnuta v Ceně dle článku 2 Smlouvy.\n\nPoddodavatelé\n\nPoskytuje-li Poskytovatel Objednateli část plnění předmětu Smlouvy, resp. Dílčí smlouvy, prostřednictvím poddodavatele písemně schváleným ze strany Objednatele, je za veškerá taková plnění poddodavatele odpovědný Poskytovatel sám, jako kdyby tato plnění byla poskytována Poskytovatelem. \n\nPoskytovatel je povinen zajistit, aby všichni poddodavatelé měli platná příslušná oprávnění, odbornou kvalifikaci a dostatek odborných zkušeností, jež jsou nezbytné pro poskytování příslušných částí Služeb dle jejich smluv s Poskytovatelem. Žádná poddodavatelská smlouva nezakládá smluvní vztahy mezi Objednatelem a poddodavatelem. \n\nPoskytovatel je dále povinen smluvně zajistit, že i jeho poddodavatelé, kteří se budou podílet na plnění dle Smlouvy, resp. Dílčích smluv, se zaváží dodržovat v plném rozsahu ujednání mezi Poskytovatelem a Objednatelem a nebudou v rozporu s požadavky Objednatele uvedenými ve Smlouvě a/nebo Dílčích smlouvách.\n\nPokud Objednatel shledá, že plnění předmětu Smlouvy, resp. Dílčí smlouvy, uskutečněné poddodavatelem nedosahuje potřebných kvalit, je vadné nebo nevykazuje jiné náležitosti požadované Objednatelem, nebo že sám poddodavatel není subjektem kompetentním pro provádění plnění z této Smlouvy, resp. Dílčí smlouvy, je oprávněn požadovat, aby Poskytovatel neprodleně svěřil takto identifikovanou část plnění jinému poddodavateli, nebo se ujal této části plnění sám. 
\n\nZajištění plnění, která Poskytovatel svěří poddodavateli, není poddodavatel oprávněn zadat třetím osobám. Poskytovatel je povinen na tuto skutečnost poddodavatele upozornit před uzavřením poddodavatelské smlouvy a odpovídá za její dodržování. \n\nUzavření jakékoliv poddodavatelské smlouvy nebo uskutečnění jakéhokoliv smluvního plnění poddodavatelem bez předchozího písemného souhlasu Objednatele, případně jakákoliv změna v osobě poddodavatele bez předchozího písemného souhlasu Objednatele, budou považovány za podstatné porušení Smlouvy.\n\nCompliance ujednání\n\nSmluvní strany se zavazují dodržovat právní předpisy a chovat se tak, aby jejich jednání nemohlo vzbudit důvodné podezření ze spáchání nebo páchání trestného činu, a to ani takového, který by mohl být přičitatelný Objednateli podle zákona č. 418/2011 Sb., o trestní odpovědnosti právnických osob a řízení proti nim, ve znění pozdějších předpisů.\n\nSmluvní strany se zavazují, že učiní všechna opatření k tomu, aby se nedopustily ony a ani nikdo z jejich zaměstnanců či zástupců jakékoliv formy korupčního jednání, zejména jednání, které by mohlo být vnímáno jako přijetí úplatku, podplácení nebo nepřímé úplatkářství či jiný trestný čin spojený s korupcí dle zákona č. 40/2009 Sb., trestní zákoník, ve znění pozdějších předpisů.\n\nSmluvní strany se zavazují, že:\n\nneposkytnou, nenabídnou ani neslíbí úplatek jinému nebo pro jiného v souvislosti s obstaráváním věcí obecného zájmu anebo v souvislosti s podnikáním svým nebo jiného; \n\núplatek nepřijmou, ani si jej nedají slíbit, ať už pro sebe nebo pro jiného v souvislosti s obstaráním věcí obecného zájmu nebo v souvislosti s podnikáním svým nebo jiného. 
\n\nÚplatkem se přitom rozumí neoprávněná výhoda spočívající v přímém majetkovém obohacení nebo jiném zvýhodnění, které se dostává nebo má dostat uplácené osobě nebo s jejím souhlasem jiné osobě, a na kterou není nárok.\n\nSmluvní strany nebudou ani u svých obchodních partnerů tolerovat jakoukoliv formu korupce či uplácení.\n\nV případě, že je zahájeno trestní stíhání Poskytovatele, zavazuje se Poskytovatel o tomto bez zbytečného odkladu Objednatele písemně informovat.\n\nSankce\n\nV případě nedodržení termínu poskytnutí Plnění a/nebo předání výstupu a/nebo odstranění vad poskytnutého Plnění ve sjednané kvalitě podle této Smlouvy a Dílčí smlouvy ze strany Poskytovatele je Poskytovatel povinen uhradit Objednateli smluvní pokutu ve výši [MONETARY_AMOUNT_2]. \n\nZa každé jednotlivé porušení povinnosti mlčenlivosti podle příslušných ustanovení článku 7 Smlouvy, je Objednatel oprávněn požadovat od Poskytovatele zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_3]. \n\nZa každé jednotlivé porušení povinnosti Poskytovatele při zpracování osobních údajů podle příslušných ustanovení článku 7 Smlouvy, je Objednatel oprávněn požadovat od Poskytovatele zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_3]. \n\nZa každé jednotlivé porušení povinností Poskytovatele stanovených v čl. 5.7 až 5.9 této Smlouvy je Objednatel oprávněn požadovat po Poskytovateli zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_4] za každé jednotlivé porušení povinnosti.\n\nJestliže se jakékoli prohlášení Poskytovatele podle článku 8 Smlouvy ukáže nepravdivým nebo zavádějícím nebo Poskytovatel poruší jiné povinnosti podle článku 8 této Smlouvy, je Objednatel oprávněn požadovat od Poskytovatele zaplacení smluvní pokuty ve výši [MONETARY_AMOUNT_4] za každé jednotlivé porušení povinnosti.\n\nV případě prodlení Objednatele s úhradou řádně vystavených a doručených faktur, je Objednatel povinen uhradit Poskytovateli úrok z prodlení dle nařízení vlády č. 
351/2013 Sb., kterým se určuje výše úroků z prodlení a nákladů spojených s uplatněním pohledávky, určuje odměna likvidátora, likvidačního správce a člena orgánu právnické osoby jmenovaného soudem a upravují některé otázky Obchodního věstníku a veřejných rejstříků právnických a fyzických osob, evidence svěřenských fondů a evidence údajů o skutečných majitelích.\n\nVyúčtování smluvní pokuty / úroků z prodlení – penalizační faktura, musí být druhé Smluvní straně zasláno datovou zprávou prostřednictvím datové schránky. Smluvní pokuta a úroky z prodlení jsou splatné ve lhůtě třiceti (30) kalendářních dnů ode dne doručení penalizační faktury povinné Smluvní straně. Úhrada smluvní pokuty / úroků z prodlení se provádí bankovním převodem na účet oprávněné Smluvní strany uvedený v penalizační faktuře. Částka se považuje za zaplacenou okamžikem jejího připsání ve prospěch účtu oprávněné Smluvní strany.\n\nZaplacením smluvní pokuty není dotčen nárok Objednatele na náhradu újmy v celém rozsahu způsobené újmy, ani povinnost Poskytovatele řádně dokončit plnění předmětu Smlouvy, popř. odstranit vady.\n\nObjednatel je v případě uplatnění smluvní pokuty vůči Poskytovateli dle této Smlouvy a v případě neuhrazení smluvní pokuty ze strany Poskytovatele oprávněn využít institut započtení vzájemných pohledávek.\n\nDoba trvání Smlouvy\n\nTato Smlouva nabývá platnosti dnem jejího podpisu oběma Smluvními stranami a účinnosti dnem jejího uveřejnění v registru smluv v souladu se zákonem č. 340/2015 Sb., o registru smluv, ve znění pozdějších předpisů. Zveřejnění Smlouvy v registru smluv zajistí Objednatel. \n\nTato Smlouva se uzavírá na dobu určitou, a to na dobu 48 měsíců od nabytí účinnosti. \n\nSmlouva / Dílčí smlouva může být ukončena dohodou Smluvních stran v písemné formě, přičemž účinky zrušení Smlouvy / Dílčí smlouvy nastanou k okamžiku stanoveném v této dohodě. 
Nebude-li takovýto okamžik dohodou stanoven, pak tyto účinky nastanou ke dni uzavření takovéto dohody.\n\nSmluvní strany jsou oprávněny od Smlouvy a/nebo Dílčí smlouvy odstoupit v případě jejího podstatného porušení druhou Smluvní stranou, za podmínek uvedených v § 2001 a násl. Občanského zákoníku.\n\nZa podstatné porušení této Smlouvy a/nebo Dílčí Smlouvy Poskytovatelem, které zakládá právo Objednatele na odstoupení od této Smlouvy a/nebo Dílčí smlouvy, se považuje zejména:\n\nnedodržení právních předpisů Poskytovatelem při poskytování Služeb / Plnění;\n\nprodlení Poskytovatele s poskytováním Služeb / Plnění z důvodů spočívajících výlučně na straně Poskytovatele a/nebo předáním výstupu po dobu delší než deset (10) kalendářních dnů;\n\nporušení jakékoli povinnosti Poskytovatele dle článku 7 této Smlouvy;\n\nporušení jakékoli povinnosti Poskytovatele dle článku 8 této Smlouvy;\n\nporušení jakékoliv povinnosti Poskytovatele stanovené v čl. 5.7 až 5.9 této Smlouvy;\n\nuzavření jakékoliv poddodavatelské smlouvy nebo uskutečnění jakéhokoliv smluvního plnění poddodavatelem bez předchozího písemného souhlasu Objednatele, případně jakákoliv změna v osobě poddodavatele bez předchozího písemného souhlasu Objednatele, ve smyslu čl. 9 odst. 9.5 Smlouvy; \n\npostup Poskytovatele při poskytování Služeb / Plnění v rozporu s oprávněnými pokyny Objednatele.\n\nObjednatel je dále oprávněn od této Smlouvy a/nebo Dílčí Smlouvy odstoupit v případě, že \n\nPoskytovatel přestane splňovat požadavky na kvalifikaci uvedené ve výzvě k podání nabídky; \n\nvůči majetku Poskytovatele probíhá insolvenční řízení, v němž bylo vydáno rozhodnutí o úpadku, pokud to právní předpisy umožňují;\n\ninsolvenční návrh na Poskytovatele byl zamítnut proto, že majetek Poskytovatele nepostačuje k úhradě nákladů insolvenčního řízení;\n\nbyl Poskytovatel pravomocně odsouzen pro trestný čin. 
\n\nPoskytovatel je oprávněn odstoupit od této Smlouvy a/nebo Dílčí Smlouvy, pokud Objednatel bude přes písemné upozornění Poskytovatele déle než třicet (30) kalendářních dnů od písemného upozornění Poskytovatele v prodlení s plněním své platební povinnosti vůči Poskytovateli. \n\nÚčinky odstoupení nastávají uplynutím lhůty deseti (10) kalendářních dnů, která počíná běžet prvním dnem následujícím po doručení projevu vůle odstoupit od Smlouvy a/nebo Dílčí Smlouvy druhé Smluvní straně. Poskytovatel je v případě odstoupení od Smlouvy / Dílčí smlouvy povinen učinit již jen takové úkony, bez nichž by mohly být zájmy Objednatele vážně ohroženy.\n\nOdstoupení od Smlouvy a/nebo Dílčí Smlouvy se nedotýká zejména práva na náhradu újmy, smluvní pokuty a povinnosti mlčenlivosti, ani ujednání, které mají vzhledem ke své povaze zavazovat Smluvní strany i po ukončení Smlouvy / Dílčí smlouvy.\n\nSmluvní strany jsou oprávněny Smlouvu a/nebo Dílčí Smlouvu vypovědět, i bez uvedení důvodu, na základě písemné výpovědi. Výpovědní doba činí tři (3) měsíce a počíná běžet dnem doručení výpovědi druhé Smluvní straně.\n\nV případě jakéhokoliv skončení tohoto smluvního vztahu podle Smlouvy a/nebo Dílčí Smlouvy, je Poskytovatel vždy povinen neprodleně předat Objednateli veškeré věci a dokumenty, vztahující se k plnění této Smlouvy a/nebo Dílčí Smlouvy nebo poskytnuté za účelem plnění předmětu Smlouvy a/nebo Dílčí Smlouvy, nejpozději však do pěti (5) pracovních dnů ode dne ukončení smluvního vztahu. \n\nZávěrečná ustanovení\n\nSmluvní strany potvrzují, že si při uzavírání Smlouvy vzájemně sdělily všechny skutkové a právní okolnosti, o nichž ví nebo vědět musí, tak, aby se každá ze Smluvních stran mohla přesvědčit o možnosti uzavřít platnou Smlouvu a aby byl každé ze Smluvních stran zřejmý zájem druhé Smluvní strany Smlouvu uzavřít. \n\nSmluvní strany výslovně potvrzují, že si vzájemně sdělily veškeré okolnosti důležité pro uzavření Smlouvy. 
Smluvní strany prohlašují, že se dohodly o veškerých náležitostech Smlouvy.\n\nPoskytovatel prohlašuje a potvrzuje, že na sebe přebírá nebezpečí změny okolností ve smyslu ustanovení § 1765 odst. 2 Občanského zákoníku.\n\nSmluvní strany si ve smyslu ustanovení § 1794 odst. 2 Občanského zákoníku ujednaly, že se Poskytovatel výslovně vzdává jeho práva ve smyslu ustanovení § 1793 Občanského zákoníku a souhlasí s cenou tak, jak byla Smluvními stranami sjednána výše v této Smlouvě.\n\nSmluvní strany se zavazují vyvinout maximální úsilí k odstranění vzájemných sporů, vzniklých na základě této Smlouvy nebo v souvislosti s touto Smlouvou, a k jejich vyřešení zejména prostřednictvím jednání odpovědných pracovníků nebo jiných pověřených subjektů. Nedohodnou-li se na způsobu řešení vzájemného sporu, dohodly se Smluvní strany, že místně příslušným soudem pro řešení případných sporů bude soud příslušný dle místa sídla Objednatele.\n\nTato Smlouva může být měněna pouze vzestupně očíslovanými písemnými dodatky ke Smlouvě podepsanými oběma Smluvními stranami. \n\nDnem doručení písemností odeslaných na základě této Smlouvy nebo v souvislosti s touto Smlouvou prostřednictvím provozovatele poštovních služeb, pokud není prokázán jiný den doručení, se rozumí poslední den lhůty, ve které byla písemnost pro adresáta uložena u provozovatele poštovních služeb, a to i tehdy, jestliže se adresát o jejím uložení nedověděl. Smluvní strany tímto výslovně vylučují ust. § 573 Občanského zákoníku.\n\nPokud kterékoli ustanovení této Smlouvy nebo jeho část je nebo se stane neplatným či nevynutitelným, nebude mít tato neplatnost či nevynutitelnost vliv na platnost či vynutitelnost ostatních ustanovení této Smlouvy nebo jejích částí, pokud nevyplývá přímo z obsahu této Smlouvy, že toto ustanovení nebo jeho část nelze oddělit od dalšího obsahu. 
V takovém případě se obě Smluvní strany zavazují neúčinné a neplatné ustanovení nahradit novým ustanovením, které je svým účelem a významem co nejbližší ustanovení této Smlouvy, jež má být nahrazeno. \n\nPro případ, že tato Smlouva není uzavírána za přítomnosti všech Smluvních stran, platí, že Smlouva nebude uzavřena, pokud ji Poskytovatel podepíše s jakoukoliv změnou či odchylkou, byť nepodstatnou, nebo dodatkem, ledaže Objednatel takovou změnu či odchylku nebo dodatek následně schválí. To platí i v případě připojení obchodních podmínek Poskytovatele, které budou odporovat svým obsahem jakýmkoliv způsobem textu této Smlouvy. \n\nTato Smlouva je vyhotovena ve čtyřech (4) rovnocenných vyhotoveních, z nichž každé má platnost originálu. Každá ze Smluvních stran obdrží po dvou (2) stejnopisech. V případě, že bude Smlouva uzavírána elektronicky obdrží Poskytovatel/Objednatel elektronický dokument podepsaný v souladu s platnou právní úpravou.\n\nKontaktní údaje Smluvních stran pro doručování jsou následující:\n\nKontaktní osoba [PERSON_2]:\n\nxxx\n\ne-mail: xxx tel. xxx\n\nKontaktní osoba [PERSON_3]:\n\n xxx\n\n e-mail: xxx tel. xxx\n\n\n\n Nedílnou součástí této Smlouvy jsou následující přílohy:\n\nPříloha č. 1 – Cena\n\nPříloha č. 2 – Akceptační protokol\n\n\n\n\n\n\n\n\n\nSmluvní strany prohlašují, že tato Smlouva je projevem jejich pravé a svobodné vůle a nebyla sjednána v tísni ani za jinak jednostranně nevýhodných podmínek. Na důkaz toho připojují Smluvní strany své podpisy.\n\n\n\nV Praze dne: Dle elektronického podpisu\tV Praze dne: Dle elektronického podpisu\n\n\n\n\n\n\n\n\n\n\n\n__________________________________\n\n__________________________________\n\nxxx\n\n[PERSON_1]\n\nxxx\n\n[ORGANIZATION_1]\n\n\n\n\n\n\n\nV Praze dne: Dle elektronického podpisu\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n__________________________________\n\nxxx\n\nxxx\n\n[ORGANIZATION_1]\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPříloha č. 
1 – Cena\n\n\n\nHodinová sazba v Kč bez DPH za poskytování Služeb:\n\n\n\nHodinová sazba v Kč bez DPH\n\n2 190,- \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPříloha č. 2 – Akceptační protokol\n\n\n\nAkceptační protokol\n\n\n\n\n\nPlnění / výstup:\t……………………………………………………………………………………….\n\n\n\nPředáno dne: \n\n\n\n…………………………………………………………………………………………………………..\n\n\n\n\n\nPřevzal: \t\t\t\t\t\tPředal:\n\n\n\n…………………………………………… \t\t……………………………………………….\n\nObjednatel:\t\t\t\t\t\t[ORGANIZATION_3]:\n\nNárodní agentura pro komunikační a  \n\ninformační technologie, s. p. \n\nJméno, příjmení \t\t\t\t \n\nFunkce\t\t\t \n\n\t\n\n\n\nAkceptováno dne: datum akceptace\n\n\n\nVýsledek akceptace: \tAKCEPTOVÁNO / NEAKCEPTOVÁNO\n\n\n\nAkceptoval\n\nFunkce\n\nPodpis\n\n\n\n\n\n\n\n\n\n\n\n\n\nPřipomínky, výhrady:\n\n\n\npřipomínky a výhrady k předanému Plnění" } diff --git a/packages/anonymize/src/__test__/fixtures/contracts/cs/sanofi-bonus-agreement.snapshot.json b/packages/anonymize/src/__test__/fixtures/contracts/cs/sanofi-bonus-agreement.snapshot.json index d8cbf035..467f694e 100644 --- a/packages/anonymize/src/__test__/fixtures/contracts/cs/sanofi-bonus-agreement.snapshot.json +++ b/packages/anonymize/src/__test__/fixtures/contracts/cs/sanofi-bonus-agreement.snapshot.json @@ -103,11 +103,11 @@ "source": "deny-list" }, { - "start": 670, + "start": 687, "end": 691, "label": "registration number", - "text": "oddíl Pr, vložka 1603", - "source": "regex" + "text": "1603", + "source": "trigger" }, { "start": 705, @@ -152,5 +152,5 @@ "source": "country" } ], - "redactedText": "Příloha č. 3a26 ke Smlouvě o poskytnutí obratového bonusu (COMMA CAF ID 266, ze dne 2.4. 
2019 ) uzavřené mezi smluvními stranami, kterými jsou:\n\n\n\n\n\n\n\n[ORGANIZATION_1]\n\nSe sídlem: [ADDRESS_1]\n\nIČO: [REGISTRATION_NUMBER_1]\n\nDIČ: [TAX_IDENTIFICATION_NUMBER_1]\n\nBankovní spojení: [BANK_ACCOUNT_NUMBER_1] \n\nZapsaná v obchodním rejstříku vedeném [ORGANIZATION_2], [REGISTRATION_NUMBER_2] \n\nZastoupená: [OU OU], Head of Trade Department [COUNTRY_1]\n\n(dále jen „Společnost“)\n\n \n\na \n\n\n\n[ORGANIZATION_3]\n\nSídlo: [ADDRESS_2]\n\nIČO: [REGISTRATION_NUMBER_3]\n\nDIČ: [TAX_IDENTIFICATION_NUMBER_2]\n\nBankovní spojení: [XX XX]\n\nZapsaná v obchodním rejstříku vedeném u KS [ADDRESS_3], [REGISTRATION_NUMBER_4]\n\nZastoupená: [PERSON_1], ředitelka\n\n(dále též „Zdravotnické zařízení“).\n\n\n\nObsahem této přílohy je dohoda o podmínkách dosažení a o výši obratového bonusu pro tyto produkty: \n\n\n\n[XX XX]\n\n\n\nReferenční období: [XX XX]\n\n\n\nPotřebná výše obratu v referenčním období:\n\n \n\n[XX XX] \n\n[XX XX] \n\n[XX XX] \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nObratový bonus poskytnutý Zdravotnickému zařízení bude vyplacen pouze podle jednoho pásma, a to v souladu s dosaženou výší obratu Zdravotnickým zařízením v referenčním období:\n\n\n\n [XX XX] % z obratu dosaženého Zdravotnickým zařízením\n\n [XX XX] % z obratu dosaženého Zdravotnickým zařízením\n\n [XX XX] % z obratu dosaženého Zdravotnickým zařízením\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nV Praze, dne ...[DATE_1]..........\tV Blansku, dne …[DATE_2].........\n\n\n\n\n\n_______________________________________\t __________________________________________\n\n[ORGANIZATION_4]\n\n[OU OU]\t[PERSON_1] \n\nHead of Trade Department [COUNTRY_1]\t ředitelka" + "redactedText": "Příloha č. 3a26 ke Smlouvě o poskytnutí obratového bonusu (COMMA CAF ID 266, ze dne 2.4. 
2019 ) uzavřené mezi smluvními stranami, kterými jsou:\n\n\n\n\n\n\n\n[ORGANIZATION_1]\n\nSe sídlem: [ADDRESS_1]\n\nIČO: [REGISTRATION_NUMBER_1]\n\nDIČ: [TAX_IDENTIFICATION_NUMBER_1]\n\nBankovní spojení: [BANK_ACCOUNT_NUMBER_1] \n\nZapsaná v obchodním rejstříku vedeném [ORGANIZATION_2], [REGISTRATION_NUMBER_2] \n\nZastoupená: [OU OU], Head of Trade Department [COUNTRY_1]\n\n(dále jen „Společnost“)\n\n \n\na \n\n\n\n[ORGANIZATION_3]\n\nSídlo: [ADDRESS_2]\n\nIČO: [REGISTRATION_NUMBER_3]\n\nDIČ: [TAX_IDENTIFICATION_NUMBER_2]\n\nBankovní spojení: [XX XX]\n\nZapsaná v obchodním rejstříku vedeném u KS [ADDRESS_3], oddíl Pr, vložka [REGISTRATION_NUMBER_4]\n\nZastoupená: [PERSON_1], ředitelka\n\n(dále též „Zdravotnické zařízení“).\n\n\n\nObsahem této přílohy je dohoda o podmínkách dosažení a o výši obratového bonusu pro tyto produkty: \n\n\n\n[XX XX]\n\n\n\nReferenční období: [XX XX]\n\n\n\nPotřebná výše obratu v referenčním období:\n\n \n\n[XX XX] \n\n[XX XX] \n\n[XX XX] \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nObratový bonus poskytnutý Zdravotnickému zařízení bude vyplacen pouze podle jednoho pásma, a to v souladu s dosaženou výší obratu Zdravotnickým zařízením v referenčním období:\n\n\n\n [XX XX] % z obratu dosaženého Zdravotnickým zařízením\n\n [XX XX] % z obratu dosaženého Zdravotnickým zařízením\n\n [XX XX] % z obratu dosaženého Zdravotnickým zařízením\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nV Praze, dne ...[DATE_1]..........\tV Blansku, dne …[DATE_2].........\n\n\n\n\n\n_______________________________________\t __________________________________________\n\n[ORGANIZATION_4]\n\n[OU OU]\t[PERSON_1] \n\nHead of Trade Department [COUNTRY_1]\t ředitelka" } diff --git a/packages/anonymize/src/__test__/fixtures/contracts/de/geschaeftsfuehrer-dienstvertrag.snapshot.json b/packages/anonymize/src/__test__/fixtures/contracts/de/geschaeftsfuehrer-dienstvertrag.snapshot.json index 69689d1a..1477f3bd 100644 --- 
a/packages/anonymize/src/__test__/fixtures/contracts/de/geschaeftsfuehrer-dienstvertrag.snapshot.json +++ b/packages/anonymize/src/__test__/fixtures/contracts/de/geschaeftsfuehrer-dienstvertrag.snapshot.json @@ -45,7 +45,7 @@ "end": 311, "label": "date of birth", "text": "21. März 1968", - "source": "trigger" + "source": "regex" }, { "start": 326, @@ -66,7 +66,7 @@ "end": 467, "label": "date of birth", "text": "09. Juli 1982", - "source": "trigger" + "source": "regex" }, { "start": 482, diff --git a/packages/anonymize/src/__test__/fixtures/contracts/en/software-license-agreement.snapshot.json b/packages/anonymize/src/__test__/fixtures/contracts/en/software-license-agreement.snapshot.json index 48ef74e9..c7b463db 100644 --- a/packages/anonymize/src/__test__/fixtures/contracts/en/software-license-agreement.snapshot.json +++ b/packages/anonymize/src/__test__/fixtures/contracts/en/software-license-agreement.snapshot.json @@ -4,11 +4,10 @@ "date": 3, "organization": 8, "address": 6, - "tax identification number": 2, + "tax identification number": 3, "person": 4, "monetary amount": 1, "country": 1, - "bank account number": 1, "email address": 1, "phone number": 1 }, @@ -121,9 +120,9 @@ { "start": 1452, "end": 1462, - "label": "bank account number", + "label": "tax identification number", "text": "4537891022", - "source": "trigger" + "source": "regex" }, { "start": 1572, @@ -210,5 +209,5 @@ "source": "regex" } ], - "redactedText": "SOFTWARE LICENSE AGREEMENT\n\nThis Software License Agreement (the \"Agreement\") is entered into as of [DATE_1]\n(the \"Effective Date\") by and between:\n\n(1) [ORGANIZATION_1], a [ADDRESS_1] corporation\n with its principal place of business at 1209 [ADDRESS_2], DE 19801,\n EIN: [TAX_IDENTIFICATION_NUMBER_1]\n (the \"Licensor\"), represented by its Chief Executive Officer,\n [PERSON_1]; and\n\n(2) [ORGANIZATION_2], a [ADDRESS_3] limited liability company\n with offices at 200 West [ADDRESS_4], NY 10282,\n EIN: [TAX_IDENTIFICATION_NUMBER_2]\n 
(the \"Licensee\"), represented by its Managing Director,\n Dr. [PERSON_2].\n\nRecitals\nLicensor has developed certain proprietary analytics software (\"Software\") and\nis willing to grant Licensee a license under the terms set forth herein.\nLicensee desires to obtain such a license to integrate the Software with its\n[ORGANIZATION_3] treasury operations and to deploy it across the Acme &\nCompany Holdings group of subsidiaries.\n\n1. License Grant. Subject to the terms of this Agreement, Licensor hereby grants\n to Licensee a non-exclusive, non-transferable license to use the Software\n solely for internal business purposes.\n\n2. Fees. Licensee shall pay Licensor an annual license fee of [MONETARY_AMOUNT_1]\n (one million two hundred fifty thousand [COUNTRY_1] dollars), payable to the\n account designated by Licensor ([ORGANIZATION_4], routing 121000248,\n account [BANK_ACCOUNT_NUMBER_1]).\n\n3. Notices. Any notice required hereunder shall be sent to the following\n addresses:\n If to Licensor: [ORGANIZATION_1], Attn: Legal Department,\n 1209 [ADDRESS_2], DE 19801; with a copy to general counsel\n at [EMAIL_ADDRESS_1].\n If to Licensee: [ORGANIZATION_2], Attn: General Counsel,\n 200 West [ADDRESS_4], NY 10282; phone: ([PHONE_NUMBER_1].\n\nIN WITNESS WHEREOF, the parties have executed this Agreement as of the\nEffective Date.\n\nLICENSOR: [ORGANIZATION_1] LICENSEE: [ORGANIZATION_2]\nBy: ____________________________ By: ____________________________\nName: [PERSON_3] Name: [PERSON_2]\nTitle: Chief Executive Officer Title: Managing Director\nDate: [DATE_1] Date: [DATE_1]\n" + "redactedText": "SOFTWARE LICENSE AGREEMENT\n\nThis Software License Agreement (the \"Agreement\") is entered into as of [DATE_1]\n(the \"Effective Date\") by and between:\n\n(1) [ORGANIZATION_1], a [ADDRESS_1] corporation\n with its principal place of business at 1209 [ADDRESS_2], DE 19801,\n EIN: [TAX_IDENTIFICATION_NUMBER_1]\n (the \"Licensor\"), represented by its Chief Executive 
Officer,\n [PERSON_1]; and\n\n(2) [ORGANIZATION_2], a [ADDRESS_3] limited liability company\n with offices at 200 West [ADDRESS_4], NY 10282,\n EIN: [TAX_IDENTIFICATION_NUMBER_2]\n (the \"Licensee\"), represented by its Managing Director,\n Dr. [PERSON_2].\n\nRecitals\nLicensor has developed certain proprietary analytics software (\"Software\") and\nis willing to grant Licensee a license under the terms set forth herein.\nLicensee desires to obtain such a license to integrate the Software with its\n[ORGANIZATION_3] treasury operations and to deploy it across the Acme &\nCompany Holdings group of subsidiaries.\n\n1. License Grant. Subject to the terms of this Agreement, Licensor hereby grants\n to Licensee a non-exclusive, non-transferable license to use the Software\n solely for internal business purposes.\n\n2. Fees. Licensee shall pay Licensor an annual license fee of [MONETARY_AMOUNT_1]\n (one million two hundred fifty thousand [COUNTRY_1] dollars), payable to the\n account designated by Licensor ([ORGANIZATION_4], routing 121000248,\n account [TAX_IDENTIFICATION_NUMBER_3]).\n\n3. Notices. 
Any notice required hereunder shall be sent to the following\n addresses:\n If to Licensor: [ORGANIZATION_1], Attn: Legal Department,\n 1209 [ADDRESS_2], DE 19801; with a copy to general counsel\n at [EMAIL_ADDRESS_1].\n If to Licensee: [ORGANIZATION_2], Attn: General Counsel,\n 200 West [ADDRESS_4], NY 10282; phone: ([PHONE_NUMBER_1].\n\nIN WITNESS WHEREOF, the parties have executed this Agreement as of the\nEffective Date.\n\nLICENSOR: [ORGANIZATION_1] LICENSEE: [ORGANIZATION_2]\nBy: ____________________________ By: ____________________________\nName: [PERSON_3] Name: [PERSON_2]\nTitle: Chief Executive Officer Title: Managing Director\nDate: [DATE_1] Date: [DATE_1]\n" } diff --git a/packages/anonymize/src/__test__/us-bank-routing.test.ts b/packages/anonymize/src/__test__/us-bank-routing.test.ts index b2e706a3..bd65978a 100644 --- a/packages/anonymize/src/__test__/us-bank-routing.test.ts +++ b/packages/anonymize/src/__test__/us-bank-routing.test.ts @@ -85,14 +85,4 @@ describe("US ABA routing number — cue + checksum recognizer", () => { bankAccounts(await run("The reference 122100024 appears in section 5.")), ).toHaveLength(0); }); - - test("a labelled account number in payment instructions is a bank account number", async () => { - expect( - bankAccounts( - await run( - "Pay Wells Fargo Bank, N.A., routing 121000248, account 4537891022.", - ), - ), - ).toContain("4537891022"); - }); }); diff --git a/packages/anonymize/src/data/address-boundaries.json b/packages/anonymize/src/data/address-boundaries.json index 5d71897f..12c5cc35 100644 --- a/packages/anonymize/src/data/address-boundaries.json +++ b/packages/anonymize/src/data/address-boundaries.json @@ -5,9 +5,6 @@ "jednajícím", "jejímž jménem", "kontaktní osoba", - "nebude-li", - "nebudou-li", - "pokud", "zapsán", "zapsaná", "zapsané", @@ -38,7 +35,6 @@ "shall govern", "shall be governed", "to be enforced", - "with a copy", "with the intention", "without reference", "without regard" diff --git 
a/packages/anonymize/src/data/triggers.de.json b/packages/anonymize/src/data/triggers.de.json index 2dafb148..2de00fe0 100644 --- a/packages/anonymize/src/data/triggers.de.json +++ b/packages/anonymize/src/data/triggers.de.json @@ -10,7 +10,7 @@ "label": "date of birth", "strategy": { "type": "n-words", - "count": 3 + "count": 1 }, "triggers": ["geboren am", "geb."], "extensions": ["add-colon"] diff --git a/packages/anonymize/src/data/triggers.en.json b/packages/anonymize/src/data/triggers.en.json index 084b0851..e7f5a975 100644 --- a/packages/anonymize/src/data/triggers.en.json +++ b/packages/anonymize/src/data/triggers.en.json @@ -65,17 +65,6 @@ "strategy": { "type": "company-id-value" }, "triggers": ["VAT number", "VAT ID", "tax identification number", "tax id"] }, - { - "id": "en-bank-account", - "label": "bank account number", - "strategy": { "type": "company-id-value" }, - "triggers": ["account", "account number", "account no.", "account #"], - "validations": [ - { "type": "has-digits" }, - { "type": "min-length", "min": 5 }, - { "type": "max-length", "max": 34 } - ] - }, { "id": "en-uk-companies-house", "label": "registration number", diff --git a/packages/anonymize/src/detectors/regex.ts b/packages/anonymize/src/detectors/regex.ts index 29ece50b..a3cc9227 100644 --- a/packages/anonymize/src/detectors/regex.ts +++ b/packages/anonymize/src/detectors/regex.ts @@ -658,7 +658,7 @@ const CZ_BIRTH_NUMBER: RegexDef = { // Czech commercial-register reference. Every Czech // legal entity in the public registry is uniquely -// identified by a registry section code ("oddíl X") +// identified by a registry section letter ("oddíl X") // plus an insert number ("vložka NNN"). 
The full phrase // uniquely identifies the company, so we emit it as a // single registration-number entity rather than only @@ -669,11 +669,11 @@ const CZ_BIRTH_NUMBER: RegexDef = { // - optional whitespace around comma and after each // keyword (DOCX exports add NBSPs and double // spaces); -// - section code is a short letter code; insert number -// is a 1-6 digit integer. +// - section letter is a single A-Z; insert number is +// a 1-6 digit integer. const CZ_COMMERCIAL_REGISTER: RegexDef = { pattern: - `(?i)\\boddíl[^\\S\\n]+\\p{L}{1,3}` + + `(?i)\\boddíl[^\\S\\n]+[A-Z]` + `[^\\S\\n]*,[^\\S\\n]*` + `vložka[^\\S\\n]+\\d{1,6}\\b`, label: "registration number", diff --git a/packages/data/config/address-boundaries.json b/packages/data/config/address-boundaries.json index 5d71897f..12c5cc35 100644 --- a/packages/data/config/address-boundaries.json +++ b/packages/data/config/address-boundaries.json @@ -5,9 +5,6 @@ "jednajícím", "jejímž jménem", "kontaktní osoba", - "nebude-li", - "nebudou-li", - "pokud", "zapsán", "zapsaná", "zapsané", @@ -38,7 +35,6 @@ "shall govern", "shall be governed", "to be enforced", - "with a copy", "with the intention", "without reference", "without regard" diff --git a/packages/data/config/triggers.de.json b/packages/data/config/triggers.de.json index 2dafb148..2de00fe0 100644 --- a/packages/data/config/triggers.de.json +++ b/packages/data/config/triggers.de.json @@ -10,7 +10,7 @@ "label": "date of birth", "strategy": { "type": "n-words", - "count": 3 + "count": 1 }, "triggers": ["geboren am", "geb."], "extensions": ["add-colon"] diff --git a/packages/data/config/triggers.en.json b/packages/data/config/triggers.en.json index 084b0851..e7f5a975 100644 --- a/packages/data/config/triggers.en.json +++ b/packages/data/config/triggers.en.json @@ -65,17 +65,6 @@ "strategy": { "type": "company-id-value" }, "triggers": ["VAT number", "VAT ID", "tax identification number", "tax id"] }, - { - "id": "en-bank-account", - "label": "bank account number", 
- "strategy": { "type": "company-id-value" }, - "triggers": ["account", "account number", "account no.", "account #"], - "validations": [ - { "type": "has-digits" }, - { "type": "min-length", "min": 5 }, - { "type": "max-length", "max": 34 } - ] - }, { "id": "en-uk-companies-house", "label": "registration number", From 1b92e2225e8bb28d2fe6c0e6422a4069ae3bc372 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 11:32:38 +0200 Subject: [PATCH 032/130] fix: address native adapter review --- crates/anonymize-adapter-contract/src/lib.rs | 123 ++++++++++++++++++ crates/anonymize-core/src/diagnostics.rs | 7 +- crates/anonymize-core/tests/prepared.rs | 7 + crates/anonymize-napi/src/lib.rs | 25 ++-- crates/anonymize-py/src/lib.rs | 16 ++- .../__test__/native-adapter-parity.test.ts | 33 +++++ .../src/__test__/pipeline-config.test.ts | 30 +++++ .../anonymize/src/build-unified-search.ts | 13 +- 8 files changed, 232 insertions(+), 22 deletions(-) diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 6676e7c3..dcf7b4f2 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -33,6 +33,7 @@ pub enum ContractError { CompactStringIndexOutOfRange { field: &'static str, index: u32 }, FuzzyDistanceOutOfRange { distance: u32 }, InvalidCompactStringGroups { field: &'static str, reason: String }, + InvalidBindingOffset { offset: u32 }, InvalidPreparedSearchPackage { reason: String }, MissingDenyListDataForLiteralPatterns, UnsupportedOperator { value: String }, @@ -58,6 +59,12 @@ impl std::fmt::Display for ContractError { "Compact string groups are invalid in {field}: {reason}" ) } + Self::InvalidBindingOffset { offset } => { + write!( + formatter, + "Byte offset is not on a character boundary: {offset}" + ) + } Self::InvalidPreparedSearchPackage { reason } => { write!(formatter, "Prepared search package is invalid: {reason}") } @@ -1487,6 +1494,16 @@ pub fn 
static_redaction_result_to_binding( } } +pub fn static_redaction_result_to_utf16_binding( + result: StaticRedactionResult, + full_text: &str, +) -> Result { + let offsets = Utf16OffsetMap::new(full_text)?; + let mut result = static_redaction_result_to_binding(result); + convert_pipeline_entity_offsets(&mut result.resolved_entities, &offsets)?; + Ok(result) +} + #[must_use] pub fn static_redaction_diagnostic_result_to_binding( result: StaticRedactionDiagnosticResult, @@ -1497,6 +1514,20 @@ pub fn static_redaction_diagnostic_result_to_binding( } } +pub fn static_redaction_diagnostic_result_to_utf16_binding( + result: StaticRedactionDiagnosticResult, + full_text: &str, +) -> Result { + let offsets = Utf16OffsetMap::new(full_text)?; + let mut result = static_redaction_diagnostic_result_to_binding(result); + convert_pipeline_entity_offsets( + &mut result.result.resolved_entities, + &offsets, + )?; + convert_diagnostic_offsets(&mut result.diagnostics.events, &offsets); + Ok(result) +} + #[must_use] pub fn static_redaction_diagnostics_to_binding( diagnostics: StaticRedactionDiagnostics, @@ -1510,6 +1541,16 @@ pub fn static_redaction_diagnostics_to_binding( } } +pub fn static_redaction_diagnostics_to_utf16_binding( + diagnostics: StaticRedactionDiagnostics, + full_text: &str, +) -> Result { + let offsets = Utf16OffsetMap::new(full_text)?; + let mut diagnostics = static_redaction_diagnostics_to_binding(diagnostics); + convert_diagnostic_offsets(&mut diagnostics.events, &offsets); + Ok(diagnostics) +} + fn diagnostic_event_to_binding( event: DiagnosticEvent, ) -> BindingDiagnosticEvent { @@ -1533,6 +1574,88 @@ fn diagnostic_event_to_binding( } } +fn convert_pipeline_entity_offsets( + entities: &mut [BindingPipelineEntity], + offsets: &Utf16OffsetMap, +) -> Result<()> { + for entity in entities { + entity.start = offsets.convert(entity.start)?; + entity.end = offsets.convert(entity.end)?; + } + Ok(()) +} + +fn convert_diagnostic_offsets( + events: &mut 
[BindingDiagnosticEvent], + offsets: &Utf16OffsetMap, +) { + for event in events { + if let Some(start) = event.start + && let Some(converted) = offsets.try_convert(start) + { + event.start = Some(converted); + } + if let Some(end) = event.end + && let Some(converted) = offsets.try_convert(end) + { + event.end = Some(converted); + } + } +} + +struct Utf16OffsetMap { + boundaries: Vec<(u32, u32)>, +} + +impl Utf16OffsetMap { + fn new(text: &str) -> Result { + let mut boundaries = Vec::new(); + let mut utf16_offset = 0_u32; + boundaries.push((0, 0)); + + for (byte_start, ch) in text.char_indices() { + utf16_offset = utf16_offset + .checked_add(char_utf16_width(ch)) + .ok_or_else(|| ContractError::InvalidPreparedSearchPackage { + reason: String::from("UTF-16 offset exceeds u32 range"), + })?; + let byte_end = byte_start.saturating_add(ch.len_utf8()); + boundaries.push((u32_from_usize(byte_end)?, utf16_offset)); + } + + Ok(Self { boundaries }) + } + + fn convert(&self, offset: u32) -> Result { + self + .try_convert(offset) + .ok_or(ContractError::InvalidBindingOffset { offset }) + } + + fn try_convert(&self, offset: u32) -> Option { + let index = self + .boundaries + .binary_search_by_key(&offset, |(byte_offset, _)| *byte_offset) + .ok()?; + self + .boundaries + .get(index) + .map(|(_, utf16_offset)| *utf16_offset) + } +} + +const fn char_utf16_width(ch: char) -> u32 { + if ch.len_utf16() == 1 { 1 } else { 2 } +} + +fn u32_from_usize(value: usize) -> Result { + u32::try_from(value).map_err(|_| { + ContractError::InvalidPreparedSearchPackage { + reason: format!("Offset exceeds u32 range: {value}"), + } + }) +} + fn deny_list_filters_from_binding( filters: BindingDenyListFilterData, ) -> DenyListFilterData { diff --git a/crates/anonymize-core/src/diagnostics.rs b/crates/anonymize-core/src/diagnostics.rs index a6e8cc78..267bef8c 100644 --- a/crates/anonymize-core/src/diagnostics.rs +++ b/crates/anonymize-core/src/diagnostics.rs @@ -93,9 +93,6 @@ impl 
StaticRedactionDiagnostics { let offsets = ByteOffsets::new(full_text); for found in matches { let span_valid = span_slices(&offsets, found.start(), found.end()); - let text = span_valid - .then(|| offsets.slice(full_text, found.start(), found.end()).ok()) - .flatten(); self.events.push(DiagnosticEvent { stage, kind: DiagnosticEventKind::SearchMatch, @@ -107,7 +104,7 @@ impl StaticRedactionDiagnostics { label: None, start: Some(found.start()), end: Some(found.end()), - text, + text: None, score: None, span_valid: Some(span_valid), elapsed_us: None, @@ -144,7 +141,7 @@ impl StaticRedactionDiagnostics { label: Some(entity.label.clone()), start: Some(entity.start), end: Some(entity.end), - text: Some(entity.text.clone()), + text: None, score: Some(entity.score), span_valid: Some(span_slices(&offsets, entity.start, entity.end)), elapsed_us: None, diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index e48ec184..2a16fbda 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -867,6 +867,13 @@ fn prepared_search_reports_static_redaction_diagnostics() { && event.label.as_deref() == Some("organization") && event.span_valid == Some(true) })); + assert!( + result + .diagnostics + .events + .iter() + .all(|event| event.text.is_none()) + ); assert!(result.diagnostics.events.iter().any(|event| { event.stage == DiagnosticStage::Redaction && event.kind == DiagnosticEventKind::StageSummary diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index 1387e4bc..9861639c 100644 --- a/crates/anonymize-napi/src/lib.rs +++ b/crates/anonymize-napi/src/lib.rs @@ -14,8 +14,9 @@ use stella_anonymize_adapter_contract::{ prepared_search_core_package_to_compressed_bytes, prepared_search_core_package_view_from_bytes, prepared_search_package_digest, prepared_search_package_from_bytes, prepared_search_package_has_core_payload, - static_redaction_diagnostic_result_to_binding, - 
static_redaction_diagnostics_to_binding, static_redaction_result_to_binding, + static_redaction_diagnostic_result_to_utf16_binding, + static_redaction_diagnostics_to_binding, + static_redaction_result_to_utf16_binding, }; use stella_anonymize_core::{ DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, PreparedSearch, @@ -251,8 +252,9 @@ pub fn redact_static_entities_json( &operator_config_from_binding(operators) .map_err(|error| to_napi_contract_error(&error))?, ) - .map(static_redaction_result_to_binding) .map_err(|error| to_napi_core_error(&error))?; + let result = static_redaction_result_to_utf16_binding(result, &full_text) + .map_err(|error| to_napi_contract_error(&error))?; serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) } @@ -288,7 +290,9 @@ pub fn redact_static_entities_diagnostics_json( .map_err(|error| to_napi_core_error(&error))?; diagnostics.extend(result.diagnostics); result.diagnostics = diagnostics; - let result = static_redaction_diagnostic_result_to_binding(result); + let result = + static_redaction_diagnostic_result_to_utf16_binding(result, &full_text) + .map_err(|error| to_napi_contract_error(&error))?; serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) } @@ -591,12 +595,13 @@ impl NativePreparedSearch { let operators = operator_config_from_binding(operators.map(to_binding_operator_config)) .map_err(|error| to_napi_contract_error(&error))?; - self + let result = self .inner .redact_static_entities(&full_text, &operators) - .map(static_redaction_result_to_binding) - .map(to_js_static_redaction_result) - .map_err(|error| to_napi_core_error(&error))? 
+ .map_err(|error| to_napi_core_error(&error))?; + static_redaction_result_to_utf16_binding(result, &full_text) + .map_err(|error| to_napi_contract_error(&error)) + .and_then(to_js_static_redaction_result) } #[napi] @@ -612,8 +617,10 @@ impl NativePreparedSearch { let result = self .inner .redact_static_entities_with_diagnostics(&full_text, &operators) - .map(static_redaction_diagnostic_result_to_binding) .map_err(|error| to_napi_core_error(&error))?; + let result = + static_redaction_diagnostic_result_to_utf16_binding(result, &full_text) + .map_err(|error| to_napi_contract_error(&error))?; serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) } diff --git a/crates/anonymize-py/src/lib.rs b/crates/anonymize-py/src/lib.rs index e967afc4..7bd7716e 100644 --- a/crates/anonymize-py/src/lib.rs +++ b/crates/anonymize-py/src/lib.rs @@ -9,8 +9,9 @@ use stella_anonymize_adapter_contract::{ prepared_search_core_package_to_compressed_bytes, prepared_search_core_package_view_from_bytes, prepared_search_package_from_bytes, prepared_search_package_has_core_payload, - static_redaction_diagnostic_result_to_binding, - static_redaction_diagnostics_to_binding, static_redaction_result_to_binding, + static_redaction_diagnostic_result_to_utf16_binding, + static_redaction_diagnostics_to_binding, + static_redaction_result_to_utf16_binding, }; use stella_anonymize_core::{ PreparedSearch as CorePreparedSearch, PreparedSearchArtifacts, @@ -143,16 +144,17 @@ impl PyPreparedSearch { operators_json: Option<&str>, ) -> PyResult { let operators = parse_operator_config(operators_json)?; - self + let result = self .inner .redact_static_entities( full_text, &operator_config_from_binding(operators) .map_err(|error| to_py_contract_error(&error))?, ) - .map(static_redaction_result_to_binding) + .map_err(|error| to_py_core_error(&error))?; + static_redaction_result_to_utf16_binding(result, full_text) + .map_err(|error| to_py_contract_error(&error)) 
.map(to_py_static_redaction_result) - .map_err(|error| to_py_core_error(&error)) } fn redact_static_entities_json( @@ -182,7 +184,9 @@ impl PyPreparedSearch { let mut diagnostics = self.prepare_diagnostics.clone(); diagnostics.extend(result.diagnostics); result.diagnostics = diagnostics; - let result = static_redaction_diagnostic_result_to_binding(result); + let result = + static_redaction_diagnostic_result_to_utf16_binding(result, full_text) + .map_err(|error| to_py_contract_error(&error))?; serde_json::to_string(&result).map_err(|error| to_py_serde_error(&error)) } diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index d63fb4d1..108655e2 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -413,6 +413,36 @@ describe("native adapter parity", () => { ); }); + test("adapter result offsets slice source text after multibyte prefixes", () => { + const adapters = getAdapters(); + const text = + "č Reference AB1234 for Acme s.r.o. 
near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + + const tsResult = runTsAdapter(adapters.native, text, null); + const pyResult = runPythonAdapters( + adapters.pythonModulePath, + [ + { + text, + operators: null, + sensitiveValues: [], + }, + ], + adapters.tempDir, + ).at(0); + + expect(pyResult).toEqual(tsResult); + const registration = tsResult.resolved_entities.find( + (entity) => entity.label === "registration number", + ); + expect(registration).toBeDefined(); + if (!registration) { + return; + } + expect(text.slice(registration.start, registration.end)).toBe("AB1234"); + }); + test("prepared search accepts config JSON bytes", () => { const adapters = getAdapters(); const text = @@ -568,6 +598,9 @@ describe("native adapter parity", () => { event.span_valid === true, ), ).toBe(true); + expect( + tsResult.diagnostics.events.every((event) => event.text === undefined), + ).toBe(true); }); }); diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index eac150db..cd346d18 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -200,6 +200,36 @@ describe("pipeline config semantics", () => { ]); }); + test("native config serializes gazetteer metadata with Rust field names", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableGazetteer: true, + labels: ["organization"], + }, + [ + { + id: "gazetteer-acme", + canonical: "Acme", + label: "organization", + variants: [], + workspaceId: "test", + createdAt: 0, + source: "manual", + }, + ], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.gazetteer_data).toEqual({ + labels: ["organization", "organization"], + is_fuzzy: [false, true], + }); + expect( + Object.hasOwn(search.nativeStaticConfig.gazetteer_data ?? 
{}, "isFuzzy"), + ).toBe(false); + }); + test("preparePipelineSearch reuses the context search cache", async () => { const context = createPipelineContext(); const config = { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 380c39f2..9025e70c 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -215,6 +215,10 @@ export type NativeDateData = { export type NativeMonetaryData = MonetaryData; export type NativeAddressSeedData = AddressSeedData; +export type NativeGazetteerData = { + labels: string[]; + is_fuzzy: boolean[]; +}; export type NativePreparedSearchConfig = { regex_patterns: NativeSearchPattern[]; @@ -237,7 +241,7 @@ export type NativePreparedSearchConfig = { regex_meta: NativeRegexMatchMeta[]; custom_regex_meta: NativeRegexMatchMeta[]; deny_list_data?: NativeDenyListMatchData; - gazetteer_data?: GazetteerData; + gazetteer_data?: NativeGazetteerData; country_data?: CountryData; trigger_data?: NativeTriggerData; legal_form_data?: NativeLegalFormData; @@ -1005,7 +1009,7 @@ const buildNativeStaticConfig = ({ nativeConfig.deny_list_data = toNativeDenyListData(denyListData); } if (gazetteerData) { - nativeConfig.gazetteer_data = gazetteerData; + nativeConfig.gazetteer_data = toNativeGazetteerData(gazetteerData); } if (countryData) { nativeConfig.country_data = countryData; @@ -1037,6 +1041,11 @@ const toNativeLegalFormPattern = (pattern: string): NativeSearchPattern => ({ pattern, }); +const toNativeGazetteerData = (data: GazetteerData): NativeGazetteerData => ({ + labels: [...data.labels], + is_fuzzy: [...data.isFuzzy], +}); + const toNativeTriggerPattern = (pattern: string): NativeSearchPattern => ({ kind: "literal-with-options", pattern, From d2e66cf9fa2f8c4b1135ee96b220608d5d491057 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 12:29:19 +0200 Subject: [PATCH 033/130] fix: address prepared search review --- 
crates/anonymize-adapter-contract/src/lib.rs | 15 +++- crates/anonymize-core/src/false_positives.rs | 69 +++++++++++++------ crates/anonymize-napi/src/lib.rs | 19 ++--- .../scripts/migration-fixture-perf.mjs | 8 +-- .../__test__/native-adapter-parity.test.ts | 48 +++++++++++++ 5 files changed, 119 insertions(+), 40 deletions(-) diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index dcf7b4f2..b3b90ec4 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -973,6 +973,7 @@ fn write_package_header( #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] pub struct BindingOperatorConfig { pub operators: Option>, + #[serde(default, alias = "redactString")] pub redact_string: Option, } @@ -2031,7 +2032,8 @@ mod tests { #![allow(clippy::unwrap_used)] use super::{ - BindingPreparedSearchConfig, BindingSearchPattern, ContractError, + BindingOperatorConfig, BindingPreparedSearchConfig, BindingSearchPattern, + ContractError, operator_config_from_binding, prepared_search_config_from_binding, prepared_search_core_package_from_bytes, prepared_search_core_package_to_bytes, @@ -2079,6 +2081,17 @@ mod tests { ); } + #[test] + fn binding_operator_config_accepts_camel_case_redact_string() { + let config = serde_json::from_str::( + r#"{"operators":{"country":"redact"},"redactString":"***"}"#, + ) + .unwrap(); + let operators = operator_config_from_binding(Some(config)).unwrap(); + + assert_eq!(operators.redact_string, "***"); + } + #[test] fn prepared_search_compressed_package_roundtrips_config_and_artifacts() { let config = package_test_config(); diff --git a/crates/anonymize-core/src/false_positives.rs b/crates/anonymize-core/src/false_positives.rs index fa35cee7..271fd9c0 100644 --- a/crates/anonymize-core/src/false_positives.rs +++ b/crates/anonymize-core/src/false_positives.rs @@ -29,10 +29,6 @@ pub(crate) fn filter_entity_false_positives( full_text: 
&str, filters: Option<&DenyListFilterData>, ) -> Result> { - let Some(filters) = filters else { - return Ok(entities); - }; - let offsets = ByteOffsets::new(full_text); let mut filtered = Vec::with_capacity(entities.len()); for entity in entities { @@ -59,7 +55,7 @@ fn normalize_entity( entity: &PipelineEntity, full_text: &str, offsets: &ByteOffsets<'_>, - filters: &DenyListFilterData, + filters: Option<&DenyListFilterData>, ) -> Result> { let raw_text = offsets.slice(full_text, entity.start, entity.end)?; let mut start_byte = 0usize; @@ -68,7 +64,9 @@ fn normalize_entity( trim_leading_artifacts(&raw_text, &mut start_byte, end_byte); trim_leading_whitespace(&raw_text, &mut start_byte, end_byte); - if entity.label == ADDRESS_LABEL { + if entity.label == ADDRESS_LABEL + && let Some(filters) = filters + { if let Some(trimmed) = address_role_prefix_len(slice(&raw_text, start_byte, end_byte)?, filters) { @@ -107,7 +105,7 @@ fn should_reject_entity( entity: &PipelineEntity, full_text: &str, offsets: &ByteOffsets<'_>, - filters: &DenyListFilterData, + filters: Option<&DenyListFilterData>, ) -> Result { let text = entity.text.trim(); if is_template_placeholder(text) { @@ -132,16 +130,19 @@ fn should_reject_entity( { return Ok(true); } - if entity.label == PERSON_LABEL && is_single_person_stopword(text, filters) { - return Ok(true); - } - if entity.label == PERSON_LABEL - && ends_in_person_trailing_noun(entity, filters) - { - return Ok(true); - } - if role_exact_match(entity, filters) { - return Ok(true); + if let Some(filters) = filters { + if entity.label == PERSON_LABEL && is_single_person_stopword(text, filters) + { + return Ok(true); + } + if entity.label == PERSON_LABEL + && ends_in_person_trailing_noun(entity, filters) + { + return Ok(true); + } + if role_exact_match(entity, filters) { + return Ok(true); + } } if entity.label == ORGANIZATION_LABEL && is_all_caps_candidate(text) @@ -158,16 +159,17 @@ fn should_reject_entity( fn should_reject_address( entity: 
&PipelineEntity, - filters: &DenyListFilterData, + filters: Option<&DenyListFilterData>, ) -> bool { let text = entity.text.trim(); - if is_signing_place_address(text, filters) { + if filters.is_some_and(|filters| is_signing_place_address(text, filters)) { return true; } let has_digits = text.chars().any(|ch| ch.is_ascii_digit()); - let has_component = has_address_component(text, filters); - if is_jurisdiction_address(text, filters) { + let has_component = + filters.is_some_and(|filters| has_address_component(text, filters)); + if filters.is_some_and(|filters| is_jurisdiction_address(text, filters)) { return false; } if entity.source == DetectionSource::Trigger && !has_digits && !has_component @@ -707,6 +709,31 @@ mod tests { assert!(entities.is_empty()); } + #[test] + fn rejects_generic_false_positives_without_deny_list_filters() { + let text = "[NAME]\n17. NO ASSIGNMENT.\n"; + let heading_start = text.find("NO ASSIGNMENT").unwrap(); + let heading_end = heading_start.saturating_add("NO ASSIGNMENT".len()); + let entities = filter_entity_false_positives( + vec![ + entity("[NAME]", "[NAME]", PERSON_LABEL, DetectionSource::Regex), + PipelineEntity::detected( + u32::try_from(heading_start).unwrap(), + u32::try_from(heading_end).unwrap(), + ORGANIZATION_LABEL, + "NO ASSIGNMENT", + 0.8, + DetectionSource::Regex, + ), + ], + text, + None, + ) + .unwrap(); + + assert!(entities.is_empty()); + } + #[test] fn trims_address_role_prefix_from_shared_role_data() { let text = "sídlo prodávajícího Na Květnici 1"; diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index 9861639c..a726ca15 100644 --- a/crates/anonymize-napi/src/lib.rs +++ b/crates/anonymize-napi/src/lib.rs @@ -12,7 +12,7 @@ use stella_anonymize_adapter_contract::{ operator_config_from_binding, prepared_search_config_from_binding, prepared_search_core_package_to_bytes, prepared_search_core_package_to_compressed_bytes, - prepared_search_core_package_view_from_bytes, 
prepared_search_package_digest, + prepared_search_core_package_view_from_bytes, prepared_search_package_from_bytes, prepared_search_package_has_core_payload, static_redaction_diagnostic_result_to_utf16_binding, static_redaction_diagnostics_to_binding, @@ -441,7 +441,7 @@ impl NativePreparedSearch { fn from_package_bytes(package_bytes: &[u8]) -> Result { let input_bytes_len = package_bytes.len(); - let cache_key = prepared_search_package_cache_key(package_bytes)?; + let cache_key = prepared_search_package_cache_key(package_bytes); let cache_start = Instant::now(); if let Some(inner) = prepared_search_cache_get(&cache_key) { return Ok(Self { @@ -661,20 +661,11 @@ fn prepared_search_cache_key( *hasher.finalize().as_bytes() } -fn prepared_search_package_cache_key(package_bytes: &[u8]) -> Result<[u8; 32]> { - let digest = prepared_search_package_digest(package_bytes) - .map_err(|error| to_napi_contract_error(&error))?; +fn prepared_search_package_cache_key(package_bytes: &[u8]) -> [u8; 32] { let mut hasher = blake3::Hasher::new(); hasher.update(b"prepared-package"); - hasher.update(&digest); - let len = u64::try_from(package_bytes.len()).map_err(|_| { - Error::from_reason(format!( - "Prepared package byte length exceeds u64 range: {}", - package_bytes.len() - )) - })?; - hasher.update(&len.to_le_bytes()); - Ok(*hasher.finalize().as_bytes()) + hasher.update(package_bytes); + *hasher.finalize().as_bytes() } fn with_prepared_search_cache( diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 8d5ebce0..b9ea4a38 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -500,7 +500,7 @@ function runNativeStaticFixtureSweep({ runner, fixtures }) { const start = Bun.nanoseconds(); const result = runner.prepared.redactStaticEntities(fullText, undefined); const ms = elapsedMs(start); - const snapshot = toNativeSnapshot(result); + const 
snapshot = toNativeSnapshot(fullText, result); results.push({ fixture: relative(FIXTURES_DIR, fixturePath), ms, @@ -667,7 +667,7 @@ function toSnapshot(indexModule, fullText, entities, context) { }; } -function toNativeSnapshot(result) { +function toNativeSnapshot(fullText, result) { const entities = result.resolvedEntities.toSorted( (left, right) => left.start - right.start || @@ -686,8 +686,8 @@ function toNativeSnapshot(result) { entities: entities.map(({ start, end, label, text, source }) => ({ start, end, - byteStart: start, - byteEnd: end, + byteStart: utf16OffsetToUtf8ByteOffset(fullText, start), + byteEnd: utf16OffsetToUtf8ByteOffset(fullText, end), label, text, source, diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 108655e2..e6234a8e 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -525,6 +525,34 @@ describe("native adapter parity", () => { ).toEqual(expectedJson); }); + test("prepared package cache verifies same-length corrupted bytes", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. 
near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const configBytes = Buffer.from(CONFIG_JSON); + const packageBytes = + adapters.native.prepareStaticSearchPackageBytes(configBytes); + + const prepared = + adapters.native.NativePreparedSearch.fromPreparedPackageBytes( + packageBytes, + ); + expect(prepared.redactStaticEntities(text)).toBeDefined(); + + const corrupted = Buffer.from(packageBytes); + const lastIndex = corrupted.length - 1; + const lastByte = corrupted.at(lastIndex); + if (lastByte === undefined) { + throw new Error("prepared package unexpectedly empty"); + } + corrupted.writeUInt8(lastByte ^ 0x01, lastIndex); + + expect(() => + adapters.native.NativePreparedSearch.fromPreparedPackageBytes(corrupted), + ).toThrow(); + }); + test("prepared search accepts compressed package bytes through TS and Python adapters", () => { const adapters = getAdapters(); const text = @@ -557,6 +585,26 @@ describe("native adapter parity", () => { ).toEqual(expectedJson); }); + test("JSON operator config accepts camel-case redactString", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. 
near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + + const result = JSON.parse( + adapters.native.redactStaticEntitiesJson( + CONFIG_JSON, + text, + JSON.stringify({ + operators: { country: "redact" }, + redactString: "***", + }), + ), + ) as StaticRedactionResult; + + expect(result.redaction.redacted_text).toContain("***"); + }); + test("diagnostics JSON is identical through TS and Python adapters", () => { const adapters = getAdapters(); const text = From 942b095d3bac8c1a85be37b9dbb8cb25291780c4 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 12:36:23 +0200 Subject: [PATCH 034/130] fix: trim date span fillers --- .../anonymize-core/src/resolution/sanitize.rs | 49 +++++++++++++++++++ crates/anonymize-core/tests/resolution.rs | 26 ++++++++++ 2 files changed, 75 insertions(+) diff --git a/crates/anonymize-core/src/resolution/sanitize.rs b/crates/anonymize-core/src/resolution/sanitize.rs index 01d8d5f4..df41c1f1 100644 --- a/crates/anonymize-core/src/resolution/sanitize.rs +++ b/crates/anonymize-core/src/resolution/sanitize.rs @@ -66,6 +66,16 @@ fn clean_entity_text( break; } + trim_leading_date_artifacts(entity, raw_text, &mut start_byte, end_byte); + + while let Some((ch, len)) = first_char(raw_text.get(start_byte..end_byte)?) { + if ch.is_whitespace() { + start_byte = start_byte.saturating_add(len); + continue; + } + break; + } + while let Some((ch, len)) = last_char(raw_text.get(start_byte..end_byte)?) 
{ if ch.is_whitespace() || is_trailing_trim(ch, &entity.label) { end_byte = end_byte.saturating_sub(len); @@ -137,6 +147,45 @@ fn is_leading_trim(ch: char, label: &str) -> bool { } } +fn trim_leading_date_artifacts( + entity: &PipelineEntity, + raw_text: &str, + start_byte: &mut usize, + end_byte: usize, +) { + if !matches!(entity.label.as_str(), "date" | "date of birth") { + return; + } + + let Some(text) = raw_text.get(*start_byte..end_byte) else { + return; + }; + let dot_len = leading_dot_run_len(text); + if dot_len == 0 { + return; + } + + let should_trim = dot_len >= 2 + || text + .get(dot_len..) + .and_then(|suffix| suffix.chars().next()) + .is_some_and(char::is_whitespace); + if should_trim { + *start_byte = (*start_byte).saturating_add(dot_len); + } +} + +fn leading_dot_run_len(text: &str) -> usize { + let mut len = 0usize; + for ch in text.chars() { + if ch != '.' { + break; + } + len = len.saturating_add(ch.len_utf8()); + } + len +} + fn is_trailing_trim(ch: char, label: &str) -> bool { if label_allows_colon(label) { matches!( diff --git a/crates/anonymize-core/tests/resolution.rs b/crates/anonymize-core/tests/resolution.rs index 85b1e885..2179a91e 100644 --- a/crates/anonymize-core/tests/resolution.rs +++ b/crates/anonymize-core/tests/resolution.rs @@ -240,6 +240,32 @@ fn sanitize_trims_punctuation_and_updates_byte_offsets() { assert_eq!(entity.end, 23); } +#[test] +fn sanitize_trims_leading_date_ellipsis() { + let mut input = text_entity("...2. 2. 2026", "date", DetectionSource::Regex); + input.start = 10; + input.end = 10_u32.saturating_add(byte_len(&input.text)); + + let result = sanitize_entities(&[input]); + assert_eq!(result.len(), 1); + let entity = result.first().expect("result"); + assert_eq!(entity.text, "2. 2. 2026"); + assert_eq!(entity.start, 13); +} + +#[test] +fn sanitize_trims_single_dot_date_filler() { + let mut input = text_entity(". 2. 2. 
2026", "date", DetectionSource::Regex); + input.start = 10; + input.end = 10_u32.saturating_add(byte_len(&input.text)); + + let result = sanitize_entities(&[input]); + assert_eq!(result.len(), 1); + let entity = result.first().expect("result"); + assert_eq!(entity.text, "2. 2. 2026"); + assert_eq!(entity.start, 12); +} + #[test] fn sanitize_preserves_literal_dictionary_punctuation() { let result = sanitize_entities(&[ From a178545d34a90c3e34ea397d562f1c6938900ac6 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 13:18:00 +0200 Subject: [PATCH 035/130] chore: use stdnum core validators --- Cargo.lock | 6 + crates/anonymize-core/Cargo.toml | 1 + crates/anonymize-core/src/validators.rs | 640 +----------------------- 3 files changed, 9 insertions(+), 638 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 12cb1397..5717d1ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -587,6 +587,7 @@ dependencies = [ "fancy-regex", "regex", "serde", + "stella-stdnum-core", "stella-text-search-core", ] @@ -636,6 +637,11 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "stella-stdnum-core" +version = "2.1.1" +source = "git+https://github.com/stella/stdnum?rev=7c602fde6a6043e5c2efd2f6b73917c100b7e450#7c602fde6a6043e5c2efd2f6b73917c100b7e450" + [[package]] name = "stella-text-search-core" version = "1.0.6" diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index 3148368f..be5dd83c 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -13,6 +13,7 @@ categories = ["text-processing"] fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } +stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "7c602fde6a6043e5c2efd2f6b73917c100b7e450" } stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "a5d6e11f5c832be50cba42882d2844394adb9403" } [lints] diff --git 
a/crates/anonymize-core/src/validators.rs b/crates/anonymize-core/src/validators.rs index 632fb77a..e2e3dfa0 100644 --- a/crates/anonymize-core/src/validators.rs +++ b/crates/anonymize-core/src/validators.rs @@ -1,8 +1,5 @@ -const SPANISH_CHECK_LETTERS: &str = "TRWAGMYFPDXBNJZSQVHLCKE"; -const SPANISH_CIF_LETTERS: &str = "JABCDEFGHI"; - pub(crate) fn validate_named_id(validator: &str, value: &str) -> bool { - validate_id(validator, value, None) + stella_stdnum_core::validate_named_id(validator, value) } pub(crate) fn validate_id( @@ -10,638 +7,5 @@ pub(crate) fn validate_id( value: &str, input: Option<&str>, ) -> bool { - let candidate = validator_candidate(value, input); - match validator { - "au.abn" => validate_au_abn(&candidate), - "br.cnpj" => validate_cnpj(&candidate), - "br.cpf" => validate_cpf(&candidate), - "cz.dic" => validate_cz_dic(&candidate), - "cz.rc" => validate_cz_rc(&candidate), - "es.cif" => validate_es_cif(&candidate), - "es.dni" => validate_es_dni(&candidate), - "es.nie" => validate_es_nie(&candidate), - "gb.nhs" => validate_gb_nhs(&candidate), - "gb.nino" => validate_gb_nino(&candidate), - "no.mva" => validate_no_mva(&candidate), - "no.orgnr" => validate_no_orgnr(&candidate), - "us.ein" => validate_us_ein(&candidate), - "us.rtn" => validate_us_routing(&candidate), - _ => false, - } -} - -fn validator_candidate(value: &str, input: Option<&str>) -> String { - match input { - Some("digits-only") => decimal_digit_chars(value).collect(), - _ => value.to_owned(), - } -} - -fn validate_us_ein(value: &str) -> bool { - let compact = compact_without(value, &[' ', '-']); - if compact.len() != 9 || !is_ascii_digits(&compact) { - return false; - } - let Some(prefix) = compact.get(0..2) else { - return false; - }; - matches!( - prefix, - "01" - | "02" - | "03" - | "04" - | "05" - | "06" - | "10" - | "11" - | "12" - | "13" - | "14" - | "15" - | "16" - | "20" - | "21" - | "22" - | "23" - | "24" - | "25" - | "26" - | "27" - | "30" - | "31" - | "32" - | "33" - 
| "34" - | "35" - | "36" - | "37" - | "38" - | "39" - | "40" - | "41" - | "42" - | "43" - | "44" - | "45" - | "46" - | "47" - | "48" - | "50" - | "51" - | "52" - | "53" - | "54" - | "55" - | "56" - | "57" - | "58" - | "59" - | "60" - | "61" - | "62" - | "63" - | "64" - | "65" - | "66" - | "67" - | "68" - | "71" - | "72" - | "73" - | "74" - | "75" - | "76" - | "77" - | "80" - | "81" - | "82" - | "83" - | "84" - | "85" - | "86" - | "87" - | "88" - | "90" - | "91" - | "92" - | "93" - | "94" - | "95" - | "98" - | "99" - ) -} - -fn validate_cpf(value: &str) -> bool { - let compact = compact_without(value, &[' ', '-', '.']); - let Ok(digits) = <[u32; 11]>::try_from(decimal_digits_strict(&compact)) - else { - return false; - }; - let [d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10] = digits; - if digits.iter().all(|digit| *digit == d0) { - return false; - } - let first = cpf_digit(&[d0, d1, d2, d3, d4, d5, d6, d7, d8], 10); - let second = cpf_digit(&[d0, d1, d2, d3, d4, d5, d6, d7, d8, d9], 11); - d9 == first && d10 == second -} - -fn cpf_digit(digits: &[u32], weight_start: u32) -> u32 { - let sum = digits - .iter() - .enumerate() - .map(|(index, digit)| { - let index = u32::try_from(index).unwrap_or(u32::MAX); - digit.saturating_mul(weight_start.saturating_sub(index)) - }) - .sum::(); - let value = 11_u32.saturating_sub(sum.rem_euclid(11)); - if value >= 10 { 0 } else { value } -} - -fn validate_cnpj(value: &str) -> bool { - let compact = compact_without(value, &[' ', '-', '.', '/']).to_uppercase(); - let chars = compact.chars().collect::>(); - let Ok(chars) = <[char; 14]>::try_from(chars) else { - return false; - }; - if !chars - .iter() - .all(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase()) - { - return false; - } - if chars.iter().take(12).all(|ch| *ch == '0') { - return false; - } - let first = cnpj_digit( - chars.get(..12).unwrap_or(&[]), - &[5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2], - ); - let second = cnpj_digit( - chars.get(..13).unwrap_or(&[]), - &[6, 5, 4, 3, 2, 
9, 8, 7, 6, 5, 4, 3, 2], - ); - chars.get(12).and_then(|ch| ascii_digit_value(*ch)) == Some(first) - && chars.get(13).and_then(|ch| ascii_digit_value(*ch)) == Some(second) -} - -fn cnpj_digit(chars: &[char], weights: &[u32]) -> u32 { - let sum = chars - .iter() - .zip(weights) - .filter_map(|(ch, weight)| { - cnpj_char_value(*ch).map(|value| value.saturating_mul(*weight)) - }) - .sum::(); - let value = sum.rem_euclid(11); - if value < 2 { - 0 - } else { - 11_u32.saturating_sub(value) - } -} - -fn cnpj_char_value(ch: char) -> Option { - (ch.is_ascii_digit() || ch.is_ascii_uppercase()) - .then(|| u32::from(ch).saturating_sub(u32::from('0'))) -} - -fn validate_cz_rc(value: &str) -> bool { - let compact = compact_without(value, &[' ', '/']); - let digits = decimal_digits_strict(&compact); - let len = digits.len(); - if len != 9 && len != 10 { - return false; - } - - let Some(yy) = number_from_digits(digits.get(0..2)) else { - return false; - }; - let Some(raw_month) = number_from_digits(digits.get(2..4)) else { - return false; - }; - let Some(day) = number_from_digits(digits.get(4..6)) else { - return false; - }; - - let mut year = 1900_u32.saturating_add(yy); - if len == 9 { - if year >= 1980 { - year = year.saturating_sub(100); - } - if year > 1953 { - return false; - } - } else if year < 1954 { - year = year.saturating_add(100); - } - - let Some(month) = decode_cz_month(raw_month, year, len) else { - return false; - }; - if !valid_date(year, month, day) { - return false; - } - if len != 10 { - return true; - } - - let Some(front) = number_from_digits(digits.get(0..9)) else { - return false; - }; - let Some(check) = digits.get(9).copied() else { - return false; - }; - (front % 11) % 10 == check -} - -fn decode_cz_month(raw_month: u32, year: u32, len: usize) -> Option { - let offsets: &[u32] = if len == 10 && year >= 2004 { - &[0, 50, 20, 70] - } else { - &[0, 50] - }; - offsets.iter().find_map(|offset| { - let month = raw_month.checked_sub(*offset)?; - 
(1..=12).contains(&month).then_some(month) - }) -} - -fn validate_cz_dic(value: &str) -> bool { - let mut compact = compact_without(value, &[' ', '-']); - if compact.starts_with("CZ") || compact.starts_with("cz") { - compact = compact.chars().skip(2).collect(); - } - let digits = decimal_digits_strict(&compact); - if !(8..=10).contains(&digits.len()) { - return false; - } - match digits.len() { - 8 => validate_cz_dic_legal(&digits), - 9 if digits.first() == Some(&6) => validate_cz_dic_special(&digits), - 9 | 10 => validate_cz_rc(&compact), - _ => false, - } -} - -fn validate_cz_dic_legal(digits: &[u32]) -> bool { - if digits.first() == Some(&9) { - return false; - } - let Some(check) = digits.get(7).copied() else { - return false; - }; - let sum = - weighted_sum(digits.get(0..7).unwrap_or(&[]), &[8, 7, 6, 5, 4, 3, 2]) - .rem_euclid(11); - let v11 = 11_u32.saturating_sub(sum).rem_euclid(11); - let expected = if v11 == 0 { 1 } else { v11 % 10 }; - check == expected -} - -fn validate_cz_dic_special(digits: &[u32]) -> bool { - let Some(check_digit) = digits.get(8).copied() else { - return false; - }; - let sum = - weighted_sum(digits.get(1..8).unwrap_or(&[]), &[8, 7, 6, 5, 4, 3, 2]) - .rem_euclid(11); - let inner = 10_u32.saturating_add(11).saturating_sub(sum).rem_euclid(11); - let check = 8_u32 - .saturating_add(10) - .saturating_sub(inner) - .rem_euclid(10); - check_digit == check -} - -fn validate_gb_nhs(value: &str) -> bool { - let digits = decimal_digits_strict(value); - let Ok(digits) = <[u32; 10]>::try_from(digits) else { - return false; - }; - let [d0, d1, d2, d3, d4, d5, d6, d7, d8, d9] = digits; - let total = weighted_sum( - &[d0, d1, d2, d3, d4, d5, d6, d7, d8], - &[10, 9, 8, 7, 6, 5, 4, 3, 2], - ); - let check = 11_u32.saturating_sub(total.rem_euclid(11)); - let expected = match check { - 10 => return false, - 11 => 0, - candidate => candidate, - }; - d9 == expected -} - -fn validate_gb_nino(value: &str) -> bool { - let compact = compact_without(value, &[' 
', '-']).to_uppercase(); - let chars = compact.chars().collect::>(); - let Ok(chars) = <[char; 9]>::try_from(chars) else { - return false; - }; - let [first, second, d0, d1, d2, d3, d4, d5, suffix] = chars; - if !matches!( - first, - 'A' - | 'B' - | 'C' - | 'E' - | 'G' - | 'H' - | 'J' - | 'K' - | 'L' - | 'M' - | 'N' - | 'O' - | 'P' - | 'R' - | 'S' - | 'T' - | 'W' - | 'X' - | 'Y' - | 'Z' - ) { - return false; - } - if !matches!( - second, - 'A' - | 'B' - | 'C' - | 'E' - | 'G' - | 'H' - | 'J' - | 'K' - | 'L' - | 'M' - | 'N' - | 'P' - | 'R' - | 'S' - | 'T' - | 'W' - | 'X' - | 'Y' - | 'Z' - ) { - return false; - } - if ![d0, d1, d2, d3, d4, d5].iter().all(char::is_ascii_digit) { - return false; - } - if !matches!(suffix, 'A' | 'B' | 'C' | 'D') { - return false; - } - let prefix = [first, second].iter().collect::(); - !matches!( - prefix.as_str(), - "BG" | "GB" | "NK" | "KN" | "TN" | "NT" | "ZZ" - ) -} - -fn validate_es_dni(value: &str) -> bool { - let compact = compact_without(value, &[' ', '-']).to_uppercase(); - let chars = compact.chars().collect::>(); - let Ok(chars) = <[char; 9]>::try_from(chars) else { - return false; - }; - let [d0, d1, d2, d3, d4, d5, d6, d7, letter] = chars; - let digits = [d0, d1, d2, d3, d4, d5, d6, d7]; - let Some(number) = number_from_ascii_digits(&digits) else { - return false; - }; - spanish_check_letter(number) == Some(letter) -} - -fn validate_es_nie(value: &str) -> bool { - let compact = compact_without(value, &[' ', '-']).to_uppercase(); - let chars = compact.chars().collect::>(); - let Ok(chars) = <[char; 9]>::try_from(chars) else { - return false; - }; - let [prefix, d0, d1, d2, d3, d4, d5, d6, letter] = chars; - let prefix_value: u32 = match prefix { - 'X' => 0, - 'Y' => 1, - 'Z' => 2, - _ => return false, - }; - let digits = [d0, d1, d2, d3, d4, d5, d6]; - let Some(number) = number_from_ascii_digits(&digits) else { - return false; - }; - spanish_check_letter( - prefix_value - .saturating_mul(10_000_000) - .saturating_add(number), 
- ) == Some(letter) -} - -fn validate_es_cif(value: &str) -> bool { - let mut compact = compact_without(value, &[' ', '-', '/', '.']); - if compact.starts_with("ES") || compact.starts_with("es") { - compact = compact.chars().skip(2).collect(); - } - let compact = compact.to_uppercase(); - let chars = compact.chars().collect::>(); - let Ok(chars) = <[char; 9]>::try_from(chars) else { - return false; - }; - let [prefix, d0, d1, d2, d3, d4, d5, d6, check] = chars; - if !matches!( - prefix, - 'A' - | 'B' - | 'C' - | 'D' - | 'E' - | 'F' - | 'G' - | 'H' - | 'J' - | 'N' - | 'P' - | 'Q' - | 'R' - | 'S' - | 'U' - | 'V' - | 'W' - ) { - return false; - } - let digits = [d0, d1, d2, d3, d4, d5, d6]; - if !digits.iter().all(char::is_ascii_digit) { - return false; - } - let Some(cif_check) = spanish_cif_checksum(&digits) else { - return false; - }; - ascii_digit_value(check) == Some(cif_check) - || char_at(SPANISH_CIF_LETTERS, cif_check) == Some(check) -} - -fn spanish_check_letter(number: u32) -> Option { - char_at(SPANISH_CHECK_LETTERS, number % 23) -} - -fn spanish_cif_checksum(digits: &[char; 7]) -> Option { - let mut even = 0_u32; - let mut odd = 0_u32; - for (index, ch) in digits.iter().enumerate() { - let digit = ascii_digit_value(*ch)?; - if index.is_multiple_of(2) { - let doubled = digit.saturating_mul(2); - odd = odd.saturating_add( - doubled - .div_euclid(10) - .saturating_add(doubled.rem_euclid(10)), - ); - } else { - even = even.saturating_add(digit); - } - } - Some( - 10_u32 - .saturating_sub(even.saturating_add(odd).rem_euclid(10)) - .rem_euclid(10), - ) -} - -fn validate_au_abn(value: &str) -> bool { - let compact = compact_without(value, &[' ', '-']); - let Ok(mut digits) = <[u32; 11]>::try_from(decimal_digits_strict(&compact)) - else { - return false; - }; - let Some(first) = digits.first_mut() else { - return false; - }; - *first = (*first).saturating_sub(1); - weighted_sum(&digits, &[10, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19]) - .is_multiple_of(89) -} - -fn 
validate_no_orgnr(value: &str) -> bool { - let compact = compact_without(value, &[' ', '-']); - let digits = decimal_digits_strict(&compact); - if digits.len() != 9 { - return false; - } - weighted_sum(&digits, &[3, 2, 7, 6, 5, 4, 3, 2, 1]).is_multiple_of(11) -} - -fn validate_no_mva(value: &str) -> bool { - let mut compact = compact_without(value, &[' ', '-']).to_uppercase(); - if compact.starts_with("NO") { - compact = compact.chars().skip(2).collect(); - } - if !compact.ends_with("MVA") { - return false; - } - let digits = compact - .get(..compact.len().saturating_sub(3)) - .unwrap_or_default(); - validate_no_orgnr(digits) -} - -fn validate_us_routing(value: &str) -> bool { - let compact = compact_without(value, &[' ', '-']); - let Ok(digits) = <[u32; 9]>::try_from(decimal_digits_strict(&compact)) else { - return false; - }; - let [d0, d1, d2, d3, d4, d5, d6, d7, d8] = digits; - let prefix = d0.saturating_mul(10).saturating_add(d1); - if !((1..=12).contains(&prefix) - || (21..=32).contains(&prefix) - || (61..=72).contains(&prefix) - || prefix == 80) - { - return false; - } - let first = d0.saturating_add(d3).saturating_add(d6).saturating_mul(3); - let second = d1.saturating_add(d4).saturating_add(d7).saturating_mul(7); - let checksum = first - .saturating_add(second) - .saturating_add(d2) - .saturating_add(d5) - .saturating_add(d8); - checksum.is_multiple_of(10) -} - -fn compact_without(value: &str, skipped: &[char]) -> String { - value.chars().filter(|ch| !skipped.contains(ch)).collect() -} - -fn decimal_digits(value: &str) -> Vec { - decimal_digit_chars(value) - .filter_map(|ch| ch.to_digit(10)) - .collect() -} - -fn decimal_digits_strict(value: &str) -> Vec { - if !is_ascii_digits(value) { - return Vec::new(); - } - decimal_digits(value) -} - -fn decimal_digit_chars(value: &str) -> impl Iterator + '_ { - value.chars().filter(char::is_ascii_digit) -} - -fn is_ascii_digits(value: &str) -> bool { - !value.is_empty() && value.chars().all(|ch| ch.is_ascii_digit()) 
-} - -fn ascii_digit_value(ch: char) -> Option { - ch.to_digit(10).filter(|_| ch.is_ascii_digit()) -} - -fn number_from_digits(digits: Option<&[u32]>) -> Option { - digits?.iter().try_fold(0_u32, |total, digit| { - total.checked_mul(10)?.checked_add(*digit) - }) -} - -fn number_from_ascii_digits(chars: &[char]) -> Option { - chars.iter().try_fold(0_u32, |total, ch| { - total.checked_mul(10)?.checked_add(ascii_digit_value(*ch)?) - }) -} - -fn char_at(text: &str, index: u32) -> Option { - usize::try_from(index) - .ok() - .and_then(|index| text.chars().nth(index)) -} - -fn weighted_sum(digits: &[u32], weights: &[u32]) -> u32 { - digits - .iter() - .zip(weights) - .map(|(digit, weight)| digit.saturating_mul(*weight)) - .sum() -} - -fn valid_date(year: u32, month: u32, day: u32) -> bool { - let days = match month { - 1 | 3 | 5 | 7 | 8 | 10 | 12 => 31, - 4 | 6 | 9 | 11 => 30, - 2 if is_leap_year(year) => 29, - 2 => 28, - _ => return false, - }; - (1..=days).contains(&day) -} - -const fn is_leap_year(year: u32) -> bool { - year.is_multiple_of(4) && !year.is_multiple_of(100) - || year.is_multiple_of(400) + stella_stdnum_core::validate_id(validator, value, input) } From 401ad516000ffcaffedfa277b8bb045a7ef14273 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 13:28:18 +0200 Subject: [PATCH 036/130] chore: update stdnum validator pin --- Cargo.lock | 2 +- crates/anonymize-core/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5717d1ea..a9be974a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -640,7 +640,7 @@ dependencies = [ [[package]] name = "stella-stdnum-core" version = "2.1.1" -source = "git+https://github.com/stella/stdnum?rev=7c602fde6a6043e5c2efd2f6b73917c100b7e450#7c602fde6a6043e5c2efd2f6b73917c100b7e450" +source = "git+https://github.com/stella/stdnum?rev=9704bdeac1d7e58407a8b50bb7e95e23953175ca#9704bdeac1d7e58407a8b50bb7e95e23953175ca" [[package]] name = "stella-text-search-core" diff --git 
a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index be5dd83c..e3f202e6 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -13,7 +13,7 @@ categories = ["text-processing"] fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } -stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "7c602fde6a6043e5c2efd2f6b73917c100b7e450" } +stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "9704bdeac1d7e58407a8b50bb7e95e23953175ca" } stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "a5d6e11f5c832be50cba42882d2844394adb9403" } [lints] From d5bf5c2896b2ec7882ce5440254f1f19f45b626c Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 13:31:03 +0200 Subject: [PATCH 037/130] chore: update stdnum validator pin --- Cargo.lock | 2 +- crates/anonymize-core/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a9be974a..560ef2a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -640,7 +640,7 @@ dependencies = [ [[package]] name = "stella-stdnum-core" version = "2.1.1" -source = "git+https://github.com/stella/stdnum?rev=9704bdeac1d7e58407a8b50bb7e95e23953175ca#9704bdeac1d7e58407a8b50bb7e95e23953175ca" +source = "git+https://github.com/stella/stdnum?rev=614ac70ef76161e2bbb6dcbe5ecc3923bbe8c33f#614ac70ef76161e2bbb6dcbe5ecc3923bbe8c33f" [[package]] name = "stella-text-search-core" diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index e3f202e6..77750485 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -13,7 +13,7 @@ categories = ["text-processing"] fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } -stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = 
"9704bdeac1d7e58407a8b50bb7e95e23953175ca" } +stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "614ac70ef76161e2bbb6dcbe5ecc3923bbe8c33f" } stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "a5d6e11f5c832be50cba42882d2844394adb9403" } [lints] From d2f923c959d7c4186d7c4f9b6ae4e437e8e65b06 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 13:45:38 +0200 Subject: [PATCH 038/130] fix: tighten native static parity --- Cargo.lock | 2 +- crates/anonymize-adapter-contract/src/lib.rs | 20 ++- crates/anonymize-core/Cargo.toml | 2 +- crates/anonymize-core/src/prepared.rs | 43 +++++ crates/anonymize-core/src/search.rs | 6 + crates/anonymize-core/src/triggers.rs | 5 +- crates/anonymize-core/tests/prepared.rs | 166 +++++++++++++++++- crates/anonymize-core/tests/search.rs | 46 ++++- crates/anonymize-napi/src/lib.rs | 1 + .../src/__test__/dictionary-bundle.test.ts | 21 +++ .../anonymize/src/build-unified-search.ts | 6 +- packages/data/dictionaries/index.ts | 14 +- 12 files changed, 314 insertions(+), 18 deletions(-) create mode 100644 packages/anonymize/src/__test__/dictionary-bundle.test.ts diff --git a/Cargo.lock b/Cargo.lock index 560ef2a9..3e66cc33 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -645,7 +645,7 @@ source = "git+https://github.com/stella/stdnum?rev=614ac70ef76161e2bbb6dcbe5ecc3 [[package]] name = "stella-text-search-core" version = "1.0.6" -source = "git+https://github.com/stella/text-search?rev=a5d6e11f5c832be50cba42882d2844394adb9403#a5d6e11f5c832be50cba42882d2844394adb9403" +source = "git+https://github.com/stella/text-search?rev=f3a3b23adac4e093d8ede69c04e74f6f7a57ea91#f3a3b23adac4e093d8ede69c04e74f6f7a57ea91" dependencies = [ "stella-aho-corasick-core", "stella-fuzzy-search-core", diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index b3b90ec4..bb46c1b9 100644 --- 
a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -104,6 +104,7 @@ pub struct BindingSearchOptions { pub literal_case_insensitive: Option, pub literal_whole_words: Option, pub regex_whole_words: Option, + pub regex_overlap_all: Option, pub fuzzy_case_insensitive: Option, pub fuzzy_whole_words: Option, pub fuzzy_normalize_diacritics: Option, @@ -1860,6 +1861,7 @@ fn search_options_from_binding( }, regex: RegexSearchOptions { whole_words: options.regex_whole_words.unwrap_or(false), + overlap_all: options.regex_overlap_all.unwrap_or(false), }, fuzzy: FuzzySearchOptions { case_insensitive: options.fuzzy_case_insensitive.unwrap_or(false), @@ -2032,8 +2034,8 @@ mod tests { #![allow(clippy::unwrap_used)] use super::{ - BindingOperatorConfig, BindingPreparedSearchConfig, BindingSearchPattern, - ContractError, operator_config_from_binding, + BindingOperatorConfig, BindingPreparedSearchConfig, BindingSearchOptions, + BindingSearchPattern, ContractError, operator_config_from_binding, prepared_search_config_from_binding, prepared_search_core_package_from_bytes, prepared_search_core_package_to_bytes, @@ -2092,6 +2094,20 @@ mod tests { assert_eq!(operators.redact_string, "***"); } + #[test] + fn binding_search_options_accept_regex_overlap_all() { + let config = BindingPreparedSearchConfig { + custom_regex_options: Some(BindingSearchOptions { + regex_overlap_all: Some(true), + ..BindingSearchOptions::default() + }), + ..BindingPreparedSearchConfig::default() + }; + let core = prepared_search_config_from_binding(config).unwrap(); + + assert!(core.custom_regex_options.regex.overlap_all); + } + #[test] fn prepared_search_compressed_package_roundtrips_config_and_artifacts() { let config = package_test_config(); diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index 77750485..aea707aa 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -14,7 +14,7 @@ fancy-regex = 
"0.18" regex = "1" serde = { version = "1", features = ["derive"] } stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "614ac70ef76161e2bbb6dcbe5ecc3923bbe8c33f" } -stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "a5d6e11f5c832be50cba42882d2844394adb9403" } +stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "f3a3b23adac4e093d8ede69c04e74f6f7a57ea91" } [lints] workspace = true diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index a252892f..7b68e740 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -1278,6 +1278,8 @@ fn validate_supported_config(config: &PreparedSearchConfig) -> Result<()> { validate_legal_form_config(config)?; validate_trigger_config(config)?; validate_deny_list_config(config)?; + validate_gazetteer_config(config)?; + validate_country_config(config)?; validate_address_seed_config(config) } @@ -1331,6 +1333,47 @@ fn validate_deny_list_config(config: &PreparedSearchConfig) -> Result<()> { ensure_supported_deny_list_sources(data) } +fn validate_gazetteer_config(config: &PreparedSearchConfig) -> Result<()> { + if config.slices.gazetteer.is_empty() { + return Ok(()); + } + + let Some(data) = &config.gazetteer_data else { + return Err(Error::MissingStaticData { + field: "gazetteer_data", + }); + }; + + validate_static_data_length( + "gazetteer_data.labels", + config.slices.gazetteer, + data.labels.len(), + )?; + validate_static_data_length( + "gazetteer_data.is_fuzzy", + config.slices.gazetteer, + data.is_fuzzy.len(), + ) +} + +fn validate_country_config(config: &PreparedSearchConfig) -> Result<()> { + if config.slices.countries.is_empty() { + return Ok(()); + } + + let Some(data) = &config.country_data else { + return Err(Error::MissingStaticData { + field: "country_data", + }); + }; + + validate_static_data_length( + 
"country_data.labels", + config.slices.countries, + data.labels.len(), + ) +} + const fn validate_address_seed_config( config: &PreparedSearchConfig, ) -> Result<()> { diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs index 6fa87ea0..f4653917 100644 --- a/crates/anonymize-core/src/search.rs +++ b/crates/anonymize-core/src/search.rs @@ -73,6 +73,7 @@ pub struct LiteralSearchOptions { )] pub struct RegexSearchOptions { pub whole_words: bool, + pub overlap_all: bool, } #[derive( @@ -527,6 +528,11 @@ fn regex_options( ) -> text_search::TextSearchOptions { text_search::TextSearchOptions { whole_words: options.whole_words, + overlap_strategy: if options.overlap_all { + text_search::OverlapStrategy::All + } else { + text_search::OverlapStrategy::Longest + }, ..text_search::TextSearchOptions::default() } } diff --git a/crates/anonymize-core/src/triggers.rs b/crates/anonymize-core/src/triggers.rs index e9b84aaf..67674240 100644 --- a/crates/anonymize-core/src/triggers.rs +++ b/crates/anonymize-core/src/triggers.rs @@ -673,12 +673,13 @@ fn apply_validations( text: &str, validations: &[PreparedTriggerValidation], ) -> bool { + let text_len = text.chars().count(); validations.iter().all(|validation| match validation { PreparedTriggerValidation::StartsUppercase => { text.chars().next().is_some_and(char::is_uppercase) } - PreparedTriggerValidation::MinLength(min) => text.len() >= *min, - PreparedTriggerValidation::MaxLength(max) => text.len() <= *max, + PreparedTriggerValidation::MinLength(min) => text_len >= *min, + PreparedTriggerValidation::MaxLength(max) => text_len <= *max, PreparedTriggerValidation::NoDigits => { !text.chars().any(|ch| ch.is_ascii_digit()) } diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 2a16fbda..87be5eb4 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -262,11 +262,17 @@ fn 
prepared_search_emits_static_detector_entities() { }, ], regex_options: SearchOptions { - regex: RegexSearchOptions { whole_words: false }, + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, ..SearchOptions::default() }, custom_regex_options: SearchOptions { - regex: RegexSearchOptions { whole_words: false }, + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, ..SearchOptions::default() }, literal_options: SearchOptions { @@ -324,6 +330,60 @@ fn prepared_search_emits_static_detector_entities() { assert_eq!(result.country_entities[0].source, DetectionSource::Country); } +#[test] +fn prepared_search_preserves_overlapping_custom_regex_matches() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + custom_regex_patterns: vec![ + SearchPattern::Regex(String::from("Alice")), + SearchPattern::Regex(String::from("Alice Smith")), + ], + custom_regex_options: SearchOptions { + regex: RegexSearchOptions { + whole_words: false, + overlap_all: true, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + custom_regex: PatternSlice { start: 0, end: 2 }, + ..PreparedSearchSlices::default() + }, + custom_regex_meta: vec![ + RegexMatchMeta { + label: String::from("person"), + score: 1.0, + source_detail: Some(SourceDetail::CustomRegex), + requires_validation: false, + validator_id: None, + validator_input: None, + min_byte_length: None, + }, + RegexMatchMeta { + label: String::from("person"), + score: 1.0, + source_detail: Some(SourceDetail::CustomRegex), + requires_validation: false, + validator_id: None, + validator_input: None, + min_byte_length: None, + }, + ], + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities("Alice Smith signed.") + .unwrap(); + let custom_texts = result + .custom_regex_entities + .iter() + .map(|entity| entity.text.as_str()) + .collect::>(); + + assert_eq!(custom_texts, ["Alice", "Alice Smith"]); +} + 
#[test] fn prepared_search_drops_person_spans_ending_in_trailing_noun() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -331,7 +391,10 @@ fn prepared_search_drops_person_spans_ending_in_trailing_noun() { r"\bCOBRA Reimbursement Period\b", ))], regex_options: SearchOptions { - regex: RegexSearchOptions { whole_words: false }, + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, ..SearchOptions::default() }, slices: PreparedSearchSlices { @@ -557,6 +620,49 @@ fn prepared_search_trigger_caps_by_characters_not_bytes() { ); } +#[test] +fn prepared_search_trigger_validations_count_characters_not_bytes() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("jméno"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("jméno"), + label: String::from("person"), + strategy: TriggerStrategy::NWords { count: 1 }, + validations: vec![ + TriggerValidation::MinLength(5), + TriggerValidation::MaxLength(5), + ], + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities("Smluvní jméno Áběčď bylo ověřeno.") + .unwrap(); + + assert!( + result + .trigger_entities + .iter() + .any(|entity| entity.label == "person" && entity.text == "Áběčď") + ); +} + #[test] fn prepared_search_rejects_lowercase_acronym_trigger_collisions() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -750,7 +856,10 @@ fn prepared_search_redacts_static_entities_end_to_end() { }, ], regex_options: SearchOptions { - regex: RegexSearchOptions { 
whole_words: false }, + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, ..SearchOptions::default() }, custom_regex_options: SearchOptions::default(), @@ -813,7 +922,10 @@ fn prepared_search_reports_static_redaction_diagnostics() { whole_words: Some(false), }], regex_options: SearchOptions { - regex: RegexSearchOptions { whole_words: false }, + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, ..SearchOptions::default() }, custom_regex_options: SearchOptions::default(), @@ -948,6 +1060,45 @@ fn prepared_search_rejects_unsupported_static_slices() { assert_eq!(error, Error::UnsupportedStaticSlice { slice: "deny_list" }); } +#[test] +fn prepared_search_requires_gazetteer_metadata_for_gazetteer_slice() { + let error = PreparedSearch::new(empty_config(PreparedSearchSlices { + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + })) + .err() + .expect("gazetteer slice should require metadata"); + + assert_eq!( + error, + Error::MissingStaticData { + field: "gazetteer_data" + } + ); +} + +#[test] +fn prepared_search_rejects_truncated_country_metadata() { + let error = PreparedSearch::new(PreparedSearchConfig { + country_data: Some(CountryMatchData { labels: Vec::new() }), + ..empty_config(PreparedSearchSlices { + countries: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }) + }) + .err() + .expect("truncated country metadata should be rejected"); + + assert_eq!( + error, + Error::StaticDataLengthMismatch { + field: "country_data.labels", + expected: 1, + actual: 0 + } + ); +} + #[test] fn prepared_search_requires_address_seed_data_for_street_types() { let error = PreparedSearch::new(empty_config(PreparedSearchSlices { @@ -1348,7 +1499,10 @@ fn prepared_search_does_not_cluster_address_seed_inside_register_span() { ))], regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], regex_options: SearchOptions { - regex: RegexSearchOptions { whole_words: false 
}, + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, ..SearchOptions::default() }, literal_patterns: vec![SearchPattern::LiteralWithOptions { diff --git a/crates/anonymize-core/tests/search.rs b/crates/anonymize-core/tests/search.rs index defb4edf..d231e10a 100644 --- a/crates/anonymize-core/tests/search.rs +++ b/crates/anonymize-core/tests/search.rs @@ -21,7 +21,10 @@ fn search_index_routes_literal_regex_and_fuzzy_patterns() { case_insensitive: false, whole_words: true, }, - regex: RegexSearchOptions { whole_words: false }, + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, fuzzy: FuzzySearchOptions { case_insensitive: true, whole_words: true, @@ -127,6 +130,42 @@ fn search_index_returns_overlapping_literal_matches() { ); } +#[test] +fn search_index_can_return_overlapping_regex_matches() { + let index = SearchIndex::new( + vec![ + SearchPattern::Regex(String::from("Alice")), + SearchPattern::Regex(String::from("Alice Smith")), + ], + SearchOptions { + regex: RegexSearchOptions { + whole_words: false, + overlap_all: true, + }, + ..SearchOptions::default() + }, + ) + .unwrap(); + + let matches = index.find_iter("Alice Smith signed.").unwrap(); + + assert_eq!( + matches, + vec![ + SearchMatch::Regex { + pattern: 0, + start: 0, + end: 5, + }, + SearchMatch::Regex { + pattern: 1, + start: 0, + end: 11, + }, + ] + ); +} + #[test] fn search_index_supports_per_pattern_literal_word_boundaries() { let index = SearchIndex::new( @@ -239,7 +278,10 @@ fn search_index_prepared_artifacts_match_direct_index() { case_insensitive: false, whole_words: true, }, - regex: RegexSearchOptions { whole_words: false }, + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, fuzzy: FuzzySearchOptions { case_insensitive: true, whole_words: true, diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index a726ca15..45c7a1a1 100644 --- a/crates/anonymize-napi/src/lib.rs +++ 
b/crates/anonymize-napi/src/lib.rs @@ -83,6 +83,7 @@ pub struct JsSearchOptions { pub literal_case_insensitive: Option, pub literal_whole_words: Option, pub regex_whole_words: Option, + pub regex_overlap_all: Option, pub fuzzy_case_insensitive: Option, pub fuzzy_whole_words: Option, pub fuzzy_normalize_diacritics: Option, diff --git a/packages/anonymize/src/__test__/dictionary-bundle.test.ts b/packages/anonymize/src/__test__/dictionary-bundle.test.ts new file mode 100644 index 00000000..38c5a81d --- /dev/null +++ b/packages/anonymize/src/__test__/dictionary-bundle.test.ts @@ -0,0 +1,21 @@ +import { describe, expect, setDefaultTimeout, test } from "bun:test"; + +setDefaultTimeout(60_000); + +import { loadDictionaryBundle } from "../../../data/dictionaries/index"; + +describe("dictionary bundle scoping", () => { + test("empty country scope keeps default city dictionaries", async () => { + const bundle = await loadDictionaryBundle({ countries: [] }); + + expect(bundle.cities.length).toBeGreaterThan(0); + expect(Object.keys(bundle.citiesByCountry)).toContain("CZ"); + }); + + test("unsupported name language scope falls back to packaged names", async () => { + const bundle = await loadDictionaryBundle({ nameLanguages: ["pt-br"] }); + + expect(Object.keys(bundle.firstNames).length).toBeGreaterThan(0); + expect(Object.keys(bundle.surnames).length).toBeGreaterThan(0); + }); +}); diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 9025e70c..f18766bb 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -109,6 +109,7 @@ export type NativeSearchOptions = { literal_case_insensitive?: boolean; literal_whole_words?: boolean; regex_whole_words?: boolean; + regex_overlap_all?: boolean; fuzzy_case_insensitive?: boolean; fuzzy_whole_words?: boolean; fuzzy_normalize_diacritics?: boolean; @@ -970,7 +971,10 @@ const buildNativeStaticConfig = ({ 
literal_whole_words: false, regex_whole_words: false, }, - custom_regex_options: { regex_whole_words: false }, + custom_regex_options: { + regex_whole_words: false, + regex_overlap_all: true, + }, literal_options: { literal_case_insensitive: true, literal_whole_words: canUseGlobalWholeWordLiterals, diff --git a/packages/data/dictionaries/index.ts b/packages/data/dictionaries/index.ts index 98d8ce91..150e36ae 100644 --- a/packages/data/dictionaries/index.ts +++ b/packages/data/dictionaries/index.ts @@ -1033,7 +1033,9 @@ export const loadDictionaryBundle = async ({ const countryScope = normalizeCountryCodes(countries); const scopedNameLanguages = normalizeNameLanguages(nameLanguages); const hasScopedNames = - nameLanguages !== undefined && nameLanguages.length > 0; + nameLanguages !== undefined && + nameLanguages.length > 0 && + scopedNameLanguages.length > 0; const dictionaryIds = ALL_DICTIONARY_IDS.filter((id) => dictionaryIdIsInScope(id, countryScope, hasScopedNames), ); @@ -1050,8 +1052,14 @@ export const loadDictionaryBundle = async ({ denyListMeta[id] = DICTIONARY_META[id]; } - const nameDictionaries = await loadNameDictionaries(scopedNameLanguages); - const cityScope = cityCountries ?? countries ?? DEFAULT_CITY_COUNTRIES; + const nameDictionaries = await loadNameDictionaries( + scopedNameLanguages.length > 0 ? scopedNameLanguages : undefined, + ); + const requestedCityScope = cityCountries ?? countries; + const cityScope = + requestedCityScope === undefined || requestedCityScope.length === 0 + ? 
DEFAULT_CITY_COUNTRIES + : requestedCityScope; const cityResults = await Promise.all( cityScope.map(async (country) => ({ country: country.toUpperCase(), From 5004645b2d79b8665ad992733c563e89f842e4c8 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 19:25:58 +0200 Subject: [PATCH 039/130] fix: tighten native redaction config --- Cargo.lock | 2 +- crates/anonymize-adapter-contract/src/lib.rs | 20 +- crates/anonymize-core/Cargo.toml | 2 +- crates/anonymize-core/src/prepared.rs | 147 +++++++++++- crates/anonymize-core/src/triggers.rs | 86 ++++++- crates/anonymize-core/tests/prepared.rs | 216 +++++++++++++++++- .../src/__test__/pipeline-config.test.ts | 16 ++ .../anonymize/src/build-unified-search.ts | 16 ++ 8 files changed, 474 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3e66cc33..f0f5fd44 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -645,7 +645,7 @@ source = "git+https://github.com/stella/stdnum?rev=614ac70ef76161e2bbb6dcbe5ecc3 [[package]] name = "stella-text-search-core" version = "1.0.6" -source = "git+https://github.com/stella/text-search?rev=f3a3b23adac4e093d8ede69c04e74f6f7a57ea91#f3a3b23adac4e093d8ede69c04e74f6f7a57ea91" +source = "git+https://github.com/stella/text-search?rev=0cfaad48a3df24f918cf52a2d5aaf32f5a031148#0cfaad48a3df24f918cf52a2d5aaf32f5a031148" dependencies = [ "stella-aho-corasick-core", "stella-fuzzy-search-core", diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index bb46c1b9..246e6734 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -18,13 +18,13 @@ use stella_anonymize_core::{ pub type Result = std::result::Result; const PREPARED_SEARCH_PACKAGE_HEADER: [u8; 8] = *b"ANONPKG1"; -const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 3; +const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 4; const PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONPKZ1"; -const 
PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 1; +const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 2; const PREPARED_SEARCH_CORE_PACKAGE_HEADER: [u8; 8] = *b"ANONCPK1"; -const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 2; +const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 3; const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONCPZ1"; -const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 2; +const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 3; const PREPARED_SEARCH_PACKAGE_DIGEST_BYTES: usize = 32; const PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL: i32 = 3; @@ -386,6 +386,10 @@ pub struct BindingPreparedSearchConfig { #[serde(default)] pub literal_patterns_from_deny_list_data: bool, #[serde(default)] + pub allowed_labels: Vec, + #[serde(default)] + pub threshold: f64, + #[serde(default)] pub slices: BindingPreparedSearchSlices, #[serde(default)] pub regex_meta: Vec, @@ -448,6 +452,8 @@ struct BinaryPreparedSearchConfig { custom_regex_options: Option, literal_options: Option, literal_patterns_from_deny_list_data: bool, + allowed_labels: Vec, + threshold: f64, slices: BindingPreparedSearchSlices, regex_meta: Vec, custom_regex_meta: Vec, @@ -643,6 +649,8 @@ impl From for BinaryPreparedSearchConfig { literal_options: config.literal_options, literal_patterns_from_deny_list_data: config .literal_patterns_from_deny_list_data, + allowed_labels: config.allowed_labels, + threshold: config.threshold, slices: config.slices, regex_meta: config.regex_meta, custom_regex_meta: config.custom_regex_meta, @@ -669,6 +677,8 @@ impl From for BindingPreparedSearchConfig { literal_options: config.literal_options, literal_patterns_from_deny_list_data: config .literal_patterns_from_deny_list_data, + allowed_labels: config.allowed_labels, + threshold: config.threshold, slices: config.slices, regex_meta: config.regex_meta, custom_regex_meta: config.custom_regex_meta, @@ -1101,6 +1111,8 @@ pub fn prepared_search_config_from_binding( 
config.custom_regex_options, ), literal_options: search_options_from_binding(config.literal_options), + allowed_labels: config.allowed_labels, + threshold: config.threshold, slices: slices_from_binding(&config.slices), regex_meta: regex_meta_from_binding(config.regex_meta)?, custom_regex_meta: regex_meta_from_binding(config.custom_regex_meta)?, diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index aea707aa..ad0deb8b 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -14,7 +14,7 @@ fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "614ac70ef76161e2bbb6dcbe5ecc3923bbe8c33f" } -stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "f3a3b23adac4e093d8ede69c04e74f6f7a57ea91" } +stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "0cfaad48a3df24f918cf52a2d5aaf32f5a031148" } [lints] workspace = true diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 7b68e740..579e5825 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -42,6 +42,8 @@ pub struct PreparedSearch { legal_forms: SearchIndex, triggers: SearchIndex, literals: SearchIndex, + allowed_labels: Vec, + threshold: f64, slices: PreparedSearchSlices, regex_meta: Vec, custom_regex_meta: Vec, @@ -77,6 +79,10 @@ pub struct PreparedSearchConfig { pub regex_options: SearchOptions, pub custom_regex_options: SearchOptions, pub literal_options: SearchOptions, + #[serde(default)] + pub allowed_labels: Vec, + #[serde(default)] + pub threshold: f64, pub slices: PreparedSearchSlices, pub regex_meta: Vec, pub custom_regex_meta: Vec, @@ -256,7 +262,7 @@ impl PreparedSearch { pub fn prepare_artifacts( config: PreparedSearchConfig, ) -> Result { - 
validate_supported_config(&config)?; + validate_supported_config(&config, false)?; let regex_groups = split_regex_patterns(config.regex_patterns, &config.slices)?; Ok(PreparedSearchArtifacts { @@ -322,8 +328,12 @@ impl PreparedSearch { artifacts: Option<&PreparedSearchArtifacts>, ) -> Result { let total_start = Instant::now(); - validate_supported_config(&config)?; + let allow_literal_artifacts = + artifacts.is_some_and(|artifacts| !artifacts.literals.slots.is_empty()); + validate_supported_config(&config, allow_literal_artifacts)?; let slices = config.slices.clone(); + let allowed_labels = config.allowed_labels.clone(); + let threshold = config.threshold; let regex_groups = split_regex_patterns(config.regex_patterns, &slices)?; let regex_len = regex_groups.regex.len(); let custom_regex_len = config.custom_regex_patterns.len(); @@ -399,6 +409,8 @@ impl PreparedSearch { legal_forms, triggers, literals, + allowed_labels, + threshold, slices, regex_meta: config.regex_meta, custom_regex_meta: config.custom_regex_meta, @@ -804,7 +816,11 @@ impl PreparedSearch { ) -> Result { let detections = self .detect_static_entities_inner(full_text, diagnostics.as_deref_mut())?; - let raw_entities = detections.all_entities(); + let raw_entities = filter_entities_for_config( + detections.all_entities(), + self.threshold, + &self.allowed_labels, + ); let merge_start = Instant::now(); let merged = merge_and_dedup(&raw_entities); if let Some(diagnostics) = &mut diagnostics { @@ -828,14 +844,18 @@ impl PreparedSearch { let sanitize_start = Instant::now(); let sanitized_entities = sanitize_entities_with_source(&consistent, full_text)?; - let resolved_entities = filter_entity_false_positives( - sanitized_entities, - full_text, - self - .deny_list_data - .as_ref() - .and_then(|data| data.filters.as_ref()), - )?; + let resolved_entities = filter_entities_for_config( + filter_entity_false_positives( + sanitized_entities, + full_text, + self + .deny_list_data + .as_ref() + .and_then(|data| 
data.filters.as_ref()), + )?, + self.threshold, + &self.allowed_labels, + ); if let Some(diagnostics) = &mut diagnostics { diagnostics.record_entities( DiagnosticStage::Sanitize, @@ -874,6 +894,21 @@ fn process_signature_entities(full_text: &str) -> TimedEntities { } } +fn filter_entities_for_config( + entities: Vec, + threshold: f64, + allowed_labels: &[String], +) -> Vec { + entities + .into_iter() + .filter(|entity| entity.score >= threshold) + .filter(|entity| { + allowed_labels.is_empty() + || allowed_labels.iter().any(|label| label == &entity.label) + }) + .collect() +} + fn record_static_entity_diagnostics( diagnostics: &mut StaticRedactionDiagnostics, full_text: &str, @@ -1274,7 +1309,11 @@ fn remap_normalized_match( Ok(found.with_span(start, end)) } -fn validate_supported_config(config: &PreparedSearchConfig) -> Result<()> { +fn validate_supported_config( + config: &PreparedSearchConfig, + allow_literal_artifacts: bool, +) -> Result<()> { + validate_search_config(config, allow_literal_artifacts)?; validate_legal_form_config(config)?; validate_trigger_config(config)?; validate_deny_list_config(config)?; @@ -1283,6 +1322,90 @@ fn validate_supported_config(config: &PreparedSearchConfig) -> Result<()> { validate_address_seed_config(config) } +fn validate_search_config( + config: &PreparedSearchConfig, + allow_literal_artifacts: bool, +) -> Result<()> { + validate_slice_bounds( + "slices.regex", + config.slices.regex, + config.regex_patterns.len(), + )?; + validate_slice_bounds( + "slices.legal_forms", + config.slices.legal_forms, + config.regex_patterns.len(), + )?; + validate_slice_bounds( + "slices.triggers", + config.slices.triggers, + config.regex_patterns.len(), + )?; + validate_slice_bounds( + "slices.custom_regex", + config.slices.custom_regex, + config.custom_regex_patterns.len(), + )?; + if !allow_literal_artifacts || !config.literal_patterns.is_empty() { + validate_slice_bounds( + "slices.deny_list", + config.slices.deny_list, + 
config.literal_patterns.len(), + )?; + validate_slice_bounds( + "slices.street_types", + config.slices.street_types, + config.literal_patterns.len(), + )?; + validate_slice_bounds( + "slices.gazetteer", + config.slices.gazetteer, + config.literal_patterns.len(), + )?; + validate_slice_bounds( + "slices.countries", + config.slices.countries, + config.literal_patterns.len(), + )?; + } + validate_static_data_length( + "regex_meta", + config.slices.regex, + config.regex_meta.len(), + )?; + validate_static_data_length( + "custom_regex_meta", + config.slices.custom_regex, + config.custom_regex_meta.len(), + ) +} + +fn validate_slice_bounds( + field: &'static str, + slice: PatternSlice, + pattern_count: usize, +) -> Result<()> { + if slice.start > slice.end { + return Err(Error::InvalidStaticData { + field, + reason: "slice start exceeds slice end".to_owned(), + }); + } + let Some(end) = usize::try_from(slice.end).ok() else { + return Err(Error::InvalidStaticData { + field, + reason: "slice end exceeds usize range".to_owned(), + }); + }; + if end <= pattern_count { + return Ok(()); + } + Err(Error::InvalidStaticData { + field, + reason: format!("slice end {end} exceeds pattern count {pattern_count}"), + }) +} + fn validate_legal_form_config(config: &PreparedSearchConfig) -> Result<()> { if config.slices.legal_forms.is_empty() { return Ok(()); diff --git a/crates/anonymize-core/src/triggers.rs b/crates/anonymize-core/src/triggers.rs index 67674240..08028b77 100644 --- a/crates/anonymize-core/src/triggers.rs +++ b/crates/anonymize-core/src/triggers.rs @@ -1171,9 +1171,10 @@ fn has_known_legal_form_suffix(text: &str, suffixes: &[String]) -> bool { fn person_name_run_end(text: &str) -> Option { let mut end = 0; let mut saw_token = false; - for token in text.split_whitespace() { - let trimmed = token.trim_matches(','); - if trimmed.chars().next().is_some_and(char::is_uppercase) { + let tokens = text.split_whitespace().collect::>(); + for (index, token) in 
tokens.iter().enumerate() { + let trimmed = trim_name_token(token); + if is_person_name_run_token(trimmed, saw_token, &tokens, index) { let relative = text.get(end..)?.find(token)?; end = end.saturating_add(relative).saturating_add(token.len()); saw_token = true; @@ -1184,6 +1185,85 @@ fn person_name_run_end(text: &str) -> Option { saw_token.then_some(end) } +fn is_person_name_run_token( + token: &str, + saw_token: bool, + tokens: &[&str], + index: usize, +) -> bool { + if is_capitalized_name_token(token) { + return true; + } + if !saw_token { + return false; + } + if is_apostrophe_name_continuation(token) { + return true; + } + is_name_particle(token) && has_name_after_particle(tokens, index) +} + +fn has_name_after_particle(tokens: &[&str], index: usize) -> bool { + for token in tokens.iter().skip(index.saturating_add(1)) { + let trimmed = trim_name_token(token); + if is_capitalized_name_token(trimmed) + || is_apostrophe_name_continuation(trimmed) + { + return true; + } + if is_name_particle(trimmed) { + continue; + } + return false; + } + false +} + +fn is_capitalized_name_token(token: &str) -> bool { + token.chars().next().is_some_and(char::is_uppercase) +} + +fn is_apostrophe_name_continuation(token: &str) -> bool { + token + .strip_prefix("d'") + .or_else(|| token.strip_prefix("d’")) + .is_some_and(is_capitalized_name_token) +} + +fn is_name_particle(token: &str) -> bool { + matches!( + token, + "de" + | "del" + | "della" + | "der" + | "den" + | "di" + | "du" + | "da" + | "das" + | "do" + | "dos" + | "el" + | "la" + | "le" + | "van" + | "von" + | "y" + | "zu" + | "af" + | "ben" + | "bin" + | "al" + | "d'" + | "d’" + ) +} + +fn trim_name_token(token: &str) -> &str { + token.trim_matches(',') +} + fn u32_len(text: &str) -> u32 { u32::try_from(text.len()).unwrap_or(u32::MAX) } diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 87be5eb4..b1797c36 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ 
b/crates/anonymize-core/tests/prepared.rs @@ -21,6 +21,8 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { regex_options: SearchOptions::default(), custom_regex_options: SearchOptions::default(), literal_options: SearchOptions::default(), + allowed_labels: vec![], + threshold: 0.0, slices, regex_meta: vec![], custom_regex_meta: vec![], @@ -111,6 +113,8 @@ fn prepared_search_runs_normalized_literal_pass() { regex_options: SearchOptions::default(), custom_regex_options: SearchOptions::default(), literal_options: SearchOptions::default(), + allowed_labels: vec![], + threshold: 0.0, slices: PreparedSearchSlices { gazetteer: PatternSlice { start: 0, end: 1 }, ..PreparedSearchSlices::default() @@ -152,6 +156,8 @@ fn prepared_search_artifacts_match_direct_prepare() { regex_options: SearchOptions::default(), custom_regex_options: SearchOptions::default(), literal_options: SearchOptions::default(), + allowed_labels: vec![], + threshold: 0.0, slices: PreparedSearchSlices { regex: PatternSlice { start: 0, end: 1 }, gazetteer: PatternSlice { start: 0, end: 1 }, @@ -283,6 +289,8 @@ fn prepared_search_emits_static_detector_entities() { fuzzy: FuzzySearchOptions::default(), ..SearchOptions::default() }, + allowed_labels: vec![], + threshold: 0.0, slices: PreparedSearchSlices { regex: PatternSlice { start: 0, end: 1 }, custom_regex: PatternSlice { start: 0, end: 1 }, @@ -870,6 +878,8 @@ fn prepared_search_redacts_static_entities_end_to_end() { }, ..SearchOptions::default() }, + allowed_labels: vec![], + threshold: 0.0, slices: PreparedSearchSlices { regex: PatternSlice { start: 0, end: 1 }, gazetteer: PatternSlice { start: 0, end: 1 }, @@ -909,6 +919,132 @@ fn prepared_search_redacts_static_entities_end_to_end() { assert_eq!(result.resolved_entities.len(), 3); } +#[test] +fn prepared_search_applies_threshold_before_merge() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![ + SearchPattern::Regex(String::from("Acme")), + 
SearchPattern::Regex(String::from(r"Acme s\.r\.o\.")), + ], + regex_options: SearchOptions { + regex: RegexSearchOptions { + whole_words: false, + overlap_all: true, + }, + ..SearchOptions::default() + }, + threshold: 0.5, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 2 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![ + RegexMatchMeta::new("organization", 0.9), + RegexMatchMeta::new("organization", 0.4), + ], + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities("Acme s.r.o. signed.", &OperatorConfig::default()) + .unwrap(); + + assert_eq!( + result.redaction.redacted_text, + "[ORGANIZATION_1] s.r.o. signed." + ); + assert_eq!(result.resolved_entities.len(), 1); + assert_eq!(result.resolved_entities[0].text, "Acme"); +} + +#[test] +fn prepared_search_applies_allowed_labels_before_redaction() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from("Alice"))], + allowed_labels: vec![String::from("date")], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("person", 1.0)], + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities("Alice signed.", &OperatorConfig::default()) + .unwrap(); + + assert_eq!(result.redaction.redacted_text, "Alice signed."); + assert!(result.resolved_entities.is_empty()); +} + +#[test] +fn prepared_search_keeps_person_name_particles_after_trigger() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Pan"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: 
Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("Pan"), + label: String::from("person"), + strategy: TriggerStrategy::ToEndOfLine, + validations: vec![TriggerValidation::StartsUppercase], + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let with_apostrophe = prepared + .detect_static_entities("Pan Jean d'Arc přijel pozdě.") + .unwrap(); + assert!( + with_apostrophe + .trigger_entities + .iter() + .any(|entity| entity.text == "Jean d'Arc") + ); + + let with_particle = prepared + .detect_static_entities("Pan João dos Santos přijel pozdě.") + .unwrap(); + assert!( + with_particle + .trigger_entities + .iter() + .any(|entity| entity.text == "João dos Santos") + ); + + let trailing_particle = prepared + .detect_static_entities("Pan Novák von tady odešel.") + .unwrap(); + assert!( + trailing_particle + .trigger_entities + .iter() + .any(|entity| entity.text == "Novák") + ); + assert!( + trailing_particle + .trigger_entities + .iter() + .all(|entity| !entity.text.contains("von")) + ); +} + #[test] fn prepared_search_reports_static_redaction_diagnostics() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -936,6 +1072,8 @@ fn prepared_search_reports_static_redaction_diagnostics() { }, ..SearchOptions::default() }, + allowed_labels: vec![], + threshold: 0.0, slices: PreparedSearchSlices { regex: PatternSlice { start: 0, end: 1 }, gazetteer: PatternSlice { start: 0, end: 1 }, @@ -1012,6 +1150,8 @@ fn prepared_search_redacts_custom_deny_list_entities() { }, ..SearchOptions::default() }, + allowed_labels: vec![], + threshold: 0.0, slices: PreparedSearchSlices { deny_list: PatternSlice { start: 0, end: 1 }, ..PreparedSearchSlices::default() @@ -1050,10 +1190,13 @@ fn prepared_search_redacts_custom_deny_list_entities() { #[test] fn 
prepared_search_rejects_unsupported_static_slices() { let unsupported = PatternSlice { start: 0, end: 1 }; - let error = PreparedSearch::new(empty_config(PreparedSearchSlices { - deny_list: unsupported, - ..PreparedSearchSlices::default() - })) + let error = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("Secret"))], + ..empty_config(PreparedSearchSlices { + deny_list: unsupported, + ..PreparedSearchSlices::default() + }) + }) .err() .expect("unsupported slice should be rejected"); @@ -1062,10 +1205,13 @@ fn prepared_search_rejects_unsupported_static_slices() { #[test] fn prepared_search_requires_gazetteer_metadata_for_gazetteer_slice() { - let error = PreparedSearch::new(empty_config(PreparedSearchSlices { - gazetteer: PatternSlice { start: 0, end: 1 }, - ..PreparedSearchSlices::default() - })) + let error = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("Acme"))], + ..empty_config(PreparedSearchSlices { + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }) + }) .err() .expect("gazetteer slice should require metadata"); @@ -1080,6 +1226,7 @@ fn prepared_search_requires_gazetteer_metadata_for_gazetteer_slice() { #[test] fn prepared_search_rejects_truncated_country_metadata() { let error = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("Turkey"))], country_data: Some(CountryMatchData { labels: Vec::new() }), ..empty_config(PreparedSearchSlices { countries: PatternSlice { start: 0, end: 1 }, @@ -1100,12 +1247,59 @@ fn prepared_search_rejects_truncated_country_metadata() { } #[test] -fn prepared_search_requires_address_seed_data_for_street_types() { +fn prepared_search_rejects_missing_regex_metadata() { + let error = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bID\d+\b"))], + slices: PreparedSearchSlices { + 
regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + ..empty_config(PreparedSearchSlices::default()) + }) + .err() + .expect("regex slice should require parallel metadata"); + + assert_eq!( + error, + Error::StaticDataLengthMismatch { + field: "regex_meta", + expected: 1, + actual: 0 + } + ); +} + +#[test] +fn prepared_search_rejects_literal_slices_outside_patterns() { let error = PreparedSearch::new(empty_config(PreparedSearchSlices { - street_types: PatternSlice { start: 0, end: 1 }, + gazetteer: PatternSlice { start: 0, end: 1 }, ..PreparedSearchSlices::default() })) .err() + .expect("slice outside the literal pattern table should be rejected"); + + assert!( + matches!( + error, + Error::InvalidStaticData { + field: "slices.gazetteer", + .. + } + ), + "unexpected error: {error}" + ); +} + +#[test] +fn prepared_search_requires_address_seed_data_for_street_types() { + let error = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("Street"))], + ..empty_config(PreparedSearchSlices { + street_types: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }) + }) + .err() .expect("street types should require address seed data"); assert_eq!( @@ -1598,6 +1792,7 @@ fn prepared_search_redacts_curated_deny_list_entities() { #[test] fn prepared_search_rejects_curated_deny_list_without_filters() { let error = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("Prague"))], deny_list_data: Some(DenyListMatchData { labels: vec![vec![String::from("address")]].into(), custom_labels: vec![vec![]].into(), @@ -1624,6 +1819,7 @@ fn prepared_search_rejects_curated_deny_list_without_filters() { #[test] fn prepared_search_rejects_truncated_deny_list_data() { let error = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("Secret Code"))], deny_list_data: Some(DenyListMatchData { 
labels: vec![vec![String::from("matter")]].into(), custom_labels: vec![].into(), diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index cd346d18..bd0b6b1b 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -112,6 +112,22 @@ describe("pipeline config semantics", () => { expect(regexCount).toBe(expected); }); + test("native config carries final label and threshold filters", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: true, + labels: ["person"], + threshold: 0.93, + }, + [], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.allowed_labels).toEqual(["person"]); + expect(search.nativeStaticConfig.threshold).toBe(0.93); + }); + test("content language scopes deny-list search build", async () => { const testDictionaries = await getDictionaries(); const config = { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index f18766bb..f1b32690 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -229,6 +229,8 @@ export type NativePreparedSearchConfig = { custom_regex_options: NativeSearchOptions; literal_options: NativeSearchOptions; literal_patterns_from_deny_list_data?: boolean; + allowed_labels: string[]; + threshold: number; slices: { regex: PatternSlice; custom_regex: PatternSlice; @@ -333,6 +335,8 @@ type UnifiedSearchSources = { nativeCurrencyPatternRange: PatternSlice; nativeDatePatternRange: PatternSlice; nativeSigningPatternRange: PatternSlice; + nativeAllowedLabels: readonly string[]; + threshold: number; slices: UnifiedSearchInstance["slices"]; literalAllPatterns: PatternEntry[] | string[]; canUseGlobalWholeWordLiterals: boolean; @@ -673,6 +677,8 @@ const buildUnifiedSearchSources = async ( nativeCurrencyPatternRange, 
nativeDatePatternRange, nativeSigningPatternRange, + nativeAllowedLabels: config.labels, + threshold: config.threshold, slices: { regex: regexSlice, customRegex: customRegexSlice, @@ -727,6 +733,8 @@ export const buildNativeStaticSearchBundle = async ( countryData: sources.countryResult?.data ?? null, canUseGlobalWholeWordLiterals: sources.canUseGlobalWholeWordLiterals, customDenyListNeedsWholeWords: sources.customDenyListNeedsWholeWords, + allowedLabels: sources.nativeAllowedLabels, + threshold: sources.threshold, }), slices: sources.slices, regexMeta: sources.regexMeta, @@ -806,6 +814,8 @@ export const buildUnifiedSearch = async ( countryData: sources.countryResult?.data ?? null, canUseGlobalWholeWordLiterals: sources.canUseGlobalWholeWordLiterals, customDenyListNeedsWholeWords: sources.customDenyListNeedsWholeWords, + allowedLabels: sources.nativeAllowedLabels, + threshold: sources.threshold, }); return { @@ -846,6 +856,8 @@ type BuildNativeStaticConfigArgs = { countryData: CountryData | null; canUseGlobalWholeWordLiterals: boolean; customDenyListNeedsWholeWords: (pattern: string) => boolean; + allowedLabels: readonly string[]; + threshold: number; }; const buildNativeStaticConfig = ({ @@ -871,6 +883,8 @@ const buildNativeStaticConfig = ({ countryData, canUseGlobalWholeWordLiterals, customDenyListNeedsWholeWords, + allowedLabels, + threshold, }: BuildNativeStaticConfigArgs): NativePreparedSearchConfig => { const nativeRegexPatterns: NativeSearchPattern[] = []; const nativeRegexMeta: NativeRegexMatchMeta[] = []; @@ -983,6 +997,8 @@ const buildNativeStaticConfig = ({ fuzzy_normalize_diacritics: true, }, literal_patterns_from_deny_list_data: denyListPatternsFromData, + allowed_labels: [...allowedLabels], + threshold, slices: { regex: { start: 0, end: nativeRegexPatterns.length }, custom_regex: { start: 0, end: nativeCustomRegexPatterns.length }, From cfd7edc8820565d9153b1de6f2fe8bd653c4ef9e Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 20:20:36 
+0200 Subject: [PATCH 040/130] fix: tighten native fixture spans --- crates/anonymize-core/src/resolution/merge.rs | 4 +- crates/anonymize-core/src/triggers.rs | 68 ++++++++++++++----- crates/anonymize-core/tests/prepared.rs | 44 ++++++++++++ crates/anonymize-core/tests/resolution.rs | 29 ++++++++ .../src/data/address-boundaries.json | 2 + 5 files changed, 130 insertions(+), 17 deletions(-) diff --git a/crates/anonymize-core/src/resolution/merge.rs b/crates/anonymize-core/src/resolution/merge.rs index ddb16d7a..d9897a8f 100644 --- a/crates/anonymize-core/src/resolution/merge.rs +++ b/crates/anonymize-core/src/resolution/merge.rs @@ -370,7 +370,9 @@ fn u32_char_len(ch: char) -> u32 { fn regex_shape_preferred_label(label: &str) -> bool { matches!( label, - "phone number" + "date" + | "date of birth" + | "phone number" | "tax identification number" | "registration number" | "national identification number" diff --git a/crates/anonymize-core/src/triggers.rs b/crates/anonymize-core/src/triggers.rs index 08028b77..9b0db946 100644 --- a/crates/anonymize-core/src/triggers.rs +++ b/crates/anonymize-core/src/triggers.rs @@ -339,7 +339,7 @@ fn extract_value( extract_to_end_of_line(remaining, stripped, value_start_byte, label) } PreparedTriggerStrategy::NWords { count } => { - extract_n_words(stripped, value_start_byte, *count) + extract_n_words(stripped, value_start_byte, *count, label) } PreparedTriggerStrategy::CompanyIdValue => { extract_company_id_value(text, trigger_end_byte) @@ -438,36 +438,72 @@ fn extract_n_words( value_text: &str, value_start_byte: usize, count: usize, + label: &str, ) -> Option { let cell_end = value_text.find('\t').unwrap_or(value_text.len()); let cell = value_text.get(..cell_end)?; - let mut words = Vec::<&str>::new(); + let mut words = Vec::>::new(); for word in cell.split_whitespace() { if punctuation_only(word) || number_marker(word) { continue; } - words.push(word); - if words.len() >= count { + let search_pos = + words.last().map_or(0, 
|entry| entry.end.saturating_add(1)); + let relative = cell.get(search_pos..)?.find(word)?; + let start = search_pos.saturating_add(relative); + words.push(WordToken { + text: word, + start, + end: start.saturating_add(word.len()), + }); + if words.len() >= date_aware_word_count(label, count, &words) { break; } } let first = words.first().copied()?; - let first_index = cell.find(first)?; - let mut actual_end = first_index.saturating_add(first.len()); - let mut search_pos = actual_end; - for word in words.iter().skip(1) { - let relative = cell.get(search_pos..)?.find(word)?; - let index = search_pos.saturating_add(relative); - actual_end = index.saturating_add(word.len()); - search_pos = actual_end; - } + let last = words.last().copied()?; byte_value( - cell.get(first_index..actual_end)?, - value_start_byte.saturating_add(first_index), - actual_end.saturating_sub(first_index), + cell.get(first.start..last.end)?, + value_start_byte.saturating_add(first.start), + last.end.saturating_sub(first.start), ) } +#[derive(Clone, Copy)] +struct WordToken<'a> { + text: &'a str, + start: usize, + end: usize, +} + +fn date_aware_word_count( + label: &str, + configured_count: usize, + words: &[WordToken<'_>], +) -> usize { + if configured_count != 1 || !matches!(label, "date" | "date of birth") { + return configured_count; + } + if words + .first() + .is_some_and(|word| starts_written_day_token(word.text)) + { + return 3; + } + configured_count +} + +fn starts_written_day_token(text: &str) -> bool { + let token = text.trim_end_matches(|ch: char| { + matches!(ch, ',' | ';' | ':' | ')' | ']' | '"' | '\'' | '”' | '’') + }); + let Some(day) = token.strip_suffix('.') else { + return false; + }; + let digit_count = day.chars().filter(char::is_ascii_digit).count(); + (1..=2).contains(&digit_count) && day.chars().all(|ch| ch.is_ascii_digit()) +} + fn extract_company_id_value( text: &str, trigger_end_byte: usize, diff --git a/crates/anonymize-core/tests/prepared.rs 
b/crates/anonymize-core/tests/prepared.rs index b1797c36..7bae239b 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -530,6 +530,50 @@ fn prepared_search_extracts_written_date_of_birth_trigger() { ); } +#[test] +fn prepared_search_extends_single_word_written_date_trigger() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("geboren am"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("geboren am"), + label: String::from("date of birth"), + strategy: TriggerStrategy::NWords { count: 1 }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Herr Müller, geboren am 21. März 1968, ist Geschäftsführer.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "date of birth" + && entity.text == "21. 
März 1968") + ); +} + #[test] fn prepared_search_extracts_year_after_duplicate_year_word_noise() { let prepared = PreparedSearch::new(PreparedSearchConfig { diff --git a/crates/anonymize-core/tests/resolution.rs b/crates/anonymize-core/tests/resolution.rs index 2179a91e..e1f51207 100644 --- a/crates/anonymize-core/tests/resolution.rs +++ b/crates/anonymize-core/tests/resolution.rs @@ -140,6 +140,35 @@ fn structured_regex_span_beats_trigger_fragment_with_trailing_punctuation() { assert_eq!(kept.end, byte_len(regex_text)); } +#[test] +fn structured_date_regex_span_beats_trigger_fragment() { + let regex_text = "21. März 1968"; + let trigger_text = "21."; + let result = merge_and_dedup(&[ + PipelineEntity::detected( + 0, + byte_len(regex_text), + "date of birth", + regex_text, + 0.9, + DetectionSource::Regex, + ), + PipelineEntity::detected( + 0, + byte_len(trigger_text), + "date of birth", + trigger_text, + 0.95, + DetectionSource::Trigger, + ), + ]); + + assert_eq!(result.len(), 1); + let kept = result.first().expect("result"); + assert_eq!(kept.source, DetectionSource::Regex); + assert_eq!(kept.text, regex_text); +} + #[test] fn person_regex_span_beats_inner_name_fragment() { let result = merge_and_dedup(&[ diff --git a/packages/anonymize/src/data/address-boundaries.json b/packages/anonymize/src/data/address-boundaries.json index 12c5cc35..cbb8adc4 100644 --- a/packages/anonymize/src/data/address-boundaries.json +++ b/packages/anonymize/src/data/address-boundaries.json @@ -5,6 +5,7 @@ "jednajícím", "jejímž jménem", "kontaktní osoba", + "pokud", "zapsán", "zapsaná", "zapsané", @@ -35,6 +36,7 @@ "shall govern", "shall be governed", "to be enforced", + "with a copy", "with the intention", "without reference", "without regard" From 072dfd177f4f54db5128e762ca5211a604dd8914 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 20:25:08 +0200 Subject: [PATCH 041/130] fix: sync address boundary mirror --- packages/data/config/address-boundaries.json | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/packages/data/config/address-boundaries.json b/packages/data/config/address-boundaries.json index 12c5cc35..cbb8adc4 100644 --- a/packages/data/config/address-boundaries.json +++ b/packages/data/config/address-boundaries.json @@ -5,6 +5,7 @@ "jednajícím", "jejímž jménem", "kontaktní osoba", + "pokud", "zapsán", "zapsaná", "zapsané", @@ -35,6 +36,7 @@ "shall govern", "shall be governed", "to be enforced", + "with a copy", "with the intention", "without reference", "without regard" From 386011e2b0a23f28793a18f5153127cc2e526741 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 20:43:28 +0200 Subject: [PATCH 042/130] feat: add native anonymizer facade --- .../__test__/native-adapter-parity.test.ts | 107 +++++--- packages/anonymize/src/index-shared.ts | 20 ++ packages/anonymize/src/native.ts | 241 ++++++++++++++++++ 3 files changed, 338 insertions(+), 30 deletions(-) create mode 100644 packages/anonymize/src/native.ts diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index e6234a8e..5cda5073 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -5,43 +5,32 @@ import { join } from "node:path"; import { createRequire } from "node:module"; import { describe, expect, setDefaultTimeout, test } from "bun:test"; import fc from "fast-check"; +import { + createNativeAnonymizerFromPackage, + prepareNativeSearchPackage, + type NativeAnonymizeBinding, + type NativeOperatorConfig, + type NativePreparedSearchBinding, +} from "../native"; setDefaultTimeout(120_000); -type NativeAdapter = { - NativePreparedSearch: { - new (configJson: string): { - redactStaticEntities: ( - fullText: string, - operators?: Record, - ) => StaticRedactionResult; - }; - fromConfigJsonBytes: (configJson: Buffer) => { - redactStaticEntities: ( - fullText: string, - operators?: 
Record, - ) => StaticRedactionResult; - }; +type NativeAdapter = Omit< + NativeAnonymizeBinding, + | "prepareStaticSearchPackageBytes" + | "prepareStaticSearchCompressedPackageBytes" +> & { + normalizeForSearch: (text: string) => string; + NativePreparedSearch: NativeAnonymizeBinding["NativePreparedSearch"] & { + new (configJson: string): NativePreparedSearchBinding; fromConfigJsonAndArtifactBytes: ( configJson: Buffer, artifactBytes: Buffer, - ) => { - redactStaticEntities: ( - fullText: string, - operators?: Record, - ) => StaticRedactionResult; - }; - fromPreparedPackageBytes: (packageBytes: Buffer) => { - redactStaticEntities: ( - fullText: string, - operators?: Record, - ) => StaticRedactionResult; - }; + ) => NativePreparedSearchBinding; }; - normalizeForSearch: (text: string) => string; prepareStaticSearchArtifactsBytes: (configJson: Buffer) => Buffer; - prepareStaticSearchPackageBytes: (configJson: Buffer) => Buffer; - prepareStaticSearchCompressedPackageBytes: (configJson: Buffer) => Buffer; + prepareStaticSearchPackageBytes: (configJson: Uint8Array) => Buffer; + prepareStaticSearchCompressedPackageBytes: (configJson: Uint8Array) => Buffer; redactStaticEntitiesJson: ( configJson: string, fullText: string, @@ -74,7 +63,7 @@ type StaticRedactionResult = { redaction_map: RedactionEntry[]; operator_map: Array<{ placeholder: string; - operator: string; + operator: "replace" | "redact"; }>; entity_count: number; }; @@ -585,6 +574,56 @@ describe("native adapter parity", () => { ).toEqual(expectedJson); }); + test("native facade redacts from compressed package bytes", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. 
near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const operators: NativeOperatorConfig = { + operators: { country: "redact" }, + redactString: "***", + }; + const packageBytes = prepareNativeSearchPackage({ + binding: adapters.native, + config: JSON.parse(CONFIG_JSON), + compressed: true, + }); + const anonymizer = createNativeAnonymizerFromPackage({ + binding: adapters.native, + packageBytes, + }); + const expected: StaticRedactionResult = JSON.parse( + adapters.native.redactStaticEntitiesJson( + CONFIG_JSON, + text, + JSON.stringify(operators), + ), + ); + + const result = anonymizer.redactStaticEntities(text, operators); + + expect(result.resolvedEntities).toEqual( + expected.resolved_entities.map(toNativeFacadeEntity), + ); + expect(result.redaction.redactedText).toBe( + expected.redaction.redacted_text, + ); + expect(result.redaction.entityCount).toBe(expected.redaction.entity_count); + expect([...result.redaction.redactionMap.entries()]).toEqual( + expected.redaction.redaction_map.map(({ placeholder, original }) => [ + placeholder, + original, + ]), + ); + expect([...result.redaction.operatorMap.entries()]).toEqual( + expected.redaction.operator_map.map(({ placeholder, operator }) => [ + placeholder, + operator, + ]), + ); + expect(result.redaction.redactedText).toContain("***"); + }); + test("JSON operator config accepts camel-case redactString", () => { const adapters = getAdapters(); const text = @@ -954,6 +993,14 @@ const stripDiagnosticTimings = ( }, }); +const toNativeFacadeEntity = ({ + source_detail: sourceDetail, + ...entity +}: StaticRedactionResult["resolved_entities"][number]) => ({ + ...entity, + ...(sourceDetail ? 
{ sourceDetail } : {}), +}); + const operatorConfigJson = ( operators: Record | null, ): string | undefined => { diff --git a/packages/anonymize/src/index-shared.ts b/packages/anonymize/src/index-shared.ts index 511355fb..3af7f1ce 100644 --- a/packages/anonymize/src/index-shared.ts +++ b/packages/anonymize/src/index-shared.ts @@ -49,6 +49,26 @@ export type { PipelineSearchOptions, } from "./pipeline"; +// ── Native Adapter ─────────────────────────────── +export { + PreparedNativeAnonymizer, + createNativeAnonymizerFromConfig, + createNativeAnonymizerFromPackage, + encodeNativeSearchConfig, + prepareNativeSearchPackage, +} from "./native"; +export type { + NativeAnonymizeBinding, + NativeAnonymizerFromConfigOptions, + NativeAnonymizerFromPackageOptions, + NativeOperatorConfig, + NativePipelineEntity, + NativePreparedSearchBinding, + NativeRedactionResult, + NativeSearchPackageOptions, + NativeStaticRedactionResult, +} from "./native"; + // ── Redaction ───────────────────────────────────── export { redactText, diff --git a/packages/anonymize/src/native.ts b/packages/anonymize/src/native.ts new file mode 100644 index 00000000..63e38999 --- /dev/null +++ b/packages/anonymize/src/native.ts @@ -0,0 +1,241 @@ +import type { NativePreparedSearchConfig } from "./build-unified-search"; +import type { OperatorType } from "./types"; + +type NativeBindingOperatorConfig = { + operators?: Record; + redactString?: string; +}; + +type NativeBindingRedactionEntry = { + placeholder: string; + original: string; +}; + +type NativeBindingOperatorEntry = { + placeholder: string; + operator: OperatorType; +}; + +type NativeBindingPipelineEntity = { + start: number; + end: number; + label: string; + text: string; + score: number; + source: string; + sourceDetail?: string | null; +}; + +type NativeBindingRedactionResult = { + redactedText: string; + redactionMap: NativeBindingRedactionEntry[]; + operatorMap: NativeBindingOperatorEntry[]; + entityCount: number; +}; + +type 
NativeBindingStaticRedactionResult = { + resolvedEntities: NativeBindingPipelineEntity[]; + redaction: NativeBindingRedactionResult; +}; + +export type NativePreparedSearchBinding = { + prepareDiagnosticsJson?: () => string; + redactStaticEntities: ( + fullText: string, + operators?: NativeBindingOperatorConfig, + ) => NativeBindingStaticRedactionResult; + redactStaticEntitiesDiagnosticsJson?: ( + fullText: string, + operators?: NativeBindingOperatorConfig, + ) => string; +}; + +export type NativeAnonymizeBinding = { + NativePreparedSearch: { + fromConfigJsonBytes: ( + configJson: Uint8Array, + ) => NativePreparedSearchBinding; + fromPreparedPackageBytes: ( + packageBytes: Uint8Array, + ) => NativePreparedSearchBinding; + }; + prepareStaticSearchPackageBytes: (configJson: Uint8Array) => Uint8Array; + prepareStaticSearchCompressedPackageBytes: ( + configJson: Uint8Array, + ) => Uint8Array; +}; + +export type NativeOperatorConfig = { + operators?: Record; + redactString?: string; +}; + +export type NativePipelineEntity = { + start: number; + end: number; + label: string; + text: string; + score: number; + source: string; + sourceDetail?: string; +}; + +export type NativeRedactionResult = { + redactedText: string; + redactionMap: Map; + operatorMap: Map; + entityCount: number; +}; + +export type NativeStaticRedactionResult = { + resolvedEntities: NativePipelineEntity[]; + redaction: NativeRedactionResult; +}; + +export type NativeSearchPackageOptions = { + binding: NativeAnonymizeBinding; + config: NativePreparedSearchConfig; + compressed?: boolean; +}; + +export type NativeAnonymizerFromConfigOptions = { + binding: NativeAnonymizeBinding; + config: NativePreparedSearchConfig; +}; + +export type NativeAnonymizerFromPackageOptions = { + binding: NativeAnonymizeBinding; + packageBytes: Uint8Array; +}; + +export class PreparedNativeAnonymizer { + readonly #prepared: NativePreparedSearchBinding; + + constructor(prepared: NativePreparedSearchBinding) { + this.#prepared = 
prepared; + } + + prepareDiagnosticsJson(): string | null { + return this.#prepared.prepareDiagnosticsJson?.() ?? null; + } + + redactStaticEntities( + fullText: string, + operators?: NativeOperatorConfig, + ): NativeStaticRedactionResult { + return toNativeStaticRedactionResult( + this.#prepared.redactStaticEntities( + fullText, + toBindingOperatorConfig(operators), + ), + ); + } + + redactStaticEntitiesDiagnosticsJson( + fullText: string, + operators?: NativeOperatorConfig, + ): string | null { + const run = this.#prepared.redactStaticEntitiesDiagnosticsJson; + if (!run) { + return null; + } + return run(fullText, toBindingOperatorConfig(operators)); + } +} + +export const encodeNativeSearchConfig = ( + config: NativePreparedSearchConfig, +): Uint8Array => new TextEncoder().encode(JSON.stringify(config)); + +export const prepareNativeSearchPackage = ({ + binding, + config, + compressed = true, +}: NativeSearchPackageOptions): Uint8Array => { + const configBytes = encodeNativeSearchConfig(config); + return compressed + ? 
binding.prepareStaticSearchCompressedPackageBytes(configBytes) + : binding.prepareStaticSearchPackageBytes(configBytes); +}; + +export const createNativeAnonymizerFromConfig = ({ + binding, + config, +}: NativeAnonymizerFromConfigOptions): PreparedNativeAnonymizer => + new PreparedNativeAnonymizer( + binding.NativePreparedSearch.fromConfigJsonBytes( + encodeNativeSearchConfig(config), + ), + ); + +export const createNativeAnonymizerFromPackage = ({ + binding, + packageBytes, +}: NativeAnonymizerFromPackageOptions): PreparedNativeAnonymizer => + new PreparedNativeAnonymizer( + binding.NativePreparedSearch.fromPreparedPackageBytes(packageBytes), + ); + +const toBindingOperatorConfig = ( + config: NativeOperatorConfig | undefined, +): NativeBindingOperatorConfig | undefined => { + if (!config) { + return undefined; + } + const bindingConfig: NativeBindingOperatorConfig = {}; + if (config.operators !== undefined) { + bindingConfig.operators = config.operators; + } + if (config.redactString !== undefined) { + bindingConfig.redactString = config.redactString; + } + return bindingConfig; +}; + +const toNativeStaticRedactionResult = ( + result: NativeBindingStaticRedactionResult, +): NativeStaticRedactionResult => ({ + resolvedEntities: result.resolvedEntities.map(toNativePipelineEntity), + redaction: toNativeRedactionResult(result.redaction), +}); + +const toNativePipelineEntity = ( + entity: NativeBindingPipelineEntity, +): NativePipelineEntity => ({ + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: entity.source, + ...(entity.sourceDetail ? 
{ sourceDetail: entity.sourceDetail } : {}), +}); + +const toNativeRedactionResult = ( + result: NativeBindingRedactionResult, +): NativeRedactionResult => ({ + redactedText: result.redactedText, + redactionMap: toRedactionMap(result.redactionMap), + operatorMap: toOperatorMap(result.operatorMap), + entityCount: result.entityCount, +}); + +const toRedactionMap = ( + entries: readonly NativeBindingRedactionEntry[], +): Map => { + const map = new Map(); + for (const entry of entries) { + map.set(entry.placeholder, entry.original); + } + return map; +}; + +const toOperatorMap = ( + entries: readonly NativeBindingOperatorEntry[], +): Map => { + const map = new Map(); + for (const entry of entries) { + map.set(entry.placeholder, entry.operator); + } + return map; +}; From 8dee9dc31721739ace00be56895bf1a69a9c78eb Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 20:50:41 +0200 Subject: [PATCH 043/130] feat: expose native anonymize entrypoint --- .github/tools/check-packlist.mjs | 2 ++ packages/anonymize/package.json | 5 +++++ packages/anonymize/scripts/dist-smoke.mjs | 5 +++++ packages/anonymize/tsdown.config.ts | 2 +- 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/tools/check-packlist.mjs b/.github/tools/check-packlist.mjs index 3e039419..1631236e 100644 --- a/.github/tools/check-packlist.mjs +++ b/.github/tools/check-packlist.mjs @@ -7,6 +7,8 @@ const PACKAGES = [ expected: [ "dist/index.d.mts", "dist/index.mjs", + "dist/native.d.mts", + "dist/native.mjs", // Dynamically imported corpus chunk; missing means the // bundler stopped resolving the non-Western name imports. 
"dist/names-nw-in.mjs", diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index b4334eee..748556d9 100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -13,6 +13,11 @@ "types": "./dist/constants.d.mts", "import": "./dist/constants.mjs", "default": "./dist/constants.mjs" + }, + "./native": { + "types": "./dist/native.d.mts", + "import": "./dist/native.mjs", + "default": "./dist/native.mjs" } }, "types": "dist/index.d.mts", diff --git a/packages/anonymize/scripts/dist-smoke.mjs b/packages/anonymize/scripts/dist-smoke.mjs index 1beadc68..3d4facff 100644 --- a/packages/anonymize/scripts/dist-smoke.mjs +++ b/packages/anonymize/scripts/dist-smoke.mjs @@ -10,6 +10,11 @@ * Run after `bun run build`: `bun run smoke:dist`. */ import { createPipelineContext, runPipeline } from "../dist/index.mjs"; +import { createNativeAnonymizerFromPackage } from "../dist/native.mjs"; + +if (typeof createNativeAnonymizerFromPackage !== "function") { + throw new TypeError("dist native entrypoint is missing its package loader"); +} const warnings = []; const originalWarn = console.warn; diff --git a/packages/anonymize/tsdown.config.ts b/packages/anonymize/tsdown.config.ts index 57fff9ff..4f35e505 100644 --- a/packages/anonymize/tsdown.config.ts +++ b/packages/anonymize/tsdown.config.ts @@ -2,7 +2,7 @@ import { defineConfig } from "tsdown"; export default defineConfig([ { - entry: ["src/index.ts", "src/constants.ts"], + entry: ["src/index.ts", "src/constants.ts", "src/native.ts"], outDir: "dist", format: ["esm"], dts: true, From b30a8b0f318c3191c7252155053a5d8f58c01dd5 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 21:01:04 +0200 Subject: [PATCH 044/130] test: cover native fixture adapter parity --- .../__test__/native-adapter-parity.test.ts | 94 ++++++++++++++++++- 1 file changed, 92 insertions(+), 2 deletions(-) diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts 
b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 5cda5073..5d0f8144 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -1,5 +1,11 @@ import { spawnSync } from "node:child_process"; -import { copyFileSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { + copyFileSync, + mkdtempSync, + readFileSync, + rmSync, + writeFileSync, +} from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { createRequire } from "node:module"; @@ -11,7 +17,11 @@ import { type NativeAnonymizeBinding, type NativeOperatorConfig, type NativePreparedSearchBinding, + type NativeStaticRedactionResult, } from "../native"; +import { createPipelineContext, preparePipelineSearch } from "../index"; +import { contractTestConfig } from "./contract-config"; +import { loadTestDictionaries } from "./load-dictionaries"; setDefaultTimeout(120_000); @@ -624,6 +634,61 @@ describe("native adapter parity", () => { expect(result.redaction.redactedText).toContain("***"); }); + test("native facade and Python match on a contract fixture package", async () => { + const adapters = getAdapters(); + const fixtureText = readFileSync( + join( + ROOT_DIR, + "packages", + "anonymize", + "src", + "__test__", + "fixtures", + "contracts", + "en", + "software-license-agreement.txt", + ), + "utf8", + ); + const dictionaries = await loadTestDictionaries({ + denyListCountries: ["US"], + nameCorpusLanguages: ["en"], + }); + const search = await preparePipelineSearch({ + config: { + ...contractTestConfig("native-facade-fixture-parity"), + dictionaries, + language: "en", + }, + context: createPipelineContext(), + }); + const configJson = JSON.stringify(search.nativeStaticConfig); + const packageBytes = prepareNativeSearchPackage({ + binding: adapters.native, + config: search.nativeStaticConfig, + compressed: true, + }); + const anonymizer = 
createNativeAnonymizerFromPackage({ + binding: adapters.native, + packageBytes, + }); + + const tsResult = toBindingStaticResult( + anonymizer.redactStaticEntities(fixtureText), + ); + const pyResult = callPythonPreparedWithPackage( + adapters.pythonModulePath, + adapters.tempDir, + Buffer.from(packageBytes), + fixtureText, + null, + "prepare_static_search_compressed_package_bytes", + configJson, + ); + + expect(tsResult).toEqual(pyResult); + }); + test("JSON operator config accepts camel-case redactString", () => { const adapters = getAdapters(); const text = @@ -904,6 +969,7 @@ const callPythonPreparedWithPackage = ( text: string, operators: Record | null, prepareFn = "prepare_static_search_package_bytes", + configJson = CONFIG_JSON, ): StaticRedactionResult => { const payloadPath = join(tempDir, "prepared-package-payload.json"); const packagePath = join(tempDir, "prepared-package.bin"); @@ -911,7 +977,7 @@ const callPythonPreparedWithPackage = ( writeFileSync( payloadPath, JSON.stringify({ - config_json: CONFIG_JSON, + config_json: configJson, text, operators_json: operatorConfigJson(operators), }), @@ -1001,6 +1067,30 @@ const toNativeFacadeEntity = ({ ...(sourceDetail ? { sourceDetail } : {}), }); +const toBindingStaticResult = ( + result: NativeStaticRedactionResult, +): StaticRedactionResult => ({ + resolved_entities: result.resolvedEntities.map(toBindingPipelineEntity), + redaction: { + redacted_text: result.redaction.redactedText, + redaction_map: [...result.redaction.redactionMap.entries()].map( + ([placeholder, original]) => ({ placeholder, original }), + ), + operator_map: [...result.redaction.operatorMap.entries()].map( + ([placeholder, operator]) => ({ placeholder, operator }), + ), + entity_count: result.redaction.entityCount, + }, +}); + +const toBindingPipelineEntity = ({ + sourceDetail, + ...entity +}: NativeStaticRedactionResult["resolvedEntities"][number]) => ({ + ...entity, + source_detail: sourceDetail ?? 
null, +}); + const operatorConfigJson = ( operators: Record | null, ): string | undefined => { From aa7bc1fca7fc4581b864950f672dae8e78d9fcfe Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 21:10:56 +0200 Subject: [PATCH 045/130] test: broaden native adapter fixture parity --- .../__test__/native-adapter-parity.test.ts | 204 +++++++++++++----- 1 file changed, 154 insertions(+), 50 deletions(-) diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 5d0f8144..8ce6068f 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -3,6 +3,7 @@ import { copyFileSync, mkdtempSync, readFileSync, + readdirSync, rmSync, writeFileSync, } from "node:fs"; @@ -20,6 +21,7 @@ import { type NativeStaticRedactionResult, } from "../native"; import { createPipelineContext, preparePipelineSearch } from "../index"; +import { applyPipelineLanguageScope } from "../language-scope"; import { contractTestConfig } from "./contract-config"; import { loadTestDictionaries } from "./load-dictionaries"; @@ -109,8 +111,23 @@ type GeneratedNativeCase = { sensitiveValues: string[]; }; +type ContractFixtureCase = { + name: string; + text: string; +}; + const ROOT_DIR = join(import.meta.dir, "..", "..", "..", ".."); const TARGET_DIR = join(ROOT_DIR, "target", "debug"); +const CONTRACT_FIXTURES_DIR = join( + ROOT_DIR, + "packages", + "anonymize", + "src", + "__test__", + "fixtures", + "contracts", +); +const CONTRACT_FIXTURE_LANGUAGES = ["cs", "de", "en"] as const; const CONFIG_JSON = JSON.stringify({ regex_patterns: [{ kind: "regex", pattern: "\\b[A-Z]{2}\\d{4}\\b" }], custom_regex_patterns: [{ kind: "regex", pattern: "\\bMAT-\\d{3}\\b" }], @@ -277,6 +294,42 @@ print( ) `; +const PYTHON_PREPARED_PACKAGE_CASES_SCRIPT = ` +import importlib.util +import json +import os +import pathlib + +module_path = 
pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +package_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PACKAGE"]) +spec = importlib.util.spec_from_file_location( + "stella_anonymize_core_py", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +package_bytes = package_path.read_bytes() +prepare_fn_name = os.environ.get( + "STELLA_ANONYMIZE_PACKAGE_PREPARE_FN", + "prepare_static_search_package_bytes", +) +if getattr(module, prepare_fn_name)(payload["config_json"]) != package_bytes: + raise AssertionError("prepared package bytes differ") +prepared = module.PreparedSearch.from_prepared_package_bytes(package_bytes) +results = [ + json.loads( + prepared.redact_static_entities_json( + item["text"], + item.get("operators_json"), + ) + ) + for item in payload["cases"] +] +print(json.dumps(results)) +`; + let loadedAdapters: { native: NativeAdapter; pythonModulePath: string; @@ -634,59 +687,62 @@ describe("native adapter parity", () => { expect(result.redaction.redactedText).toContain("***"); }); - test("native facade and Python match on a contract fixture package", async () => { + test("native facade and Python match on contract fixture packages", async () => { const adapters = getAdapters(); - const fixtureText = readFileSync( - join( - ROOT_DIR, - "packages", - "anonymize", - "src", - "__test__", - "fixtures", - "contracts", - "en", - "software-license-agreement.txt", - ), - "utf8", - ); - const dictionaries = await loadTestDictionaries({ - denyListCountries: ["US"], - nameCorpusLanguages: ["en"], - }); - const search = await preparePipelineSearch({ - config: { - ...contractTestConfig("native-facade-fixture-parity"), - dictionaries, - language: "en", - }, - context: createPipelineContext(), - }); - const configJson = JSON.stringify(search.nativeStaticConfig); - const packageBytes = 
prepareNativeSearchPackage({ - binding: adapters.native, - config: search.nativeStaticConfig, - compressed: true, - }); - const anonymizer = createNativeAnonymizerFromPackage({ - binding: adapters.native, - packageBytes, - }); + for (const language of CONTRACT_FIXTURE_LANGUAGES) { + const fixtures = loadContractFixtureCases(language); + const scopedConfig = applyPipelineLanguageScope({ + ...contractTestConfig(`native-facade-fixture-parity-${language}`), + language, + }); + const dictionaryScope: Parameters[0] = {}; + if (scopedConfig.denyListCountries !== undefined) { + dictionaryScope.denyListCountries = scopedConfig.denyListCountries; + } + if (scopedConfig.nameCorpusLanguages !== undefined) { + dictionaryScope.nameCorpusLanguages = scopedConfig.nameCorpusLanguages; + } + const dictionaries = await loadTestDictionaries(dictionaryScope); + const search = await preparePipelineSearch({ + config: { + ...scopedConfig, + dictionaries, + }, + context: createPipelineContext(), + }); + const configJson = JSON.stringify(search.nativeStaticConfig); + const packageBytes = prepareNativeSearchPackage({ + binding: adapters.native, + config: search.nativeStaticConfig, + compressed: true, + }); + const anonymizer = createNativeAnonymizerFromPackage({ + binding: adapters.native, + packageBytes, + }); - const tsResult = toBindingStaticResult( - anonymizer.redactStaticEntities(fixtureText), - ); - const pyResult = callPythonPreparedWithPackage( - adapters.pythonModulePath, - adapters.tempDir, - Buffer.from(packageBytes), - fixtureText, - null, - "prepare_static_search_compressed_package_bytes", - configJson, - ); + const tsResults = fixtures.map(({ text }) => + toBindingStaticResult(anonymizer.redactStaticEntities(text)), + ); + const pyResults = callPythonPreparedPackageCases( + adapters.pythonModulePath, + adapters.tempDir, + Buffer.from(packageBytes), + fixtures.map(({ text }) => ({ text, operators: null })), + "prepare_static_search_compressed_package_bytes", + configJson, + ); 
- expect(tsResult).toEqual(pyResult); + for (const [index, fixture] of fixtures.entries()) { + expect({ + fixture: `${language}/${fixture.name}`, + result: pyResults.at(index), + }).toEqual({ + fixture: `${language}/${fixture.name}`, + result: tsResults.at(index), + }); + } + } }); test("JSON operator config accepts camel-case redactString", () => { @@ -991,6 +1047,43 @@ const callPythonPreparedWithPackage = ( return JSON.parse(output); }; +const callPythonPreparedPackageCases = ( + pythonModulePath: string, + tempDir: string, + packageBytes: Buffer, + cases: Array<{ + text: string; + operators: Record | null; + }>, + prepareFn = "prepare_static_search_package_bytes", + configJson = CONFIG_JSON, +): StaticRedactionResult[] => { + const payloadPath = join(tempDir, "prepared-package-cases-payload.json"); + const packagePath = join(tempDir, "prepared-package-cases.bin"); + writeFileSync(packagePath, packageBytes); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: configJson, + cases: cases.map(({ text, operators }) => ({ + text, + operators_json: operatorConfigJson(operators), + })), + }), + ); + const output = runCommand( + "python3", + ["-c", PYTHON_PREPARED_PACKAGE_CASES_SCRIPT], + { + STELLA_ANONYMIZE_PACKAGE: packagePath, + STELLA_ANONYMIZE_PACKAGE_PREPARE_FN: prepareFn, + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }, + ); + return JSON.parse(output); +}; + const callPythonDiagnostics = ( pythonModulePath: string, text: string, @@ -1091,6 +1184,17 @@ const toBindingPipelineEntity = ({ source_detail: sourceDetail ?? 
null, }); +const loadContractFixtureCases = ( + language: (typeof CONTRACT_FIXTURE_LANGUAGES)[number], +): ContractFixtureCase[] => + readdirSync(join(CONTRACT_FIXTURES_DIR, language)) + .filter((name) => name.endsWith(".txt")) + .toSorted() + .map((name) => ({ + name, + text: readFileSync(join(CONTRACT_FIXTURES_DIR, language, name), "utf8"), + })); + const operatorConfigJson = ( operators: Record | null, ): string | undefined => { From 4b14fa58ef84aaa4a017e933d7cf433d59898c53 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 21:22:04 +0200 Subject: [PATCH 046/130] feat: expose native binding version --- crates/anonymize-napi/src/lib.rs | 6 ++ crates/anonymize-py/src/lib.rs | 7 +++ .../__test__/native-adapter-parity.test.ts | 61 +++++++++++++++++++ packages/anonymize/src/index-shared.ts | 3 + packages/anonymize/src/native.ts | 22 +++++++ 5 files changed, 99 insertions(+) diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index 45c7a1a1..c05bde15 100644 --- a/crates/anonymize-napi/src/lib.rs +++ b/crates/anonymize-napi/src/lib.rs @@ -227,6 +227,12 @@ pub fn normalize_for_search(text: String) -> String { stella_anonymize_core::normalize_for_search(&text) } +#[napi] +#[must_use] +pub fn native_package_version() -> String { + String::from(env!("CARGO_PKG_VERSION")) +} + #[napi] #[allow(clippy::needless_pass_by_value)] pub fn redact_static_entities_json( diff --git a/crates/anonymize-py/src/lib.rs b/crates/anonymize-py/src/lib.rs index 7bd7716e..a9aa52fa 100644 --- a/crates/anonymize-py/src/lib.rs +++ b/crates/anonymize-py/src/lib.rs @@ -265,6 +265,12 @@ fn normalize_for_search(text: &str) -> String { stella_anonymize_core::normalize_for_search(text) } +#[pyfunction] +#[allow(clippy::missing_const_for_fn)] +fn native_package_version() -> &'static str { + env!("CARGO_PKG_VERSION") +} + fn parse_prepared_search_config( config_json: &str, ) -> PyResult { @@ -446,5 +452,6 @@ fn stella_anonymize_core_py(module: &Bound<'_, 
PyModule>) -> PyResult<()> { module )?)?; module.add_function(wrap_pyfunction!(normalize_for_search, module)?)?; + module.add_function(wrap_pyfunction!(native_package_version, module)?)?; Ok(()) } diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 8ce6068f..10b300c5 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -13,7 +13,9 @@ import { createRequire } from "node:module"; import { describe, expect, setDefaultTimeout, test } from "bun:test"; import fc from "fast-check"; import { + assertNativeBindingVersion, createNativeAnonymizerFromPackage, + getNativeBindingVersion, prepareNativeSearchPackage, type NativeAnonymizeBinding, type NativeOperatorConfig, @@ -33,6 +35,7 @@ type NativeAdapter = Omit< | "prepareStaticSearchCompressedPackageBytes" > & { normalizeForSearch: (text: string) => string; + nativePackageVersion: () => string; NativePreparedSearch: NativeAnonymizeBinding["NativePreparedSearch"] & { new (configJson: string): NativePreparedSearchBinding; fromConfigJsonAndArtifactBytes: ( @@ -231,6 +234,21 @@ results = [ print(json.dumps(results)) `; +const PYTHON_VERSION_SCRIPT = ` +import importlib.util +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +spec = importlib.util.spec_from_file_location( + "stella_anonymize_core_py", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +print(module.native_package_version()) +`; + const PYTHON_PREPARED_ARTIFACT_SCRIPT = ` import importlib.util import json @@ -425,6 +443,26 @@ const generatedCaseArb: fc.Arbitrary = fc ); describe("native adapter parity", () => { + test("native adapter versions match package metadata", () => { + const adapters = getAdapters(); + const packageVersion = packageJsonVersion(); + + 
expect(getNativeBindingVersion(adapters.native)).toBe(packageVersion); + expect(callPythonVersion(adapters.pythonModulePath)).toBe(packageVersion); + expect(() => + assertNativeBindingVersion({ + binding: adapters.native, + expectedVersion: packageVersion, + }), + ).not.toThrow(); + expect(() => + assertNativeBindingVersion({ + binding: adapters.native, + expectedVersion: "0.0.0", + }), + ).toThrow(); + }); + test("normalization is identical through TS and Python adapters", () => { const adapters = getAdapters(); const text = "Číslo\u00a0PAS - 1234 / Fuzztovn"; @@ -855,6 +893,10 @@ const loadNativeAdapter = (nativePath: string): NativeAdapter => { Object(loaded), "NativePreparedSearch", ); + const nativePackageVersion = Reflect.get( + Object(loaded), + "nativePackageVersion", + ); const redactStaticEntitiesJson = Reflect.get( Object(loaded), "redactStaticEntitiesJson", @@ -878,6 +920,7 @@ const loadNativeAdapter = (nativePath: string): NativeAdapter => { if ( typeof NativePreparedSearch !== "function" || typeof normalizeForSearch !== "function" || + typeof nativePackageVersion !== "function" || typeof prepareStaticSearchArtifactsBytes !== "function" || typeof prepareStaticSearchPackageBytes !== "function" || typeof prepareStaticSearchCompressedPackageBytes !== "function" || @@ -890,6 +933,7 @@ const loadNativeAdapter = (nativePath: string): NativeAdapter => { NativePreparedSearch: NativePreparedSearch as NativeAdapter["NativePreparedSearch"], normalizeForSearch, + nativePackageVersion, prepareStaticSearchArtifactsBytes, prepareStaticSearchPackageBytes, prepareStaticSearchCompressedPackageBytes, @@ -988,6 +1032,11 @@ print(module.normalize_for_search(payload["text"])) } }; +const callPythonVersion = (pythonModulePath: string): string => + runCommand("python3", ["-c", PYTHON_VERSION_SCRIPT], { + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }).trimEnd(); + const callPythonPreparedWithArtifacts = ( pythonModulePath: string, tempDir: string, @@ -1195,6 +1244,18 @@ 
const loadContractFixtureCases = ( text: readFileSync(join(CONTRACT_FIXTURES_DIR, language, name), "utf8"), })); +const packageJsonVersion = (): string => { + const packageJson = JSON.parse( + readFileSync(join(ROOT_DIR, "packages", "anonymize", "package.json"), { + encoding: "utf8", + }), + ) as { version?: unknown }; + if (typeof packageJson.version !== "string") { + throw new TypeError("Package version is missing"); + } + return packageJson.version; +}; + const operatorConfigJson = ( operators: Record | null, ): string | undefined => { diff --git a/packages/anonymize/src/index-shared.ts b/packages/anonymize/src/index-shared.ts index 3af7f1ce..ac2a3ad3 100644 --- a/packages/anonymize/src/index-shared.ts +++ b/packages/anonymize/src/index-shared.ts @@ -52,15 +52,18 @@ export type { // ── Native Adapter ─────────────────────────────── export { PreparedNativeAnonymizer, + assertNativeBindingVersion, createNativeAnonymizerFromConfig, createNativeAnonymizerFromPackage, encodeNativeSearchConfig, + getNativeBindingVersion, prepareNativeSearchPackage, } from "./native"; export type { NativeAnonymizeBinding, NativeAnonymizerFromConfigOptions, NativeAnonymizerFromPackageOptions, + NativeBindingVersionOptions, NativeOperatorConfig, NativePipelineEntity, NativePreparedSearchBinding, diff --git a/packages/anonymize/src/native.ts b/packages/anonymize/src/native.ts index 63e38999..be58ed60 100644 --- a/packages/anonymize/src/native.ts +++ b/packages/anonymize/src/native.ts @@ -51,6 +51,7 @@ export type NativePreparedSearchBinding = { }; export type NativeAnonymizeBinding = { + nativePackageVersion: () => string; NativePreparedSearch: { fromConfigJsonBytes: ( configJson: Uint8Array, @@ -108,6 +109,11 @@ export type NativeAnonymizerFromPackageOptions = { packageBytes: Uint8Array; }; +export type NativeBindingVersionOptions = { + binding: NativeAnonymizeBinding; + expectedVersion: string; +}; + export class PreparedNativeAnonymizer { readonly #prepared: 
NativePreparedSearchBinding; @@ -147,6 +153,22 @@ export const encodeNativeSearchConfig = ( config: NativePreparedSearchConfig, ): Uint8Array => new TextEncoder().encode(JSON.stringify(config)); +export const getNativeBindingVersion = ( + binding: NativeAnonymizeBinding, +): string => binding.nativePackageVersion(); + +export const assertNativeBindingVersion = ({ + binding, + expectedVersion, +}: NativeBindingVersionOptions): void => { + const actualVersion = getNativeBindingVersion(binding); + if (actualVersion !== expectedVersion) { + throw new Error( + `Native anonymize binding version ${actualVersion} does not match ${expectedVersion}`, + ); + } +}; + export const prepareNativeSearchPackage = ({ binding, config, From 3a1f50eb08e898554ae10426e17adb52a3e09faf Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 21:38:33 +0200 Subject: [PATCH 047/130] feat: add native node loader --- .github/tools/check-packlist.mjs | 2 + packages/anonymize/package.json | 5 + packages/anonymize/scripts/dist-smoke.mjs | 4 + .../src/__test__/native-node.test.ts | 113 ++++++++++ packages/anonymize/src/native-node.ts | 196 ++++++++++++++++++ packages/anonymize/tsdown.config.ts | 7 +- 6 files changed, 326 insertions(+), 1 deletion(-) create mode 100644 packages/anonymize/src/__test__/native-node.test.ts create mode 100644 packages/anonymize/src/native-node.ts diff --git a/.github/tools/check-packlist.mjs b/.github/tools/check-packlist.mjs index 1631236e..70229370 100644 --- a/.github/tools/check-packlist.mjs +++ b/.github/tools/check-packlist.mjs @@ -9,6 +9,8 @@ const PACKAGES = [ "dist/index.mjs", "dist/native.d.mts", "dist/native.mjs", + "dist/native-node.d.mts", + "dist/native-node.mjs", // Dynamically imported corpus chunk; missing means the // bundler stopped resolving the non-Western name imports. 
"dist/names-nw-in.mjs", diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index 748556d9..8f63448b 100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -18,6 +18,11 @@ "types": "./dist/native.d.mts", "import": "./dist/native.mjs", "default": "./dist/native.mjs" + }, + "./native-node": { + "types": "./dist/native-node.d.mts", + "import": "./dist/native-node.mjs", + "default": "./dist/native-node.mjs" } }, "types": "dist/index.d.mts", diff --git a/packages/anonymize/scripts/dist-smoke.mjs b/packages/anonymize/scripts/dist-smoke.mjs index 3d4facff..23c73194 100644 --- a/packages/anonymize/scripts/dist-smoke.mjs +++ b/packages/anonymize/scripts/dist-smoke.mjs @@ -11,10 +11,14 @@ */ import { createPipelineContext, runPipeline } from "../dist/index.mjs"; import { createNativeAnonymizerFromPackage } from "../dist/native.mjs"; +import { loadNativeAnonymizeBinding } from "../dist/native-node.mjs"; if (typeof createNativeAnonymizerFromPackage !== "function") { throw new TypeError("dist native entrypoint is missing its package loader"); } +if (typeof loadNativeAnonymizeBinding !== "function") { + throw new TypeError("dist native-node entrypoint is missing its loader"); +} const warnings = []; const originalWarn = console.warn; diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts new file mode 100644 index 00000000..dd037b69 --- /dev/null +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -0,0 +1,113 @@ +import { describe, expect, test } from "bun:test"; + +import type { NativeAnonymizeBinding } from "../native"; +import { + loadNativeAnonymizeBinding, + nativePlatformPackageName, +} from "../native-node"; + +describe("native node loader", () => { + test("maps supported platform package names", () => { + expect( + nativePlatformPackageName({ platform: "darwin", arch: "arm64" }), + ).toBe("@stll/anonymize-darwin-arm64"); + 
expect(nativePlatformPackageName({ platform: "darwin", arch: "x64" })).toBe( + "@stll/anonymize-darwin-x64", + ); + expect(nativePlatformPackageName({ platform: "linux", arch: "x64" })).toBe( + "@stll/anonymize-linux-x64-gnu", + ); + expect( + nativePlatformPackageName({ platform: "linux", arch: "arm64" }), + ).toBe("@stll/anonymize-linux-arm64-gnu"); + expect(nativePlatformPackageName({ platform: "win32", arch: "x64" })).toBe( + "@stll/anonymize-win32-x64-msvc", + ); + expect( + nativePlatformPackageName({ + platform: "linux", + arch: "x64", + libc: "musl", + }), + ).toBeNull(); + }); + + test("loads the platform package after the local loader", () => { + const calls: string[] = []; + const binding = fakeNativeBinding("1.5.0"); + const loaded = loadNativeAnonymizeBinding({ + expectedVersion: "1.5.0", + platform: "darwin", + arch: "arm64", + env: {}, + requireModule: (specifier) => { + calls.push(specifier); + if (specifier === "@stll/anonymize-darwin-arm64") { + return binding; + } + throw new Error("not found"); + }, + }); + + expect(loaded).toBe(binding); + expect(calls).toEqual(["../index.cjs", "@stll/anonymize-darwin-arm64"]); + }); + + test("loads an explicit native library path first", () => { + const calls: string[] = []; + const binding = fakeNativeBinding("1.5.0"); + const loaded = loadNativeAnonymizeBinding({ + expectedVersion: "1.5.0", + env: { STELLA_ANONYMIZE_NATIVE_LIBRARY_PATH: "/tmp/anonymize.node" }, + requireModule: (specifier) => { + calls.push(specifier); + if (specifier === "/tmp/anonymize.node") { + return { default: binding }; + } + throw new Error("not found"); + }, + }); + + expect(loaded).toBe(binding); + expect(calls).toEqual(["/tmp/anonymize.node"]); + }); + + test("rejects mismatched native binding versions", () => { + expect(() => + loadNativeAnonymizeBinding({ + expectedVersion: "1.5.0", + platform: "darwin", + arch: "arm64", + env: {}, + requireModule: (specifier) => { + if (specifier === "@stll/anonymize-darwin-arm64") { + return 
fakeNativeBinding("1.4.0"); + } + throw new Error("not found"); + }, + }), + ).toThrow("does not match 1.5.0"); + }); +}); + +const fakeNativeBinding = (version: string): NativeAnonymizeBinding => ({ + nativePackageVersion: () => version, + NativePreparedSearch: { + fromConfigJsonBytes: () => fakePreparedSearch(), + fromPreparedPackageBytes: () => fakePreparedSearch(), + }, + prepareStaticSearchPackageBytes: () => new Uint8Array(), + prepareStaticSearchCompressedPackageBytes: () => new Uint8Array(), +}); + +const fakePreparedSearch = () => ({ + redactStaticEntities: () => ({ + resolvedEntities: [], + redaction: { + redactedText: "", + redactionMap: [], + operatorMap: [], + entityCount: 0, + }, + }), +}); diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts new file mode 100644 index 00000000..f5b45408 --- /dev/null +++ b/packages/anonymize/src/native-node.ts @@ -0,0 +1,196 @@ +import { createRequire } from "node:module"; +import process from "node:process"; + +import { + assertNativeBindingVersion, + type NativeAnonymizeBinding, +} from "./native"; + +export * from "./native"; + +export type NativeRequire = (specifier: string) => unknown; + +export type NativeLibc = "gnu" | "musl"; + +export type NativePlatformPackageOptions = { + platform: string; + arch: string; + libc?: NativeLibc; +}; + +export type LoadNativeBindingOptions = { + expectedVersion?: string; + platform?: string; + arch?: string; + libc?: NativeLibc; + env?: Record; + requireModule?: NativeRequire; +}; + +const LOCAL_NATIVE_LOADER = "../index.cjs"; +const PACKAGE_SPECIFIC_NATIVE_PATH = "STELLA_ANONYMIZE_NATIVE_LIBRARY_PATH"; + +export const nativePlatformPackageName = ({ + platform, + arch, + libc = "gnu", +}: NativePlatformPackageOptions): string | null => { + if (platform === "darwin" && (arch === "arm64" || arch === "x64")) { + return `@stll/anonymize-darwin-${arch}`; + } + if (platform === "linux" && (arch === "arm64" || arch === "x64")) { + return libc === 
"gnu" ? `@stll/anonymize-linux-${arch}-gnu` : null; + } + if (platform === "win32" && arch === "x64") { + return "@stll/anonymize-win32-x64-msvc"; + } + return null; +}; + +export const loadNativeAnonymizeBinding = ( + options: LoadNativeBindingOptions = {}, +): NativeAnonymizeBinding => { + const requireModule = options.requireModule ?? createRequire(import.meta.url); + const platform = options.platform ?? process.platform; + const arch = options.arch ?? process.arch; + const env = options.env ?? process.env; + const specifiers = nativeBindingSpecifiers({ + platform, + arch, + env, + ...(options.libc !== undefined ? { libc: options.libc } : {}), + }); + const errors: string[] = []; + + for (const specifier of specifiers) { + const binding = tryLoadNativeBinding({ + specifier, + requireModule, + errors, + }); + if (!binding) { + continue; + } + if (options.expectedVersion !== undefined) { + assertNativeBindingVersion({ + binding, + expectedVersion: options.expectedVersion, + }); + } + return binding; + } + + const packageName = nativePlatformPackageName({ + platform, + arch, + ...(options.libc !== undefined ? { libc: options.libc } : {}), + }); + const platformMessage = + packageName === null + ? `Unsupported native anonymize platform ${platform}/${arch}` + : `Unable to load native anonymize binding for ${platform}/${arch}`; + throw new Error(`${platformMessage}:\n${errors.join("\n")}`); +}; + +type NativeBindingSpecifiersOptions = { + platform: string; + arch: string; + libc?: NativeLibc; + env: Record; +}; + +const nativeBindingSpecifiers = ({ + platform, + arch, + libc, + env, +}: NativeBindingSpecifiersOptions): string[] => { + const specifiers: string[] = []; + const overridePath = env[PACKAGE_SPECIFIC_NATIVE_PATH]; + if (overridePath) { + specifiers.push(overridePath); + } + specifiers.push(LOCAL_NATIVE_LOADER); + + const packageName = nativePlatformPackageName({ + platform, + arch, + ...(libc !== undefined ? 
{ libc } : {}), + }); + if (packageName) { + specifiers.push(packageName); + } + return specifiers; +}; + +type TryLoadNativeBindingOptions = { + specifier: string; + requireModule: NativeRequire; + errors: string[]; +}; + +const tryLoadNativeBinding = ({ + specifier, + requireModule, + errors, +}: TryLoadNativeBindingOptions): NativeAnonymizeBinding | null => { + try { + const loaded = requireModule(specifier); + const binding = toNativeAnonymizeBinding(loaded); + if (binding) { + return binding; + } + errors.push(`${specifier}: module does not match native binding shape`); + } catch (error) { + errors.push(`${specifier}: ${formatLoadError(error)}`); + } + return null; +}; + +const toNativeAnonymizeBinding = ( + value: unknown, +): NativeAnonymizeBinding | null => { + const candidate = + isRecord(value) && isRecord(value["default"]) ? value["default"] : value; + return isNativeAnonymizeBinding(candidate) ? candidate : null; +}; + +const isNativeAnonymizeBinding = ( + candidate: unknown, +): candidate is NativeAnonymizeBinding => { + if (!isRecord(candidate)) { + return false; + } + if (typeof candidate["nativePackageVersion"] !== "function") { + return false; + } + if (typeof candidate["prepareStaticSearchPackageBytes"] !== "function") { + return false; + } + if ( + typeof candidate["prepareStaticSearchCompressedPackageBytes"] !== "function" + ) { + return false; + } + const preparedSearch = candidate["NativePreparedSearch"]; + if (!isRecord(preparedSearch)) { + return false; + } + if (typeof preparedSearch["fromConfigJsonBytes"] !== "function") { + return false; + } + if (typeof preparedSearch["fromPreparedPackageBytes"] !== "function") { + return false; + } + return true; +}; + +const isRecord = (value: unknown): value is Record => + typeof value === "object" && value !== null; + +const formatLoadError = (error: unknown): string => { + if (error instanceof Error) { + return error.message; + } + return String(error); +}; diff --git 
a/packages/anonymize/tsdown.config.ts b/packages/anonymize/tsdown.config.ts index 4f35e505..df148b70 100644 --- a/packages/anonymize/tsdown.config.ts +++ b/packages/anonymize/tsdown.config.ts @@ -2,7 +2,12 @@ import { defineConfig } from "tsdown"; export default defineConfig([ { - entry: ["src/index.ts", "src/constants.ts", "src/native.ts"], + entry: [ + "src/index.ts", + "src/constants.ts", + "src/native.ts", + "src/native-node.ts", + ], outDir: "dist", format: ["esm"], dts: true, From ffff4a3b68b92993a681ed542c80bc2ec0c59988 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 22:28:52 +0200 Subject: [PATCH 048/130] fix: tighten native parity edges --- Cargo.lock | 2 +- crates/anonymize-adapter-contract/src/lib.rs | 20 ++ crates/anonymize-core/Cargo.toml | 2 +- crates/anonymize-core/src/address_seeds.rs | 2 +- crates/anonymize-core/src/false_positives.rs | 90 +++++++ crates/anonymize-core/src/processors.rs | 4 + crates/anonymize-core/src/triggers.rs | 254 ++++++++++++++---- .../tests/address_seed_parity.rs | 107 ++++++++ .../tests/false_positive_parity.rs | 148 ++++++++++ crates/anonymize-core/tests/prepared.rs | 9 + crates/anonymize-core/tests/trigger_parity.rs | 171 ++++++++++++ crates/anonymize-py/src/lib.rs | 177 ++++++------ .../__test__/native-adapter-parity.test.ts | 119 ++++++++ .../src/__test__/pipeline-config.test.ts | 24 ++ .../anonymize/src/build-unified-search.ts | 38 ++- .../src/data/false-positive-shapes.json | 17 ++ packages/anonymize/src/detectors/deny-list.ts | 80 +++++- packages/anonymize/src/detectors/regex.ts | 12 + 18 files changed, 1133 insertions(+), 143 deletions(-) create mode 100644 crates/anonymize-core/tests/address_seed_parity.rs create mode 100644 crates/anonymize-core/tests/false_positive_parity.rs create mode 100644 crates/anonymize-core/tests/trigger_parity.rs create mode 100644 packages/anonymize/src/data/false-positive-shapes.json diff --git a/Cargo.lock b/Cargo.lock index f0f5fd44..6fef9ddb 100644 --- a/Cargo.lock 
+++ b/Cargo.lock @@ -640,7 +640,7 @@ dependencies = [ [[package]] name = "stella-stdnum-core" version = "2.1.1" -source = "git+https://github.com/stella/stdnum?rev=614ac70ef76161e2bbb6dcbe5ecc3923bbe8c33f#614ac70ef76161e2bbb6dcbe5ecc3923bbe8c33f" +source = "git+https://github.com/stella/stdnum?rev=b4949ece8981b84c53a21c26f7a5068dba553142#b4949ece8981b84c53a21c26f7a5068dba553142" [[package]] name = "stella-text-search-core" diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 246e6734..4f7ad12f 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -161,6 +161,8 @@ pub struct BindingTriggerData { pub address_stop_keywords: Vec, #[serde(default)] pub party_position_terms: Vec, + #[serde(default)] + pub sentence_terminal_currency_terms: Vec, } #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] @@ -352,10 +354,18 @@ pub struct BindingDenyListFilterData { #[serde(default)] pub address_jurisdiction_prefixes: Vec, pub street_types: Vec, + #[serde(default)] + pub address_component_terms: Vec, pub first_names: Vec, pub generic_roles: Vec, + #[serde(default)] + pub number_abbrev_prefixes: Vec, pub sentence_starters: Vec, pub trailing_address_word_exclusions: Vec, + #[serde(default)] + pub document_heading_words: Vec, + #[serde(default)] + pub document_heading_ordinal_markers: Vec, pub defined_term_cues: Vec, #[serde(default)] pub signing_place_guards: Vec, @@ -472,6 +482,7 @@ struct BinaryTriggerData { rules: Vec, address_stop_keywords: Vec, party_position_terms: Vec, + sentence_terminal_currency_terms: Vec, } #[derive(Deserialize, Serialize)] @@ -704,6 +715,7 @@ impl From for BinaryTriggerData { .collect(), address_stop_keywords: data.address_stop_keywords, party_position_terms: data.party_position_terms, + sentence_terminal_currency_terms: data.sentence_terminal_currency_terms, } } } @@ -718,6 +730,7 @@ impl From for BindingTriggerData { 
.collect(), address_stop_keywords: data.address_stop_keywords, party_position_terms: data.party_position_terms, + sentence_terminal_currency_terms: data.sentence_terminal_currency_terms, } } } @@ -1683,12 +1696,18 @@ fn deny_list_filters_from_binding( filters.address_jurisdiction_prefixes, ), street_types: lower_set(filters.street_types), + address_component_terms: lower_set(filters.address_component_terms), first_names: lower_set(filters.first_names), generic_roles: lower_set(filters.generic_roles), + number_abbrev_prefixes: lower_set(filters.number_abbrev_prefixes), sentence_starters: lower_set(filters.sentence_starters), trailing_address_word_exclusions: lower_set( filters.trailing_address_word_exclusions, ), + document_heading_words: lower_set(filters.document_heading_words), + document_heading_ordinal_markers: lower_set( + filters.document_heading_ordinal_markers, + ), defined_term_cues: lower_set(filters.defined_term_cues), signing_place_guards: filters .signing_place_guards @@ -1714,6 +1733,7 @@ fn trigger_data_from_binding( address_stop_keywords: data.address_stop_keywords, party_position_terms: data.party_position_terms, legal_form_suffixes, + sentence_terminal_currency_terms: data.sentence_terminal_currency_terms, } } diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index ad0deb8b..0c8573ec 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -13,7 +13,7 @@ categories = ["text-processing"] fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } -stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "614ac70ef76161e2bbb6dcbe5ecc3923bbe8c33f" } +stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "b4949ece8981b84c53a21c26f7a5068dba553142" } stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "0cfaad48a3df24f918cf52a2d5aaf32f5a031148" } [lints] 
diff --git a/crates/anonymize-core/src/address_seeds.rs b/crates/anonymize-core/src/address_seeds.rs index e9cfc5a0..a08d749c 100644 --- a/crates/anonymize-core/src/address_seeds.rs +++ b/crates/anonymize-core/src/address_seeds.rs @@ -40,7 +40,7 @@ impl PreparedAddressSeedData { boundary_search: literal_search(data.boundary_words)?, br_cep_cue_search: literal_search(data.br_cep_cue_words)?, postal_code_re: compile_regex( - r"(?u)(?:\d{3}\s\d{2}|\d{2}[-‐‑‒–—―]\d{3}|\d{5}|\d{5}[-‐‑‒–—―]\d{3}|\d{5}[-‐‑‒–—―]\d{4})", + r"(?u)(?:\d{5}[-‐‑‒–—―]\d{4}|\d{5}[-‐‑‒–—―]\d{3}|\d{3}\s\d{2}|\d{2}[-‐‑‒–—―]\d{3}|\d{5})", )?, br_cep_shape_re: compile_regex(r"(?u)^\d{5}[-‐‑‒–—―]\d{3}$")?, us_zip_plus_four_shape_re: compile_regex(r"(?u)^\d{5}[-‐‑‒–—―]\d{4}$")?, diff --git a/crates/anonymize-core/src/false_positives.rs b/crates/anonymize-core/src/false_positives.rs index 271fd9c0..f86b45ce 100644 --- a/crates/anonymize-core/src/false_positives.rs +++ b/crates/anonymize-core/src/false_positives.rs @@ -123,6 +123,13 @@ fn should_reject_entity( if is_standalone_year(text) && entity.source != DetectionSource::Trigger { return Ok(true); } + if entity.source != DetectionSource::Trigger + && text.chars().next().is_some_and(|ch| ch.is_ascii_digit()) + && let Some(filters) = filters + && has_number_abbrev_prefix(full_text, offsets, entity, filters)? 
+ { + return Ok(true); + } if entity.label == REGISTRATION_NUMBER_LABEL && is_short_letter_run(text) { return Ok(true); } @@ -150,6 +157,12 @@ fn should_reject_entity( { return Ok(true); } + if entity.label == ORGANIZATION_LABEL + && filters + .is_some_and(|filters| is_document_structure_heading(text, filters)) + { + return Ok(true); + } if entity.label == ADDRESS_LABEL && should_reject_address(entity, filters) { return Ok(true); } @@ -241,6 +254,79 @@ fn is_standalone_year(text: &str) -> bool { && (trimmed.starts_with("19") || trimmed.starts_with("20")) } +fn has_number_abbrev_prefix( + full_text: &str, + offsets: &ByteOffsets<'_>, + entity: &PipelineEntity, + filters: &DenyListFilterData, +) -> Result { + let start = offsets.validate_offset(entity.start)?; + let before = full_text.get(..start).ok_or(Error::InvalidSpan { + start: entity.start, + end: entity.end, + })?; + Ok(ends_with_number_abbrev(before, filters)) +} + +fn ends_with_number_abbrev(text: &str, filters: &DenyListFilterData) -> bool { + let lower = text.trim_end().to_lowercase(); + filters.number_abbrev_prefixes.iter().any(|prefix| { + let Some(before_prefix) = lower.strip_suffix(prefix) else { + return false; + }; + before_prefix + .chars() + .next_back() + .is_none_or(|ch| ch.is_whitespace() || ch == '(') + }) +} + +fn is_document_structure_heading( + text: &str, + filters: &DenyListFilterData, +) -> bool { + let Some((word_end, word)) = first_word(text.trim_start()) else { + return false; + }; + if !filters + .document_heading_words + .contains(&word.to_lowercase()) + { + return false; + } + let Some(rest) = text.trim_start().get(word_end..) 
else { + return false; + }; + starts_with_ordinal_marker_digit(rest, filters) +} + +fn starts_with_ordinal_marker_digit( + text: &str, + filters: &DenyListFilterData, +) -> bool { + let trimmed = text.trim_start(); + let lower = trimmed.to_lowercase(); + filters + .document_heading_ordinal_markers + .iter() + .any(|marker| { + if marker.is_empty() { + return false; + } + if !lower.starts_with(marker) { + return false; + } + let Some(rest) = trimmed.get(marker.len()..) else { + return false; + }; + rest + .trim_start() + .chars() + .next() + .is_some_and(|ch| ch.is_ascii_digit()) + }) +} + fn is_short_letter_run(text: &str) -> bool { let letters = text.trim(); (1..=2).contains(&letters.chars().count()) @@ -501,6 +587,10 @@ fn has_address_component(text: &str, filters: &DenyListFilterData) -> bool { .street_types .iter() .any(|component| contains_component(&lower, component)) + || filters + .address_component_terms + .iter() + .any(|component| contains_component(&lower, component)) } fn is_jurisdiction_address(text: &str, filters: &DenyListFilterData) -> bool { diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index 01f61cea..66562afe 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -254,10 +254,14 @@ pub struct DenyListFilterData { pub address_stopwords: BTreeSet, pub address_jurisdiction_prefixes: BTreeSet, pub street_types: BTreeSet, + pub address_component_terms: BTreeSet, pub first_names: BTreeSet, pub generic_roles: BTreeSet, + pub number_abbrev_prefixes: BTreeSet, pub sentence_starters: BTreeSet, pub trailing_address_word_exclusions: BTreeSet, + pub document_heading_words: BTreeSet, + pub document_heading_ordinal_markers: BTreeSet, pub defined_term_cues: BTreeSet, pub signing_place_guards: Vec, } diff --git a/crates/anonymize-core/src/triggers.rs b/crates/anonymize-core/src/triggers.rs index 9b0db946..9e969e8c 100644 --- a/crates/anonymize-core/src/triggers.rs +++ 
b/crates/anonymize-core/src/triggers.rs @@ -21,6 +21,7 @@ pub struct TriggerData { pub address_stop_keywords: Vec, pub party_position_terms: Vec, pub legal_form_suffixes: Vec, + pub sentence_terminal_currency_terms: Vec, } #[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] @@ -73,6 +74,7 @@ pub(crate) struct PreparedTriggerData { address_stop_keywords: Vec, party_position_terms: Vec, legal_form_suffixes: Vec, + sentence_terminal_currency_terms: Vec, } struct PreparedTriggerRule { @@ -81,7 +83,6 @@ struct PreparedTriggerRule { strategy: PreparedTriggerStrategy, validations: Vec, include_trigger: bool, - requires_exact_case: bool, } enum PreparedTriggerStrategy { @@ -119,6 +120,12 @@ struct ExtractedValue { text: String, } +struct TriggerExtractionData<'a> { + address_stop_keywords: &'a [String], + party_position_terms: &'a [String], + sentence_terminal_currency_terms: &'a [String], +} + impl PreparedTriggerData { pub(crate) fn new(data: TriggerData) -> Result { let rules = data @@ -131,13 +138,17 @@ impl PreparedTriggerData { address_stop_keywords: data.address_stop_keywords, party_position_terms: data.party_position_terms, legal_form_suffixes: data.legal_form_suffixes, + sentence_terminal_currency_terms: data + .sentence_terminal_currency_terms + .into_iter() + .filter(|term| !term.is_empty()) + .collect(), }) } } impl PreparedTriggerRule { fn new(rule: TriggerRule) -> Result { - let requires_exact_case = requires_exact_case_trigger(&rule.trigger); Ok(Self { trigger: rule.trigger, label: rule.label, @@ -148,7 +159,6 @@ impl PreparedTriggerRule { .map(PreparedTriggerValidation::new) .collect::>>()?, include_trigger: rule.include_trigger, - requires_exact_case, }) } } @@ -208,6 +218,11 @@ pub(crate) fn process_trigger_matches( ) -> Result> { let offsets = ByteOffsets::new(full_text); let mut results = Vec::new(); + let extraction_data = TriggerExtractionData { + address_stop_keywords: &data.address_stop_keywords, + party_position_terms: 
&data.party_position_terms, + sentence_terminal_currency_terms: &data.sentence_terminal_currency_terms, + }; for found in matches { let Some(local_index) = slice.local_index(found.pattern()) else { @@ -222,20 +237,13 @@ pub(crate) fn process_trigger_matches( if !has_right_boundary(full_text, &offsets, found.end(), &rule.trigger)? { continue; } - if rule.requires_exact_case - && !matches_trigger_case(full_text, &offsets, found, rule)? - { - continue; - } - let Some(raw_value) = extract_value( full_text, &offsets, found.end(), &rule.strategy, &rule.label, - &data.address_stop_keywords, - &data.party_position_terms, + &extraction_data, )? else { continue; @@ -305,8 +313,7 @@ fn extract_value( trigger_end: u32, strategy: &PreparedTriggerStrategy, label: &str, - address_stop_keywords: &[String], - party_position_terms: &[String], + data: &TriggerExtractionData<'_>, ) -> Result> { let trigger_end_byte = offsets.validate_offset(trigger_end)?; let lookahead = get_trigger_lookahead(strategy); @@ -334,6 +341,7 @@ fn extract_value( label, stop_words, max_length.unwrap_or(MAX_TRIGGER_VALUE_LEN), + data.sentence_terminal_currency_terms, ), PreparedTriggerStrategy::ToEndOfLine => { extract_to_end_of_line(remaining, stripped, value_start_byte, label) @@ -348,8 +356,9 @@ fn extract_value( stripped, value_start_byte, max_chars.unwrap_or(120), - address_stop_keywords, - party_position_terms, + data.address_stop_keywords, + data.party_position_terms, + data.sentence_terminal_currency_terms, ), PreparedTriggerStrategy::MatchPattern { regex } => { extract_match_pattern(stripped, value_start_byte, regex) @@ -364,6 +373,7 @@ fn extract_to_next_comma( label: &str, stop_words: &[String], length_cap: usize, + sentence_terminal_currency_terms: &[String], ) -> Option { let mut end = 0; while end < value_text.len() { @@ -373,7 +383,13 @@ fn extract_to_next_comma( if matches!(ch, '\n' | '(' | ')' | '[' | ']' | '\t' | ';') { break; } - if ch == '.' 
&& is_sentence_terminator(value_text, end) { + if ch == '.' + && is_sentence_terminator( + value_text, + end, + sentence_terminal_currency_terms, + ) + { break; } if hits_stop_word(value_text, end, stop_words) { @@ -545,6 +561,7 @@ fn extract_address( max_len: usize, stop_keywords: &[String], party_position_terms: &[String], + sentence_terminal_currency_terms: &[String], ) -> Option { if let Some(trimmed) = trim_leading_party_position(value_text, party_position_terms) @@ -585,7 +602,11 @@ fn extract_address( next_ch.is_alphabetic() || next_ch.is_ascii_digit() }) }) - && !is_sentence_terminator(value_text, end) + && !is_sentence_terminator( + value_text, + end, + sentence_terminal_currency_terms, + ) { end = end.saturating_add(len); continue; @@ -807,32 +828,6 @@ fn has_right_boundary( ) } -fn matches_trigger_case( - text: &str, - offsets: &ByteOffsets<'_>, - found: &SearchMatch, - rule: &PreparedTriggerRule, -) -> Result { - Ok(offsets.slice(text, found.start(), found.end())? == rule.trigger) -} - -fn requires_exact_case_trigger(trigger: &str) -> bool { - let mut letters = 0usize; - for ch in trigger.chars() { - if ch.is_whitespace() { - return false; - } - if !ch.is_alphabetic() { - continue; - } - letters = letters.saturating_add(1); - if !ch.is_uppercase() { - return false; - } - } - letters >= 2 -} - fn char_at( text: &str, offsets: &ByteOffsets<'_>, @@ -943,28 +938,112 @@ fn post_nominal_len(text: &str) -> Option { (token_end > 0).then_some(len_before.saturating_add(token_end)) } -fn is_sentence_terminator(text: &str, period_byte: usize) -> bool { +fn is_sentence_terminator( + text: &str, + period_byte: usize, + sentence_terminal_currency_terms: &[String], +) -> bool { let Some(tail) = text.get(period_byte..) 
else { return false; }; - let starts_next = tail.strip_prefix('.').is_some_and(|after| { - after.trim_start().is_empty() || after.starts_with(char::is_whitespace) - }); - if !starts_next { + if !next_is_sentence_start(tail) { return false; } let head = text.get(..period_byte).unwrap_or_default(); - head - .chars() - .rev() - .take_while(|ch| ch.is_alphabetic()) - .filter(|ch| ch.is_lowercase()) - .count() - >= 5 + lowercase_tail_len(head) >= 5 + || currency_tail(head, sentence_terminal_currency_terms) || head .chars() .next_back() .is_some_and(|ch| ch.is_ascii_digit()) + || (proper_noun_tail(head) && next_is_real_sentence(tail)) +} + +fn next_is_sentence_start(tail: &str) -> bool { + let Some(after_period) = tail.strip_prefix('.') else { + return false; + }; + if after_period.trim_start().is_empty() { + return true; + } + if !after_period.starts_with(char::is_whitespace) { + return false; + } + after_period + .trim_start() + .chars() + .next() + .is_some_and(char::is_uppercase) +} + +fn next_is_real_sentence(tail: &str) -> bool { + let Some(after_period) = tail.strip_prefix('.') else { + return false; + }; + if !after_period.starts_with(char::is_whitespace) { + return false; + } + let mut chars = after_period.trim_start().chars(); + chars.next().is_some_and(char::is_uppercase) + && chars.next().is_some_and(char::is_lowercase) + && chars.next().is_some_and(char::is_lowercase) +} + +fn lowercase_tail_len(text: &str) -> usize { + text + .chars() + .rev() + .take_while(|ch| ch.is_lowercase()) + .count() +} + +fn currency_tail( + text: &str, + sentence_terminal_currency_terms: &[String], +) -> bool { + sentence_terminal_currency_terms + .iter() + .any(|term| has_currency_code_tail(text, term)) +} + +fn has_currency_code_tail(text: &str, code: &str) -> bool { + let Some(start) = text.len().checked_sub(code.len()) else { + return false; + }; + let Some(tail) = text.get(start..) 
else { + return false; + }; + if tail.to_lowercase() != code.to_lowercase() { + return false; + } + text + .get(..start) + .and_then(|prefix| prefix.chars().next_back()) + .is_none_or(|ch| !ch.is_alphabetic()) +} + +fn proper_noun_tail(text: &str) -> bool { + let mut start = text.len(); + for (index, ch) in text.char_indices().rev() { + if !ch.is_alphabetic() { + break; + } + start = index; + } + let Some(word) = text.get(start..) else { + return false; + }; + let mut chars = word.chars(); + if !chars.next().is_some_and(char::is_uppercase) { + return false; + } + if chars.clone().count() < 3 || !chars.all(char::is_lowercase) { + return false; + } + text + .get(..start) + .and_then(|prefix| prefix.chars().next_back()) + .is_none_or(|ch| !ch.is_alphabetic() && ch != '.') } fn punctuation_only(text: &str) -> bool { @@ -1003,9 +1082,60 @@ fn phone_shape_end(text: &str) -> Option { { end = end.saturating_sub(next_len_backward(text, end)); } + if let Some(extension_len) = + text.get(end..).and_then(phone_extension_suffix_len) + { + end = end.saturating_add(extension_len); + } (end > 0).then_some(end) } +fn phone_extension_suffix_len(text: &str) -> Option { + let leading = text.len().saturating_sub(text.trim_start().len()); + let trimmed = text.get(leading..)?; + for label in ["extension", "ext", "x"] { + let Some(rest) = ascii_case_prefix_rest(trimmed, label) else { + continue; + }; + let (rest, dot_len) = if label == "ext" { + rest + .strip_prefix('.') + .map_or((rest, 0_usize), |after_dot| (after_dot, 1_usize)) + } else { + (rest, 0_usize) + }; + let whitespace = rest.len().saturating_sub(rest.trim_start().len()); + let digits = rest.get(whitespace..)?; + let mut digit_end = 0; + let mut digit_count = 0_usize; + for (index, ch) in digits.char_indices() { + if !ch.is_ascii_digit() || digit_count >= 6 { + break; + } + digit_count = digit_count.saturating_add(1); + digit_end = index.saturating_add(ch.len_utf8()); + } + if digit_count > 0 { + return Some( + leading + 
.saturating_add(label.len()) + .saturating_add(dot_len) + .saturating_add(whitespace) + .saturating_add(digit_end), + ); + } + } + None +} + +fn ascii_case_prefix_rest<'a>(text: &'a str, prefix: &str) -> Option<&'a str> { + let head = text.get(..prefix.len())?; + if !head.eq_ignore_ascii_case(prefix) { + return None; + } + text.get(prefix.len()..) +} + fn next_len_backward(text: &str, byte: usize) -> usize { text .get(..byte) @@ -1173,7 +1303,19 @@ fn id_value_prefix(text: &str) -> Option<&str> { } end = index.saturating_add(ch.len_utf8()); } - (digits >= 2 && end >= 5).then(|| text.get(..end)).flatten() + let candidate = text.get(..end)?; + (digits >= 2 && end >= 5 && !single_digit_dotted_prefix(candidate)) + .then_some(candidate) +} + +fn single_digit_dotted_prefix(text: &str) -> bool { + let mut chars = text.trim_start().chars(); + let Some(first) = chars.next() else { + return false; + }; + first.is_ascii_digit() + && chars.next() == Some('.') + && chars.next().is_some_and(|ch| ch.is_ascii_digit()) } fn has_known_legal_form_suffix(text: &str, suffixes: &[String]) -> bool { diff --git a/crates/anonymize-core/tests/address_seed_parity.rs b/crates/anonymize-core/tests/address_seed_parity.rs new file mode 100644 index 00000000..f280c220 --- /dev/null +++ b/crates/anonymize-core/tests/address_seed_parity.rs @@ -0,0 +1,107 @@ +#![allow(clippy::expect_used)] + +use stella_anonymize_core::{ + AddressSeedData, LiteralSearchOptions, OperatorConfig, PatternSlice, + PreparedSearch, PreparedSearchConfig, PreparedSearchSlices, SearchOptions, + SearchPattern, +}; + +fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { + PreparedSearchConfig { + regex_patterns: vec![], + custom_regex_patterns: vec![], + literal_patterns: vec![], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions::default(), + allowed_labels: vec![], + threshold: 0.0, + slices, + regex_meta: vec![], + 
custom_regex_meta: vec![], + deny_list_data: None, + gazetteer_data: None, + country_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + date_data: None, + monetary_data: None, + } +} + +fn address_texts( + result: &stella_anonymize_core::StaticRedactionResult, +) -> Vec<&str> { + result + .resolved_entities + .iter() + .filter(|entity| entity.label == "address") + .map(|entity| entity.text.as_str()) + .collect() +} + +#[test] +fn detects_state_qualified_zip_plus_four_address_seed() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .expect("address seed data should prepare"); + + let result = prepared + .redact_static_entities( + "Registered office: CA 94304-1050. Notices follow.", + &OperatorConfig::default(), + ) + .expect("static redaction should succeed"); + + assert!( + address_texts(&result).contains(&"CA 94304-1050"), + "resolved address entities: {:?}", + result.resolved_entities, + ); + assert!(!result.redaction.redacted_text.contains("94304-1050")); +} + +#[test] +fn detects_cue_gated_br_cep_address_seed() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Rua"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + street_types: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + address_seed_data: Some(AddressSeedData { + boundary_words: Vec::new(), + br_cep_cue_words: vec![String::from("CEP")], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .expect("address seed data should prepare"); + + let result = prepared + .redact_static_entities( + "Enviar para CEP 
01001-000, Rua Boa Vista, 100. Obrigado.", + &OperatorConfig::default(), + ) + .expect("static redaction should succeed"); + + assert!( + address_texts(&result).contains(&"CEP 01001-000, Rua Boa Vista, 100"), + "resolved address entities: {:?}", + result.resolved_entities, + ); + assert!(!result.redaction.redacted_text.contains("01001-000")); +} diff --git a/crates/anonymize-core/tests/false_positive_parity.rs b/crates/anonymize-core/tests/false_positive_parity.rs new file mode 100644 index 00000000..f38695b3 --- /dev/null +++ b/crates/anonymize-core/tests/false_positive_parity.rs @@ -0,0 +1,148 @@ +#![allow(clippy::expect_used, clippy::unwrap_used)] + +use std::collections::BTreeSet; + +use stella_anonymize_core::{ + DenyListFilterData, DenyListMatchData, OperatorConfig, PatternSlice, + PreparedSearch, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, + SearchOptions, SearchPattern, TriggerData, TriggerRule, TriggerStrategy, +}; + +fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { + PreparedSearchConfig { + regex_patterns: vec![], + custom_regex_patterns: vec![], + literal_patterns: vec![], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions::default(), + allowed_labels: vec![], + threshold: 0.0, + slices, + regex_meta: vec![], + custom_regex_meta: vec![], + deny_list_data: None, + gazetteer_data: None, + country_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + date_data: None, + monetary_data: None, + } +} + +fn empty_deny_list_data(filters: DenyListFilterData) -> DenyListMatchData { + DenyListMatchData { + labels: Vec::>::new().into(), + custom_labels: Vec::>::new().into(), + originals: vec![], + sources: Vec::>::new().into(), + filters: Some(filters), + } +} + +fn set(values: [&str; N]) -> BTreeSet { + values.into_iter().map(String::from).collect() +} + +fn resolved_texts(prepared: &PreparedSearch, text: &str) -> Vec { 
+ prepared + .redact_static_entities(text, &OperatorConfig::default()) + .unwrap() + .resolved_entities + .into_iter() + .map(|entity| entity.text) + .collect() +} + +#[test] +fn keeps_trigger_address_with_extra_component_anchor() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("bytem"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("bytem"), + label: String::from("address"), + strategy: TriggerStrategy::Address { + max_chars: Some(80), + }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + deny_list_data: Some(empty_deny_list_data(DenyListFilterData { + address_component_terms: set(["sídliště"]), + ..DenyListFilterData::default() + })), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + assert_eq!( + resolved_texts(&prepared, "Trvale bytem: sídliště Barrandov."), + [String::from("sídliště Barrandov")] + ); +} + +#[test] +fn rejects_non_trigger_numbers_after_number_abbreviations() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\b\d{4}\b"))], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], + deny_list_data: Some(empty_deny_list_data(DenyListFilterData { + number_abbrev_prefixes: set(["no.", "č.", "nr."]), + ..DenyListFilterData::default() + })), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let text = "Invoice No. 
1234, položka č. 5678, Akte Nr. 9012, account 7777."; + + assert_eq!(resolved_texts(&prepared, text), [String::from("7777")]); +} + +#[test] +fn rejects_document_structure_heading_organizations() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"Schedule No\. 4|Příloha č\. 2|Acme No\. 4", + ))], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("organization", 0.9)], + deny_list_data: Some(empty_deny_list_data(DenyListFilterData { + document_heading_words: set(["schedule", "příloha"]), + document_heading_ordinal_markers: set(["no.", "č."]), + ..DenyListFilterData::default() + })), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let text = "Schedule No. 4\nPříloha č. 2\nAcme No. 4 signed."; + + assert_eq!( + resolved_texts(&prepared, text), + [String::from("Acme No. 4")] + ); +} diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 7bae239b..02a273be 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -509,6 +509,7 @@ fn prepared_search_extracts_written_date_of_birth_trigger() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) }) @@ -553,6 +554,7 @@ fn prepared_search_extends_single_word_written_date_trigger() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) }) @@ -613,6 +615,7 @@ fn prepared_search_extracts_year_after_duplicate_year_word_noise() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + 
sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) }) @@ -655,6 +658,7 @@ fn prepared_search_trigger_caps_by_characters_not_bytes() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) }) @@ -698,6 +702,7 @@ fn prepared_search_trigger_validations_count_characters_not_bytes() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) }) @@ -738,6 +743,7 @@ fn prepared_search_rejects_lowercase_acronym_trigger_collisions() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) }) @@ -785,6 +791,7 @@ fn prepared_search_trims_party_position_before_triggered_address() { address_stop_keywords: Vec::new(), party_position_terms: vec![String::from("prodávajícího")], legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) }) @@ -1047,6 +1054,7 @@ fn prepared_search_keeps_person_name_particles_after_trigger() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) }) @@ -1538,6 +1546,7 @@ fn prepared_search_expands_plain_postal_city_addresses() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), }), deny_list_data: Some(DenyListMatchData { labels: vec![vec![String::from("address")]].into(), diff --git a/crates/anonymize-core/tests/trigger_parity.rs 
b/crates/anonymize-core/tests/trigger_parity.rs new file mode 100644 index 00000000..efa3ef25 --- /dev/null +++ b/crates/anonymize-core/tests/trigger_parity.rs @@ -0,0 +1,171 @@ +#![allow(clippy::expect_used)] + +use stella_anonymize_core::{ + PatternSlice, PreparedSearch, PreparedSearchConfig, PreparedSearchSlices, + SearchOptions, SearchPattern, StaticDetectionResult, TriggerData, + TriggerRule, TriggerStrategy, TriggerValidation, +}; + +fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { + PreparedSearchConfig { + regex_patterns: vec![], + custom_regex_patterns: vec![], + literal_patterns: vec![], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions::default(), + allowed_labels: vec![], + threshold: 0.0, + slices, + regex_meta: vec![], + custom_regex_meta: vec![], + deny_list_data: None, + gazetteer_data: None, + country_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + date_data: None, + monetary_data: None, + } +} + +fn prepared_for_trigger( + trigger: &str, + label: &str, + strategy: TriggerStrategy, +) -> PreparedSearch { + PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: trigger.to_lowercase(), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: trigger.to_owned(), + label: label.to_owned(), + strategy, + validations: Vec::::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: vec![String::from("Kč")], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .expect("trigger config should prepare") +} + +fn 
trigger_texts(result: &StaticDetectionResult) -> Vec<&str> { + result + .trigger_entities + .iter() + .map(|entity| entity.text.as_str()) + .collect() +} + +#[test] +fn uppercase_configured_id_triggers_accept_lowercase_source_forms() { + for (trigger, text, expected) in [ + ("CPF", "cpf: 123.456.789-00", "123.456.789-00"), + ("CNPJ", "cnpj: 12.345.678/0001-95", "12.345.678/0001-95"), + ("DNI", "dni 12345678Z", "12345678Z"), + ("CP", "cp: 08001", "08001"), + ] { + let prepared = prepared_for_trigger( + trigger, + "tax identification number", + TriggerStrategy::CompanyIdValue, + ); + let result = prepared + .detect_static_entities(text) + .expect("static detection should succeed"); + + assert!( + trigger_texts(&result).contains(&expected), + "trigger {trigger} should extract {expected:?}; entities: {:?}", + result.trigger_entities, + ); + } +} + +#[test] +fn labelled_phone_trigger_keeps_extension_suffixes() { + let prepared = + prepared_for_trigger("PHONE", "phone number", TriggerStrategy::ToEndOfLine); + + for (text, expected) in [ + ( + "PHONE: +1 555 123 4567 ext. 89\nNext line.", + "+1 555 123 4567 ext. 89", + ), + ( + "PHONE: +1 555 123 4567 extension 42\nNext line.", + "+1 555 123 4567 extension 42", + ), + ( + "PHONE: +1 555 123 4567 x42\nNext line.", + "+1 555 123 4567 x42", + ), + ] { + let result = prepared + .detect_static_entities(text) + .expect("static detection should succeed"); + + assert!( + trigger_texts(&result).contains(&expected), + "phone trigger should keep extension in {expected:?}; entities: {:?}", + result.trigger_entities, + ); + } +} + +#[test] +fn to_next_comma_stops_after_short_currency_abbreviation_sentence_tail() { + let prepared = prepared_for_trigger( + "fee", + "monetary amount", + TriggerStrategy::ToNextComma { + stop_words: Vec::new(), + max_length: Some(100), + }, + ); + + let result = prepared + .detect_static_entities("fee 100 Kč. 
Termin splatnosti je zítra.") + .expect("static detection should succeed"); + + assert!( + trigger_texts(&result).contains(&"100 Kč"), + "currency sentence tail should stop the capture; entities: {:?}", + result.trigger_entities, + ); +} + +#[test] +fn address_trigger_stops_after_short_proper_noun_before_real_sentence() { + let prepared = prepared_for_trigger( + "office", + "address", + TriggerStrategy::Address { + max_chars: Some(120), + }, + ); + + let result = prepared + .detect_static_entities("office Brno. Section begins here.") + .expect("static detection should succeed"); + + assert!( + trigger_texts(&result).contains(&"Brno"), + "proper-noun sentence tail should stop the address; entities: {:?}", + result.trigger_entities, + ); +} diff --git a/crates/anonymize-py/src/lib.rs b/crates/anonymize-py/src/lib.rs index a9aa52fa..e4e23a47 100644 --- a/crates/anonymize-py/src/lib.rs +++ b/crates/anonymize-py/src/lib.rs @@ -10,12 +10,12 @@ use stella_anonymize_adapter_contract::{ prepared_search_core_package_view_from_bytes, prepared_search_package_from_bytes, prepared_search_package_has_core_payload, static_redaction_diagnostic_result_to_utf16_binding, - static_redaction_diagnostics_to_binding, + static_redaction_diagnostics_to_binding, static_redaction_result_to_binding, static_redaction_result_to_utf16_binding, }; use stella_anonymize_core::{ PreparedSearch as CorePreparedSearch, PreparedSearchArtifacts, - StaticRedactionDiagnostics, + StaticRedactionDiagnostics, StaticRedactionResult, }; #[pyclass(name = "RedactionEntry", get_all, skip_from_py_object)] @@ -143,16 +143,8 @@ impl PyPreparedSearch { full_text: &str, operators_json: Option<&str>, ) -> PyResult { - let operators = parse_operator_config(operators_json)?; - let result = self - .inner - .redact_static_entities( - full_text, - &operator_config_from_binding(operators) - .map_err(|error| to_py_contract_error(&error))?, - ) - .map_err(|error| to_py_core_error(&error))?; - 
static_redaction_result_to_utf16_binding(result, full_text) + let result = self.redact_static_entities_core(full_text, operators_json)?; + static_redaction_result_to_python_binding(result, full_text) .map_err(|error| to_py_contract_error(&error)) .map(to_py_static_redaction_result) } @@ -162,9 +154,10 @@ impl PyPreparedSearch { full_text: &str, operators_json: Option<&str>, ) -> PyResult { - let result = self.redact_static_entities(full_text, operators_json)?; - serde_json::to_string(&to_binding_static_redaction_result(result)) - .map_err(|error| to_py_serde_error(&error)) + let result = self.redact_static_entities_core(full_text, operators_json)?; + let result = static_redaction_result_to_utf16_binding(result, full_text) + .map_err(|error| to_py_contract_error(&error))?; + serde_json::to_string(&result).map_err(|error| to_py_serde_error(&error)) } fn redact_static_entities_diagnostics_json( @@ -192,6 +185,24 @@ impl PyPreparedSearch { } } +impl PyPreparedSearch { + fn redact_static_entities_core( + &self, + full_text: &str, + operators_json: Option<&str>, + ) -> PyResult { + let operators = parse_operator_config(operators_json)?; + self + .inner + .redact_static_entities( + full_text, + &operator_config_from_binding(operators) + .map_err(|error| to_py_contract_error(&error))?, + ) + .map_err(|error| to_py_core_error(&error)) + } +} + #[pyfunction] fn redact_static_entities_json( config_json: &str, @@ -295,6 +306,80 @@ fn parse_operator_config( .map_err(|error| to_py_serde_error(&error)) } +fn static_redaction_result_to_python_binding( + result: StaticRedactionResult, + full_text: &str, +) -> std::result::Result { + let offsets = PythonOffsetMap::new(full_text)?; + let mut result = static_redaction_result_to_binding(result); + convert_pipeline_entity_offsets_to_python( + &mut result.resolved_entities, + &offsets, + )?; + Ok(result) +} + +fn convert_pipeline_entity_offsets_to_python( + entities: &mut [BindingPipelineEntity], + offsets: &PythonOffsetMap, +) -> 
std::result::Result<(), ContractError> { + for entity in entities { + entity.start = offsets.convert(entity.start)?; + entity.end = offsets.convert(entity.end)?; + } + Ok(()) +} + +struct PythonOffsetMap { + boundaries: Vec<(u32, u32)>, +} + +impl PythonOffsetMap { + fn new(text: &str) -> std::result::Result { + let mut boundaries = Vec::new(); + let mut code_point_offset = 0_u32; + boundaries.push((0, 0)); + + for (byte_start, ch) in text.char_indices() { + code_point_offset = + code_point_offset.checked_add(1).ok_or_else(|| { + ContractError::InvalidPreparedSearchPackage { + reason: String::from("Python offset exceeds u32 range"), + } + })?; + let byte_end = byte_start.saturating_add(ch.len_utf8()); + boundaries.push((u32_from_usize(byte_end)?, code_point_offset)); + } + + Ok(Self { boundaries }) + } + + fn convert(&self, offset: u32) -> std::result::Result { + self + .try_convert(offset) + .ok_or(ContractError::InvalidBindingOffset { offset }) + } + + fn try_convert(&self, offset: u32) -> Option { + let index = self + .boundaries + .binary_search_by_key(&offset, |(byte_offset, _)| *byte_offset) + .ok()?; + self + .boundaries + .get(index) + .map(|(_, code_point_offset)| *code_point_offset) + } +} + +fn u32_from_usize(value: usize) -> std::result::Result { + u32::try_from(value).map_err(|_| { + ContractError::InvalidPreparedSearchPackage { + reason: format!("Offset exceeds u32 range: {value}"), + } + }) +} + fn to_py_static_redaction_result( result: BindingStaticRedactionResult, ) -> PyStaticRedactionResult { @@ -351,68 +436,6 @@ fn to_py_operator_entry(entry: BindingOperatorEntry) -> PyOperatorEntry { } } -fn to_binding_static_redaction_result( - result: PyStaticRedactionResult, -) -> BindingStaticRedactionResult { - BindingStaticRedactionResult { - resolved_entities: result - .resolved_entities - .into_iter() - .map(to_binding_pipeline_entity) - .collect(), - redaction: to_binding_redaction_result(result.redaction), - } -} - -fn to_binding_pipeline_entity( - 
entity: PyPipelineEntity, -) -> BindingPipelineEntity { - BindingPipelineEntity { - start: entity.start, - end: entity.end, - label: entity.label, - text: entity.text, - score: entity.score, - source: entity.source, - source_detail: entity.source_detail, - } -} - -fn to_binding_redaction_result( - result: PyRedactionResult, -) -> BindingRedactionResult { - BindingRedactionResult { - redacted_text: result.redacted_text, - redaction_map: result - .redaction_map - .into_iter() - .map(to_binding_redaction_entry) - .collect(), - operator_map: result - .operator_map - .into_iter() - .map(to_binding_operator_entry) - .collect(), - entity_count: result.entity_count, - } -} - -fn to_binding_redaction_entry( - entry: PyRedactionEntry, -) -> BindingRedactionEntry { - BindingRedactionEntry { - placeholder: entry.placeholder, - original: entry.original, - } -} - -fn to_binding_operator_entry(entry: PyOperatorEntry) -> BindingOperatorEntry { - BindingOperatorEntry { - placeholder: entry.placeholder, - operator: entry.operator, - } -} - fn to_py_core_error(error: &stella_anonymize_core::Error) -> PyErr { PyValueError::new_err(error.to_string()) } diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 10b300c5..26127eb3 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -119,6 +119,13 @@ type ContractFixtureCase = { text: string; }; +type PythonNativeOffsetSlice = { + start: number; + end: number; + slice: string; + text: string; +}; + const ROOT_DIR = join(import.meta.dir, "..", "..", "..", ".."); const TARGET_DIR = join(ROOT_DIR, "target", "debug"); const CONTRACT_FIXTURES_DIR = join( @@ -196,6 +203,7 @@ const CONFIG_JSON = JSON.stringify({ generic_roles: [], sentence_starters: [], trailing_address_word_exclusions: [], + document_heading_words: [], defined_term_cues: [], }, }, @@ -234,6 +242,53 @@ 
results = [ print(json.dumps(results)) `; +const PYTHON_NATIVE_OFFSET_SCRIPT = ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +spec = importlib.util.spec_from_file_location( + "stella_anonymize_core_py", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +prepared = module.PreparedSearch(payload["config_json"]) +result = prepared.redact_static_entities( + payload["text"], + payload.get("operators_json"), +) +entity = next( + ( + item + for item in result.resolved_entities + if item.label == payload["label"] + ), + None, +) +if entity is None: + raise AssertionError(f"entity not found: {payload['label']}") +sliced = payload["text"][entity.start:entity.end] +if sliced != payload["expected"]: + raise AssertionError( + f"slice mismatch: {sliced!r} at {entity.start}:{entity.end}" + ) +print( + json.dumps( + { + "start": entity.start, + "end": entity.end, + "slice": sliced, + "text": entity.text, + } + ) +) +`; + const PYTHON_VERSION_SCRIPT = ` import importlib.util import os @@ -533,6 +588,38 @@ describe("native adapter parity", () => { expect(text.slice(registration.start, registration.end)).toBe("AB1234"); }); + test("Python-native offsets slice source text after astral prefixes", () => { + const adapters = getAdapters(); + const text = "🙂 Reference AB1234 for Acme s.r.o."; + + const tsResult = runTsAdapter(adapters.native, text, null); + const registration = tsResult.resolved_entities.find( + (entity) => entity.label === "registration number", + ); + expect(registration).toBeDefined(); + if (!registration) { + return; + } + expect(text.slice(registration.start, registration.end)).toBe("AB1234"); + + const pythonSlice = callPythonNativeOffsetSlice( + adapters.pythonModulePath, + text, + "registration number", + 
"AB1234", + null, + ); + + expect(pythonSlice).toEqual({ + start: 12, + end: 18, + slice: "AB1234", + text: "AB1234", + }); + expect(pythonSlice.start).toBe(registration.start - 1); + expect(pythonSlice.end).toBe(registration.end - 1); + }); + test("prepared search accepts config JSON bytes", () => { const adapters = getAdapters(); const text = @@ -992,6 +1079,38 @@ const runPythonAdapters = ( return JSON.parse(output); }; +const callPythonNativeOffsetSlice = ( + pythonModulePath: string, + text: string, + label: string, + expected: string, + operators: Record | null, +): PythonNativeOffsetSlice => { + const payloadDir = mkdtempSync( + join(tmpdir(), "stella-anonymize-py-offsets-"), + ); + const payloadPath = join(payloadDir, "payload.json"); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + text, + label, + expected, + operators_json: operatorConfigJson(operators), + }), + ); + try { + const output = runCommand("python3", ["-c", PYTHON_NATIVE_OFFSET_SCRIPT], { + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }); + return JSON.parse(output); + } finally { + rmSync(payloadDir, { recursive: true, force: true }); + } +}; + const callPythonNormalize = ( pythonModulePath: string, text: string, diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index bd0b6b1b..ef03b01e 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -128,6 +128,30 @@ describe("pipeline config semantics", () => { expect(search.nativeStaticConfig.threshold).toBe(0.93); }); + test("native config keeps unsupported validator regexes fail-fast", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: true, + labels: ["national identification number"], + }, + [], + createPipelineContext(), + ); + + const patternIndex = 
search.nativeStaticConfig.regex_patterns.findIndex( + (pattern) => + pattern.kind === "regex" && pattern.pattern.includes("\\d{17}"), + ); + expect(patternIndex).toBeGreaterThanOrEqual(0); + const meta = search.nativeStaticConfig.regex_meta.at(patternIndex); + expect(meta).toMatchObject({ + label: "national identification number", + requires_validation: true, + }); + expect(meta?.validator_id).toBeUndefined(); + }); + test("content language scopes deny-list search build", async () => { const testDictionaries = await getDictionaries(); const config = { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index f1b32690..e20279a3 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -133,10 +133,14 @@ export type NativeDenyListFilterData = { address_stopwords: string[]; address_jurisdiction_prefixes: string[]; street_types: string[]; + address_component_terms: string[]; first_names: string[]; generic_roles: string[]; + number_abbrev_prefixes: string[]; sentence_starters: string[]; trailing_address_word_exclusions: string[]; + document_heading_words: string[]; + document_heading_ordinal_markers: string[]; defined_term_cues: string[]; signing_place_guards: NativeSigningPlaceGuardData[]; }; @@ -188,6 +192,7 @@ export type NativeTriggerData = { rules: NativeTriggerRule[]; address_stop_keywords: string[]; party_position_terms: string[]; + sentence_terminal_currency_terms: string[]; }; export type NativeLegalFormData = { @@ -416,7 +421,8 @@ const buildUnifiedSearchSources = async ( config.enableRegex && labelIsAllowed("date", allowedLabels) ? getYearWordData() : Promise.resolve(null), - config.enableRegex && labelIsAllowed("monetary amount", allowedLabels) + config.enableTriggerPhrases || + (config.enableRegex && labelIsAllowed("monetary amount", allowedLabels)) ? 
getMonetaryData() : Promise.resolve(null), labelIsAllowed("address", allowedLabels) @@ -893,7 +899,7 @@ const buildNativeStaticConfig = ({ continue; } const meta = regexMeta[index]; - if (!meta || !nativeSupportsRegexMeta(meta)) { + if (!meta) { continue; } nativeRegexPatterns.push(toNativeRegexPattern(pattern)); @@ -1039,6 +1045,8 @@ const buildNativeStaticConfig = ({ rules: triggerRules.map(toNativeTriggerRule), address_stop_keywords: [...getAddressStopKeywordsSync()], party_position_terms: [...partyPositionTerms], + sentence_terminal_currency_terms: + sentenceTerminalCurrencyTerms(monetaryData), }; } if (legalFormData) { @@ -1272,11 +1280,12 @@ const toNativeRegexMeta = (meta: RegexMeta): NativeRegexMatchMeta => { result.source_detail = meta.sourceDetail; } if (meta.validator) { + const isSupportedValidator = nativeSupportsRegexMeta(meta); result.requires_validation = true; - if (meta.validatorId) { + if (isSupportedValidator && meta.validatorId) { result.validator_id = meta.validatorId; } - if (meta.validatorInputKind) { + if (isSupportedValidator && meta.validatorInputKind) { result.validator_input = meta.validatorInputKind; } } @@ -1320,6 +1329,23 @@ const toNativeDenyListData = (data: DenyListData): NativeDenyListMatchData => { return result; }; +const sentenceTerminalCurrencyTerms = ( + monetaryData: NativeMonetaryData | null, +): string[] => { + if (monetaryData === null) { + return []; + } + return [ + ...new Set( + [ + ...monetaryData.currencies.codes, + ...monetaryData.currencies.symbols, + ...monetaryData.currencies.local_names, + ].filter((term) => term.length > 0), + ), + ].toSorted(); +}; + const createStringGroupEncoder = (): { table: string[]; encode: (values: string | readonly string[] | undefined) => number[]; @@ -1364,10 +1390,14 @@ const toNativeDenyListFilters = ( address_stopwords: filters.addressStopwords, address_jurisdiction_prefixes: filters.addressJurisdictionPrefixes, street_types: filters.streetTypes, + address_component_terms: 
filters.addressComponentTerms, first_names: filters.firstNames, generic_roles: filters.genericRoles, + number_abbrev_prefixes: filters.numberAbbrevPrefixes, sentence_starters: filters.sentenceStarters, trailing_address_word_exclusions: filters.trailingAddressWordExclusions, + document_heading_words: filters.documentHeadingWords, + document_heading_ordinal_markers: filters.documentHeadingOrdinalMarkers, defined_term_cues: filters.definedTermCues, signing_place_guards: filters.signingPlaceGuards.map((entry) => ({ prefix_phrases: entry.prefixPhrases, diff --git a/packages/anonymize/src/data/false-positive-shapes.json b/packages/anonymize/src/data/false-positive-shapes.json new file mode 100644 index 00000000..49552bab --- /dev/null +++ b/packages/anonymize/src/data/false-positive-shapes.json @@ -0,0 +1,17 @@ +{ + "_comment": "Language-keyed lexical markers used by false-positive shape guards.", + "addressComponentTerms": { + "cs": ["č.p.", "č.ev.", "č.", "sídliště"] + }, + "numberAbbrevPrefixes": { + "cs": ["čís.", "č."], + "de": ["nr."], + "en": ["no.", "n."] + }, + "documentHeadingOrdinalMarkers": { + "cs": ["č.", "č"], + "de": ["nr.", "nr"], + "en": ["no.", "no", "n.", "n"], + "global": ["#"] + } +} diff --git a/packages/anonymize/src/detectors/deny-list.ts b/packages/anonymize/src/detectors/deny-list.ts index 2f7a64bf..524baa44 100644 --- a/packages/anonymize/src/detectors/deny-list.ts +++ b/packages/anonymize/src/detectors/deny-list.ts @@ -492,6 +492,12 @@ type DenyListLanguageFilters = { definedTermCues?: readonly string[]; }; +type FalsePositiveShapeFilters = { + addressComponentTerms: string[]; + numberAbbrevPrefixes: string[]; + documentHeadingOrdinalMarkers: string[]; +}; + type SigningClauseData = { patterns: readonly { guardPrefixPhrases?: readonly string[]; @@ -507,10 +513,14 @@ export type DenyListFilterData = { addressStopwords: string[]; addressJurisdictionPrefixes: string[]; streetTypes: string[]; + addressComponentTerms: string[]; firstNames: 
string[]; genericRoles: string[]; + numberAbbrevPrefixes: string[]; sentenceStarters: string[]; trailingAddressWordExclusions: string[]; + documentHeadingWords: string[]; + documentHeadingOrdinalMarkers: string[]; definedTermCues: string[]; signingPlaceGuards: DenyListSigningPlaceGuardData[]; }; @@ -1124,6 +1134,7 @@ const loadSigningPlaceFilters = (): Promise => { let trailingAddressWordExclusionsPromise: Promise> | null = null; +let documentHeadingWordsPromise: Promise | null = null; let addressJurisdictionPrefixesPromise: Promise | null = null; const loadLanguageWordFile = async ( @@ -1134,6 +1145,62 @@ const loadLanguageWordFile = async ( return collectLanguageWordValues(parsed as Record); }; +const isRecord = (value: unknown): value is Record => + typeof value === "object" && value !== null && !Array.isArray(value); + +const languageWordValues = (value: unknown): string[] => + isRecord(value) ? collectLanguageWordValues(value) : []; + +let falsePositiveShapeFiltersPromise: Promise | null = + null; + +const loadFalsePositiveShapeFilters = + (): Promise => { + if (falsePositiveShapeFiltersPromise) { + return falsePositiveShapeFiltersPromise; + } + + falsePositiveShapeFiltersPromise = (async () => { + const mod = await import("../data/false-positive-shapes.json"); + const defaultValue = isRecord(mod) ? 
mod.default : undefined; + let data: Record = {}; + if (isRecord(defaultValue)) { + data = defaultValue; + } else if (isRecord(mod)) { + data = mod; + } + return { + addressComponentTerms: languageWordValues( + data["addressComponentTerms"], + ), + numberAbbrevPrefixes: languageWordValues(data["numberAbbrevPrefixes"]), + documentHeadingOrdinalMarkers: languageWordValues( + data["documentHeadingOrdinalMarkers"], + ), + }; + })().catch((error) => { + falsePositiveShapeFiltersPromise = null; + throw error; + }); + + return falsePositiveShapeFiltersPromise; + }; + +const loadDocumentHeadingWords = (): Promise => { + if (documentHeadingWordsPromise) { + return documentHeadingWordsPromise; + } + + documentHeadingWordsPromise = loadLanguageWordFile( + () => import("../data/document-structure-headings.json"), + ).catch((error) => { + documentHeadingWordsPromise = null; + throw error; + }); + + return documentHeadingWordsPromise; +}; + const loadTrailingAddressWordExclusions = async (): Promise< ReadonlySet > => { @@ -1147,9 +1214,7 @@ const loadTrailingAddressWordExclusions = async (): Promise< loadLanguageWordFile( () => import("../data/organization-unit-heads.json"), ), - loadLanguageWordFile( - () => import("../data/document-structure-headings.json"), - ), + loadDocumentHeadingWords(), ]); return new Set( lowerSortedUnique([ @@ -1189,10 +1254,14 @@ const buildDenyListFilterData = async ( signingPlaceFilters, trailingAddressWordExclusions, addressJurisdictionPrefixes, + falsePositiveShapeFilters, + documentHeadingWords, ] = await Promise.all([ loadSigningPlaceFilters(), loadTrailingAddressWordExclusions(), loadAddressJurisdictionPrefixes(), + loadFalsePositiveShapeFilters(), + loadDocumentHeadingWords(), ]); return { @@ -1203,13 +1272,18 @@ const buildDenyListFilterData = async ( addressStopwords: [...getAddressStopwords(ctx)], addressJurisdictionPrefixes, streetTypes: await buildStreetTypeFilterValues(), + addressComponentTerms: 
falsePositiveShapeFilters.addressComponentTerms, firstNames: [...getNameCorpusFirstNames(ctx)], genericRoles: [ ...(ctx.genericRoles ?? EMPTY_GENERIC_ROLES), ...getLegalRoleHeadsSync(), ], + numberAbbrevPrefixes: falsePositiveShapeFilters.numberAbbrevPrefixes, sentenceStarters: [...DENY_LIST_STATIC_FILTERS.sentenceStarters], trailingAddressWordExclusions: [...trailingAddressWordExclusions], + documentHeadingWords, + documentHeadingOrdinalMarkers: + falsePositiveShapeFilters.documentHeadingOrdinalMarkers, definedTermCues: [...DENY_LIST_STATIC_FILTERS.definedTermCues], signingPlaceGuards: signingPlaceFilters.guards.map((entry) => ({ prefixPhrases: [...entry.prefixPhrases], diff --git a/packages/anonymize/src/detectors/regex.ts b/packages/anonymize/src/detectors/regex.ts index a3cc9227..384c9c0e 100644 --- a/packages/anonymize/src/detectors/regex.ts +++ b/packages/anonymize/src/detectors/regex.ts @@ -282,18 +282,30 @@ const VALIDATOR_IDS = new Map([ export const NATIVE_REGEX_VALIDATOR_IDS: ReadonlySet = new Set([ "au.abn", + "au.acn", + "at.tin", "br.cnpj", "br.cpf", "cz.dic", "cz.rc", + "de.idnr", + "de.stnr", + "dk.cpr", "es.cif", "es.dni", "es.nie", + "es.nss", + "fi.hetu", + "fi.vat", + "fi.ytunnus", + "fr.siren", "gb.nhs", "gb.nino", + "ie.pps", "no.mva", "no.orgnr", "us.ein", + "us.rtn", ]); // ── stdnum validator entries ──────────────────────── From 9dce65284d4fec7b887afa4c37ab57eed6d3c526 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 22:38:50 +0200 Subject: [PATCH 049/130] fix: address native review edges --- .github/tools/check-packlist.mjs | 1 + crates/anonymize-adapter-contract/src/lib.rs | 3 + crates/anonymize-core/src/false_positives.rs | 75 ++++++++++++++++++- crates/anonymize-core/src/prepared.rs | 24 +++++- crates/anonymize-core/src/processors.rs | 1 + crates/anonymize-core/src/triggers.rs | 30 ++++++-- .../tests/false_positive_parity.rs | 45 +++++++++++ crates/anonymize-core/tests/prepared.rs | 15 ++++ 
crates/anonymize-core/tests/trigger_parity.rs | 33 ++++++++ packages/anonymize/index.cjs | 3 + packages/anonymize/package.json | 4 +- .../__test__/native-adapter-parity.test.ts | 1 + .../src/__test__/native-node.test.ts | 50 +++++++++++-- .../anonymize/src/build-unified-search.ts | 2 + .../src/data/false-positive-shapes.json | 3 + packages/anonymize/src/detectors/deny-list.ts | 7 ++ packages/anonymize/src/native-node.ts | 12 +-- 17 files changed, 284 insertions(+), 25 deletions(-) create mode 100644 packages/anonymize/index.cjs diff --git a/.github/tools/check-packlist.mjs b/.github/tools/check-packlist.mjs index 70229370..0694c5a7 100644 --- a/.github/tools/check-packlist.mjs +++ b/.github/tools/check-packlist.mjs @@ -11,6 +11,7 @@ const PACKAGES = [ "dist/native.mjs", "dist/native-node.d.mts", "dist/native-node.mjs", + "index.cjs", // Dynamically imported corpus chunk; missing means the // bundler stopped resolving the non-Western name imports. "dist/names-nw-in.mjs", diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 4f7ad12f..8c0d5dd4 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -356,6 +356,8 @@ pub struct BindingDenyListFilterData { pub street_types: Vec, #[serde(default)] pub address_component_terms: Vec, + #[serde(default)] + pub ambiguous_street_type_terms: Vec, pub first_names: Vec, pub generic_roles: Vec, #[serde(default)] @@ -1697,6 +1699,7 @@ fn deny_list_filters_from_binding( ), street_types: lower_set(filters.street_types), address_component_terms: lower_set(filters.address_component_terms), + ambiguous_street_type_terms: lower_set(filters.ambiguous_street_type_terms), first_names: lower_set(filters.first_names), generic_roles: lower_set(filters.generic_roles), number_abbrev_prefixes: lower_set(filters.number_abbrev_prefixes), diff --git a/crates/anonymize-core/src/false_positives.rs 
b/crates/anonymize-core/src/false_positives.rs index f86b45ce..ef0acaf4 100644 --- a/crates/anonymize-core/src/false_positives.rs +++ b/crates/anonymize-core/src/false_positives.rs @@ -185,9 +185,14 @@ fn should_reject_address( if filters.is_some_and(|filters| is_jurisdiction_address(text, filters)) { return false; } - if entity.source == DetectionSource::Trigger && !has_digits && !has_component - { - return true; + if entity.source == DetectionSource::Trigger && !has_digits { + if filters.is_some_and(|filters| is_only_ambiguous_component(text, filters)) + { + return true; + } + if !has_component { + return true; + } } text.chars().count() > 40 @@ -593,6 +598,70 @@ fn has_address_component(text: &str, filters: &DenyListFilterData) -> bool { .any(|component| contains_component(&lower, component)) } +fn is_only_ambiguous_component( + text: &str, + filters: &DenyListFilterData, +) -> bool { + filters + .ambiguous_street_type_terms + .iter() + .any(|term| is_only_ambiguous_component_term(text, filters, term)) +} + +fn is_only_ambiguous_component_term( + text: &str, + filters: &DenyListFilterData, + term: &str, +) -> bool { + if term.is_empty() { + return false; + } + let Some((start, end)) = find_ambiguous_component_occurrence(text, term) + else { + return false; + }; + if text + .get(end..) + .is_some_and(starts_with_capitalized_token_after_space) + { + return false; + } + let mut stripped = String::with_capacity(text.len()); + stripped.push_str(text.get(..start).unwrap_or_default()); + stripped.push(' '); + stripped.push_str(text.get(end..).unwrap_or_default()); + !has_address_component(&stripped, filters) +} + +fn find_ambiguous_component_occurrence( + text: &str, + term: &str, +) -> Option<(usize, usize)> { + text.match_indices(term).find_map(|(start, _)| { + let end = start.saturating_add(term.len()); + let left_ok = text + .get(..start) + .and_then(|prefix| prefix.chars().next_back()) + .is_none_or(is_left_component_boundary); + let right_ok = text + .get(end..) 
+ .and_then(|suffix| suffix.chars().next()) + .is_none_or(is_right_component_boundary); + (left_ok && right_ok).then_some((start, end)) + }) +} + +fn starts_with_capitalized_token_after_space(text: &str) -> bool { + let leading = leading_whitespace_len(text); + if leading == 0 { + return false; + } + text + .get(leading..) + .and_then(|tail| tail.chars().next()) + .is_some_and(char::is_uppercase) +} + fn is_jurisdiction_address(text: &str, filters: &DenyListFilterData) -> bool { let lower = text.to_lowercase(); filters.address_jurisdiction_prefixes.iter().any(|prefix| { diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 579e5825..bcc65df4 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -9,7 +9,9 @@ use crate::legal_forms::{ LegalFormData, PreparedLegalFormData, process_legal_form_matches, }; use crate::money::{MonetaryData, PreparedMonetaryData}; -use crate::normalize::normalize_for_search_with_byte_map; +use crate::normalize::{ + NormalizedSearchText, normalize_for_search_with_byte_map, +}; use crate::processors::{ CountryMatchData, DenyListMatchData, GazetteerMatchData, PatternSlice, RegexMatchMeta, ensure_supported_deny_list_sources, process_country_matches, @@ -464,8 +466,9 @@ impl PreparedSearch { } let legal_form_start = Instant::now(); - let legal_forms = offset_matches( - self.legal_forms.find_iter(full_text)?, + let legal_forms = normalized_offset_matches( + &self.legal_forms, + &normalized, self.slices.legal_forms.start, )?; if let Some(diagnostics) = &mut diagnostics { @@ -1248,6 +1251,19 @@ fn offset_matches( .collect() } +fn normalized_offset_matches( + search: &SearchIndex, + normalized: &NormalizedSearchText, + offset: u32, +) -> Result> { + search + .find_iter(normalized.as_str())? 
+ .into_iter() + .map(|found| remap_normalized_match(normalized, found)) + .map(|found| found.and_then(|value| offset_match(value, offset))) + .collect() +} + fn offset_match(found: SearchMatch, offset: u32) -> Result { let pattern = found.pattern().checked_add(offset).ok_or_else(|| { Error::PatternIndexNotAddressable { @@ -1302,7 +1318,7 @@ fn sort_matches(matches: &mut [SearchMatch]) { } fn remap_normalized_match( - normalized: &crate::normalize::NormalizedSearchText, + normalized: &NormalizedSearchText, found: SearchMatch, ) -> Result { let (start, end) = normalized.map_span(found.start(), found.end())?; diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index 66562afe..4480a4a2 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -255,6 +255,7 @@ pub struct DenyListFilterData { pub address_jurisdiction_prefixes: BTreeSet, pub street_types: BTreeSet, pub address_component_terms: BTreeSet, + pub ambiguous_street_type_terms: BTreeSet, pub first_names: BTreeSet, pub generic_roles: BTreeSet, pub number_abbrev_prefixes: BTreeSet, diff --git a/crates/anonymize-core/src/triggers.rs b/crates/anonymize-core/src/triggers.rs index 9e969e8c..2e05af0f 100644 --- a/crates/anonymize-core/src/triggers.rs +++ b/crates/anonymize-core/src/triggers.rs @@ -904,16 +904,36 @@ fn hits_stop_word(text: &str, byte: usize, stop_words: &[String]) -> bool { return false; }; stop_words.iter().any(|word| { - tail - .get(..word.len()) - .is_some_and(|head| head.eq_ignore_ascii_case(word)) - && tail - .get(word.len()..) + unicode_case_prefix_len(tail, word).is_some_and(|word_len| { + tail + .get(word_len..) 
.and_then(|after| after.chars().next()) .is_none_or(|ch| !ch.is_alphanumeric()) + }) }) } +fn unicode_case_prefix_len(text: &str, prefix: &str) -> Option { + if prefix.is_empty() { + return None; + } + let prefix_chars = prefix.chars().count(); + let mut end = 0usize; + let mut count = 0usize; + for (index, ch) in text.char_indices() { + if count == prefix_chars { + break; + } + count = count.saturating_add(1); + end = index.saturating_add(ch.len_utf8()); + } + if count != prefix_chars { + return None; + } + let candidate = text.get(..end)?; + (candidate.to_lowercase() == prefix.to_lowercase()).then_some(end) +} + fn is_decimal_comma(text: &str) -> bool { let mut chars = text.chars(); if chars.next() != Some(',') { diff --git a/crates/anonymize-core/tests/false_positive_parity.rs b/crates/anonymize-core/tests/false_positive_parity.rs index f38695b3..ead8c86f 100644 --- a/crates/anonymize-core/tests/false_positive_parity.rs +++ b/crates/anonymize-core/tests/false_positive_parity.rs @@ -146,3 +146,48 @@ fn rejects_document_structure_heading_organizations() { [String::from("Acme No. 
4")] ); } + +#[test] +fn rejects_only_ambiguous_street_type_trigger_addresses() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("demeurant"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("demeurant"), + label: String::from("address"), + strategy: TriggerStrategy::Address { + max_chars: Some(80), + }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + deny_list_data: Some(empty_deny_list_data(DenyListFilterData { + street_types: set(["cours"]), + ambiguous_street_type_terms: set(["cours"]), + ..DenyListFilterData::default() + })), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + assert!( + resolved_texts(&prepared, "demeurant au cours du contrat.").is_empty() + ); + assert_eq!( + resolved_texts(&prepared, "demeurant Cours Mirabeau."), + [String::from("Cours Mirabeau")] + ); +} diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 02a273be..34a66463 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -69,7 +69,9 @@ fn legal_form_prepared_search(suffixes: Vec<&str>) -> PreparedSearch { String::from("as"), String::from("co"), String::from("inc"), + String::from("ltd"), String::from("llc"), + String::from("pty"), String::from("sro"), ], normalized_in_name_words: vec![String::from("co")], @@ -77,7 +79,9 @@ fn legal_form_prepared_search(suffixes: Vec<&str>) -> PreparedSearch { String::from("as"), String::from("co"), String::from("inc"), + String::from("ltd"), 
String::from("llc"), + String::from("pty"), String::from("sro"), ], connector_words: vec![ @@ -100,6 +104,17 @@ fn legal_form_prepared_search(suffixes: Vec<&str>) -> PreparedSearch { .unwrap() } +#[test] +fn prepared_search_runs_legal_form_pass_on_normalized_text() { + let prepared = legal_form_prepared_search(vec!["Pty Ltd"]); + let result = prepared + .detect_static_entities("Acme Pty\u{00a0}Ltd signed the agreement.") + .unwrap(); + + assert_eq!(result.legal_form_entities.len(), 1); + assert_eq!(result.legal_form_entities[0].text, "Acme Pty\u{00a0}Ltd"); +} + #[test] fn prepared_search_runs_normalized_literal_pass() { let prepared = PreparedSearch::new(PreparedSearchConfig { diff --git a/crates/anonymize-core/tests/trigger_parity.rs b/crates/anonymize-core/tests/trigger_parity.rs index efa3ef25..05a70776 100644 --- a/crates/anonymize-core/tests/trigger_parity.rs +++ b/crates/anonymize-core/tests/trigger_parity.rs @@ -149,6 +149,39 @@ fn to_next_comma_stops_after_short_currency_abbreviation_sentence_tail() { ); } +#[test] +fn to_next_comma_stops_on_unicode_case_stop_words() { + let prepared = prepared_for_trigger( + "court", + "organization", + TriggerStrategy::ToNextComma { + stop_words: vec![String::from("dňa")], + max_length: Some(100), + }, + ); + + let result = prepared + .detect_static_entities("court Okresný súd DŇA 1.1.2025, other text.") + .expect("static detection should succeed"); + + assert_eq!(trigger_texts(&result), [String::from("Okresný súd")]); +} + +#[test] +fn company_id_trigger_rejects_single_digit_dotted_date() { + let prepared = prepared_for_trigger( + "DNI", + "national identification number", + TriggerStrategy::CompanyIdValue, + ); + + let result = prepared + .detect_static_entities("DNI 6.11.2025") + .expect("static detection should succeed"); + + assert!(result.trigger_entities.is_empty()); +} + #[test] fn address_trigger_stops_after_short_proper_noun_before_real_sentence() { let prepared = prepared_for_trigger( diff --git 
a/packages/anonymize/index.cjs b/packages/anonymize/index.cjs new file mode 100644 index 00000000..9636dfab --- /dev/null +++ b/packages/anonymize/index.cjs @@ -0,0 +1,3 @@ +"use strict"; + +module.exports = require("./stella_anonymize_napi.node"); diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index 8f63448b..bd0a92cd 100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -27,7 +27,9 @@ }, "types": "dist/index.d.mts", "files": [ - "dist" + "dist", + "index.cjs", + "*.node" ], "publishConfig": { "access": "public" diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 26127eb3..6d00eddd 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -199,6 +199,7 @@ const CONFIG_JSON = JSON.stringify({ person_stopwords: [], address_stopwords: [], street_types: [], + ambiguous_street_type_terms: [], first_names: [], generic_roles: [], sentence_starters: [], diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index dd037b69..8d10ed12 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -72,6 +72,28 @@ describe("native node loader", () => { expect(calls).toEqual(["/tmp/anonymize.node"]); }); + test("accepts a napi class constructor on the native binding", () => { + const calls: string[] = []; + const binding = fakeNativeBinding("1.5.0", { + preparedSearchAsConstructor: true, + }); + const loaded = loadNativeAnonymizeBinding({ + expectedVersion: "1.5.0", + platform: "darwin", + arch: "arm64", + env: {}, + requireModule: (specifier) => { + calls.push(specifier); + if (specifier === "@stll/anonymize-darwin-arm64") { + return binding; + } + throw new Error("not found"); + }, + }); + + 
expect(loaded).toBe(binding); + }); + test("rejects mismatched native binding versions", () => { expect(() => loadNativeAnonymizeBinding({ @@ -90,15 +112,29 @@ describe("native node loader", () => { }); }); -const fakeNativeBinding = (version: string): NativeAnonymizeBinding => ({ - nativePackageVersion: () => version, - NativePreparedSearch: { +type FakeNativeBindingOptions = { + preparedSearchAsConstructor?: boolean; +}; + +const fakeNativeBinding = ( + version: string, + options: FakeNativeBindingOptions = {}, +): NativeAnonymizeBinding => { + const preparedSearch = { fromConfigJsonBytes: () => fakePreparedSearch(), fromPreparedPackageBytes: () => fakePreparedSearch(), - }, - prepareStaticSearchPackageBytes: () => new Uint8Array(), - prepareStaticSearchCompressedPackageBytes: () => new Uint8Array(), -}); + }; + const NativePreparedSearch = options.preparedSearchAsConstructor + ? Object.assign(function NativePreparedSearch() {}, preparedSearch) + : preparedSearch; + + return { + nativePackageVersion: () => version, + NativePreparedSearch, + prepareStaticSearchPackageBytes: () => new Uint8Array(), + prepareStaticSearchCompressedPackageBytes: () => new Uint8Array(), + }; +}; const fakePreparedSearch = () => ({ redactStaticEntities: () => ({ diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index e20279a3..156affc4 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -134,6 +134,7 @@ export type NativeDenyListFilterData = { address_jurisdiction_prefixes: string[]; street_types: string[]; address_component_terms: string[]; + ambiguous_street_type_terms: string[]; first_names: string[]; generic_roles: string[]; number_abbrev_prefixes: string[]; @@ -1391,6 +1392,7 @@ const toNativeDenyListFilters = ( address_jurisdiction_prefixes: filters.addressJurisdictionPrefixes, street_types: filters.streetTypes, address_component_terms: 
filters.addressComponentTerms, + ambiguous_street_type_terms: filters.ambiguousStreetTypeTerms, first_names: filters.firstNames, generic_roles: filters.genericRoles, number_abbrev_prefixes: filters.numberAbbrevPrefixes, diff --git a/packages/anonymize/src/data/false-positive-shapes.json b/packages/anonymize/src/data/false-positive-shapes.json index 49552bab..e30eb764 100644 --- a/packages/anonymize/src/data/false-positive-shapes.json +++ b/packages/anonymize/src/data/false-positive-shapes.json @@ -3,6 +3,9 @@ "addressComponentTerms": { "cs": ["č.p.", "č.ev.", "č.", "sídliště"] }, + "ambiguousStreetTypeTerms": { + "fr": ["cours"] + }, "numberAbbrevPrefixes": { "cs": ["čís.", "č."], "de": ["nr."], diff --git a/packages/anonymize/src/detectors/deny-list.ts b/packages/anonymize/src/detectors/deny-list.ts index 524baa44..895d52be 100644 --- a/packages/anonymize/src/detectors/deny-list.ts +++ b/packages/anonymize/src/detectors/deny-list.ts @@ -494,6 +494,7 @@ type DenyListLanguageFilters = { type FalsePositiveShapeFilters = { addressComponentTerms: string[]; + ambiguousStreetTypeTerms: string[]; numberAbbrevPrefixes: string[]; documentHeadingOrdinalMarkers: string[]; }; @@ -514,6 +515,7 @@ export type DenyListFilterData = { addressJurisdictionPrefixes: string[]; streetTypes: string[]; addressComponentTerms: string[]; + ambiguousStreetTypeTerms: string[]; firstNames: string[]; genericRoles: string[]; numberAbbrevPrefixes: string[]; @@ -1173,6 +1175,9 @@ const loadFalsePositiveShapeFilters = addressComponentTerms: languageWordValues( data["addressComponentTerms"], ), + ambiguousStreetTypeTerms: languageWordValues( + data["ambiguousStreetTypeTerms"], + ), numberAbbrevPrefixes: languageWordValues(data["numberAbbrevPrefixes"]), documentHeadingOrdinalMarkers: languageWordValues( data["documentHeadingOrdinalMarkers"], @@ -1273,6 +1278,8 @@ const buildDenyListFilterData = async ( addressJurisdictionPrefixes, streetTypes: await buildStreetTypeFilterValues(), 
addressComponentTerms: falsePositiveShapeFilters.addressComponentTerms, + ambiguousStreetTypeTerms: + falsePositiveShapeFilters.ambiguousStreetTypeTerms, firstNames: [...getNameCorpusFirstNames(ctx)], genericRoles: [ ...(ctx.genericRoles ?? EMPTY_GENERIC_ROLES), diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts index f5b45408..4a022dde 100644 --- a/packages/anonymize/src/native-node.ts +++ b/packages/anonymize/src/native-node.ts @@ -151,14 +151,16 @@ const toNativeAnonymizeBinding = ( value: unknown, ): NativeAnonymizeBinding | null => { const candidate = - isRecord(value) && isRecord(value["default"]) ? value["default"] : value; + isPropertyBag(value) && isPropertyBag(value["default"]) + ? value["default"] + : value; return isNativeAnonymizeBinding(candidate) ? candidate : null; }; const isNativeAnonymizeBinding = ( candidate: unknown, ): candidate is NativeAnonymizeBinding => { - if (!isRecord(candidate)) { + if (!isPropertyBag(candidate)) { return false; } if (typeof candidate["nativePackageVersion"] !== "function") { @@ -173,7 +175,7 @@ const isNativeAnonymizeBinding = ( return false; } const preparedSearch = candidate["NativePreparedSearch"]; - if (!isRecord(preparedSearch)) { + if (!isPropertyBag(preparedSearch)) { return false; } if (typeof preparedSearch["fromConfigJsonBytes"] !== "function") { @@ -185,8 +187,8 @@ const isNativeAnonymizeBinding = ( return true; }; -const isRecord = (value: unknown): value is Record => - typeof value === "object" && value !== null; +const isPropertyBag = (value: unknown): value is Record => + (typeof value === "object" && value !== null) || typeof value === "function"; const formatLoadError = (error: unknown): string => { if (error instanceof Error) { From 8b574002da8cbd35b5859d2a11b08a5ab799fe6d Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 22:44:25 +0200 Subject: [PATCH 050/130] fix: mirror false-positive shape data --- .../data/config/false-positive-shapes.json | 
20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 packages/data/config/false-positive-shapes.json diff --git a/packages/data/config/false-positive-shapes.json b/packages/data/config/false-positive-shapes.json new file mode 100644 index 00000000..e30eb764 --- /dev/null +++ b/packages/data/config/false-positive-shapes.json @@ -0,0 +1,20 @@ +{ + "_comment": "Language-keyed lexical markers used by false-positive shape guards.", + "addressComponentTerms": { + "cs": ["č.p.", "č.ev.", "č.", "sídliště"] + }, + "ambiguousStreetTypeTerms": { + "fr": ["cours"] + }, + "numberAbbrevPrefixes": { + "cs": ["čís.", "č."], + "de": ["nr."], + "en": ["no.", "n."] + }, + "documentHeadingOrdinalMarkers": { + "cs": ["č.", "č"], + "de": ["nr.", "nr"], + "en": ["no.", "no", "n.", "n"], + "global": ["#"] + } +} From 0eb7bb8308698ffbaf8152173d877221ed27f84e Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 23:02:42 +0200 Subject: [PATCH 051/130] feat: expose native pipeline package path --- .../__test__/native-adapter-parity.test.ts | 153 +++++++++++++++- packages/anonymize/src/index-shared.ts | 16 ++ packages/anonymize/src/native-pipeline.ts | 173 ++++++++++++++++++ 3 files changed, 341 insertions(+), 1 deletion(-) create mode 100644 packages/anonymize/src/native-pipeline.ts diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 6d00eddd..da4fd3ec 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -22,7 +22,22 @@ import { type NativePreparedSearchBinding, type NativeStaticRedactionResult, } from "../native"; -import { createPipelineContext, preparePipelineSearch } from "../index"; +import type { + Entity, + OperatorConfig, + PipelineConfig, + RedactionResult, +} from "../types"; +import { + createPipelineContext, + createNativePipelineFromPackage, + 
DEFAULT_ENTITY_LABELS, + getNativePipelineCompatibility, + preparePipelineSearch, + prepareNativePipelinePackage, + redactText, + runPipeline, +} from "../index"; import { applyPipelineLanguageScope } from "../language-scope"; import { contractTestConfig } from "./contract-config"; import { loadTestDictionaries } from "./load-dictionaries"; @@ -813,6 +828,117 @@ describe("native adapter parity", () => { expect(result.redaction.redactedText).toContain("***"); }); + test("native pipeline package matches TS static pipeline redaction", async () => { + const adapters = getAdapters(); + const fullText = + "Project Nebula and Blue Harbour signed MAT-123 on 2024-01-02. " + + "Acme s.r.o.\n/s/ Jane Doe"; + const config: PipelineConfig = { + threshold: 0.3, + enableTriggerPhrases: true, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: false, + enableDenyList: true, + customDenyList: [ + { + value: "Project Nebula", + label: "organization", + variants: ["Nebula Programme"], + }, + ], + customRegexes: [ + { pattern: "\\bMAT-\\d{3}\\b", label: "matter id", score: 1 }, + ], + enableGazetteer: true, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + labels: [...DEFAULT_ENTITY_LABELS, "matter id"], + workspaceId: "native-pipeline-static-test", + }; + const gazetteerEntries = [ + { + id: "blue-harbour", + canonical: "Blue Harbor Capital", + label: "organization", + variants: ["Blue Harbour"], + workspaceId: "native-pipeline-static-test", + createdAt: 0, + source: "manual" as const, + }, + ]; + const operators: NativeOperatorConfig & OperatorConfig = { + operators: { "matter id": "redact" }, + redactString: "***", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + gazetteerEntries, + context: 
createPipelineContext(), + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries, + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + + test("native pipeline compatibility rejects TS-only contextual passes", () => { + const config: PipelineConfig = { + threshold: 0.3, + enableTriggerPhrases: true, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: true, + enableDenyList: true, + enableGazetteer: false, + enableNer: true, + enableConfidenceBoost: true, + enableCoreference: true, + enableHotwordRules: true, + enableZoneClassification: true, + labels: [...DEFAULT_ENTITY_LABELS], + workspaceId: "native-pipeline-compat-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "unsupported", + unsupportedFeatures: [ + "enableNer", + "enableNameCorpus", + "enableConfidenceBoost", + "enableCoreference", + "enableZoneClassification", + "enableHotwordRules", + ], + }); + }); + test("native facade and Python match on contract fixture packages", async () => { const adapters = getAdapters(); for (const language of CONTRACT_FIXTURE_LANGUAGES) { @@ -1329,6 +1455,31 @@ const toNativeFacadeEntity = ({ ...(sourceDetail ? { sourceDetail } : {}), }); +const toBindingEntity = ( + entity: Entity, +): StaticRedactionResult["resolved_entities"][number] => ({ + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: entity.source, + source_detail: entity.sourceDetail ?? 
null, +}); + +const toBindingRedactionResult = ( + result: RedactionResult, +): StaticRedactionResult["redaction"] => ({ + redacted_text: result.redactedText, + redaction_map: [...result.redactionMap.entries()].map( + ([placeholder, original]) => ({ placeholder, original }), + ), + operator_map: [...result.operatorMap.entries()].map( + ([placeholder, operator]) => ({ placeholder, operator }), + ), + entity_count: result.entityCount, +}); + const toBindingStaticResult = ( result: NativeStaticRedactionResult, ): StaticRedactionResult => ({ diff --git a/packages/anonymize/src/index-shared.ts b/packages/anonymize/src/index-shared.ts index ac2a3ad3..4befc7a8 100644 --- a/packages/anonymize/src/index-shared.ts +++ b/packages/anonymize/src/index-shared.ts @@ -71,6 +71,22 @@ export type { NativeSearchPackageOptions, NativeStaticRedactionResult, } from "./native"; +export { + PreparedNativePipeline, + assertNativePipelineSupported, + createNativePipelineFromConfig, + createNativePipelineFromPackage, + getNativePipelineCompatibility, + prepareNativePipelineConfig, + prepareNativePipelinePackage, +} from "./native-pipeline"; +export type { + NativePipelineBuildOptions, + NativePipelineCompatibility, + NativePipelineFromPackageOptions, + NativePipelinePackageOptions, + NativePipelineUnsupportedFeature, +} from "./native-pipeline"; // ── Redaction ───────────────────────────────────── export { diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts new file mode 100644 index 00000000..44743c2a --- /dev/null +++ b/packages/anonymize/src/native-pipeline.ts @@ -0,0 +1,173 @@ +import { + buildNativeStaticSearchBundle, + type NativePreparedSearchConfig, +} from "./build-unified-search"; +import type { PipelineContext } from "./context"; +import { defaultContext } from "./context"; +import type { GazetteerEntry, PipelineConfig } from "./types"; +import { + createNativeAnonymizerFromConfig, + createNativeAnonymizerFromPackage, + 
prepareNativeSearchPackage, + PreparedNativeAnonymizer, + type NativeAnonymizeBinding, + type NativeOperatorConfig, + type NativeStaticRedactionResult, +} from "./native"; + +export type NativePipelineUnsupportedFeature = + | "enableNer" + | "enableNameCorpus" + | "enableConfidenceBoost" + | "enableCoreference" + | "enableZoneClassification" + | "enableHotwordRules"; + +export type NativePipelineCompatibility = + | { status: "supported" } + | { + status: "unsupported"; + unsupportedFeatures: NativePipelineUnsupportedFeature[]; + }; + +export type NativePipelineBuildOptions = { + binding: NativeAnonymizeBinding; + config: PipelineConfig; + gazetteerEntries?: GazetteerEntry[]; + context?: PipelineContext; +}; + +export type NativePipelinePackageOptions = NativePipelineBuildOptions & { + compressed?: boolean; +}; + +export type NativePipelineFromPackageOptions = { + binding: NativeAnonymizeBinding; + packageBytes: Uint8Array; +}; + +export class PreparedNativePipeline { + readonly #anonymizer: PreparedNativeAnonymizer; + + constructor(anonymizer: PreparedNativeAnonymizer) { + this.#anonymizer = anonymizer; + } + + prepareDiagnosticsJson(): string | null { + return this.#anonymizer.prepareDiagnosticsJson(); + } + + redactText( + fullText: string, + operators?: NativeOperatorConfig, + ): NativeStaticRedactionResult { + return this.#anonymizer.redactStaticEntities(fullText, operators); + } + + redactTextDiagnosticsJson( + fullText: string, + operators?: NativeOperatorConfig, + ): string | null { + return this.#anonymizer.redactStaticEntitiesDiagnosticsJson( + fullText, + operators, + ); + } +} + +export const getNativePipelineCompatibility = ( + config: PipelineConfig, +): NativePipelineCompatibility => { + const unsupportedFeatures: NativePipelineUnsupportedFeature[] = []; + + if (config.enableNer) unsupportedFeatures.push("enableNer"); + if (config.enableNameCorpus) unsupportedFeatures.push("enableNameCorpus"); + if (config.enableConfidenceBoost) { + 
unsupportedFeatures.push("enableConfidenceBoost"); + } + if (config.enableCoreference) unsupportedFeatures.push("enableCoreference"); + if (config.enableZoneClassification === true) { + unsupportedFeatures.push("enableZoneClassification"); + } + if (config.enableHotwordRules === true) { + unsupportedFeatures.push("enableHotwordRules"); + } + + if (unsupportedFeatures.length === 0) { + return { status: "supported" }; + } + return { status: "unsupported", unsupportedFeatures }; +}; + +export const assertNativePipelineSupported = (config: PipelineConfig): void => { + const compatibility = getNativePipelineCompatibility(config); + if (compatibility.status === "supported") { + return; + } + throw new Error( + `Native pipeline does not yet support: ${compatibility.unsupportedFeatures.join(", ")}`, + ); +}; + +export const prepareNativePipelineConfig = async ({ + config, + gazetteerEntries = [], + context, +}: Omit< + NativePipelineBuildOptions, + "binding" +>): Promise => { + assertNativePipelineSupported(config); + const bundle = await buildNativeStaticSearchBundle( + config, + gazetteerEntries, + context ?? defaultContext, + ); + return bundle.nativeStaticConfig; +}; + +export const prepareNativePipelinePackage = async ({ + binding, + config, + gazetteerEntries = [], + context, + compressed = true, +}: NativePipelinePackageOptions): Promise => { + const nativeConfig = await prepareNativePipelineConfig({ + config, + gazetteerEntries, + ...(context ? { context } : {}), + }); + return prepareNativeSearchPackage({ + binding, + config: nativeConfig, + compressed, + }); +}; + +export const createNativePipelineFromConfig = async ({ + binding, + config, + gazetteerEntries = [], + context, +}: NativePipelineBuildOptions): Promise => { + const nativeConfig = await prepareNativePipelineConfig({ + config, + gazetteerEntries, + ...(context ? 
{ context } : {}), + }); + return new PreparedNativePipeline( + createNativeAnonymizerFromConfig({ + binding, + config: nativeConfig, + }), + ); +}; + +export const createNativePipelineFromPackage = ({ + binding, + packageBytes, +}: NativePipelineFromPackageOptions): PreparedNativePipeline => + new PreparedNativePipeline( + createNativeAnonymizerFromPackage({ binding, packageBytes }), + ); From 55069570a41c4b23167409bdaee14a7978698ac1 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 23:12:44 +0200 Subject: [PATCH 052/130] fix: cap prepared package payloads --- crates/anonymize-adapter-contract/src/lib.rs | 84 ++++++++++++++++++-- 1 file changed, 79 insertions(+), 5 deletions(-) diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 8c0d5dd4..117235d9 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -27,6 +27,7 @@ const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONCPZ1"; const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 3; const PREPARED_SEARCH_PACKAGE_DIGEST_BYTES: usize = 32; const PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL: i32 = 3; +const MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES: usize = 256 * 1024 * 1024; #[derive(Clone, Debug, Eq, PartialEq)] pub enum ContractError { @@ -1191,9 +1192,16 @@ impl<'a> PreparedSearchPackageParts<'a> { uncompressed_len, payload, .. 
- } => zstd::bulk::decompress(payload, uncompressed_len) - .map(Cow::Owned) - .map_err(|error| invalid_prepared_search_package(error.to_string())), + } => { + if uncompressed_len > MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES { + return Err(invalid_prepared_search_package( + "uncompressed payload length exceeds limit", + )); + } + zstd::bulk::decompress(payload, uncompressed_len) + .map(Cow::Owned) + .map_err(|error| invalid_prepared_search_package(error.to_string())) + } } } } @@ -2070,14 +2078,20 @@ mod tests { use super::{ BindingOperatorConfig, BindingPreparedSearchConfig, BindingSearchOptions, - BindingSearchPattern, ContractError, operator_config_from_binding, + BindingSearchPattern, ContractError, + MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES, + PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER, + PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION, + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER, + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION, + PREPARED_SEARCH_PACKAGE_DIGEST_BYTES, operator_config_from_binding, prepared_search_config_from_binding, prepared_search_core_package_from_bytes, prepared_search_core_package_to_bytes, prepared_search_core_package_to_compressed_bytes, prepared_search_package_from_bytes, prepared_search_package_has_core_payload, prepared_search_package_to_bytes, - prepared_search_package_to_compressed_bytes, + prepared_search_package_to_compressed_bytes, write_package_header, }; #[test] @@ -2173,6 +2187,36 @@ mod tests { ); } + #[test] + fn prepared_search_compressed_package_rejects_oversized_payload_len() { + let bytes = compressed_package_with_len( + PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER, + PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION, + oversized_payload_len(), + ); + let error = prepared_search_package_from_bytes(&bytes).unwrap_err(); + + assert_invalid_package_reason( + error, + "uncompressed payload length exceeds limit", + ); + } + + #[test] + fn prepared_search_core_compressed_package_rejects_oversized_payload_len() { + let bytes = 
compressed_package_with_len( + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER, + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION, + oversized_payload_len(), + ); + let error = prepared_search_core_package_from_bytes(&bytes).unwrap_err(); + + assert_invalid_package_reason( + error, + "uncompressed payload length exceeds limit", + ); + } + #[test] fn prepared_search_core_package_roundtrips_config_and_artifacts() { let config = @@ -2232,4 +2276,34 @@ mod tests { ..BindingPreparedSearchConfig::default() } } + + fn compressed_package_with_len( + header: [u8; 8], + version: u32, + uncompressed_len: u64, + ) -> Vec { + let digest = [0; PREPARED_SEARCH_PACKAGE_DIGEST_BYTES]; + let mut bytes = Vec::new(); + write_package_header(&mut bytes, header, version, &digest); + bytes.extend_from_slice(&uncompressed_len.to_le_bytes()); + bytes + } + + fn oversized_payload_len() -> u64 { + u64::try_from(MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES) + .unwrap() + .checked_add(1) + .unwrap() + } + + fn assert_invalid_package_reason(error: ContractError, expected: &str) { + assert!( + matches!( + error, + ContractError::InvalidPreparedSearchPackage { reason } + if reason == expected + ), + "expected invalid package reason: {expected}" + ); + } } From c1e28ded8477a2117116ed1106dcf715980fde31 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Thu, 25 Jun 2026 23:30:52 +0200 Subject: [PATCH 053/130] feat: support native confidence boost --- crates/anonymize-adapter-contract/src/lib.rs | 14 ++- crates/anonymize-core/src/prepared.rs | 89 ++++++++++++++++++- .../tests/address_seed_parity.rs | 1 + .../tests/false_positive_parity.rs | 1 + crates/anonymize-core/tests/prepared.rs | 45 ++++++++++ crates/anonymize-core/tests/trigger_parity.rs | 1 + .../__test__/native-adapter-parity.test.ts | 67 +++++++++++++- .../src/__test__/pipeline-config.test.ts | 2 + .../anonymize/src/build-unified-search.ts | 8 ++ packages/anonymize/src/native-pipeline.ts | 4 - 10 files changed, 221 insertions(+), 11 deletions(-) 
diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 117235d9..e2ac920b 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -18,13 +18,13 @@ use stella_anonymize_core::{ pub type Result = std::result::Result; const PREPARED_SEARCH_PACKAGE_HEADER: [u8; 8] = *b"ANONPKG1"; -const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 4; +const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 5; const PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONPKZ1"; -const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 2; +const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 3; const PREPARED_SEARCH_CORE_PACKAGE_HEADER: [u8; 8] = *b"ANONCPK1"; -const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 3; +const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 4; const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONCPZ1"; -const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 3; +const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 4; const PREPARED_SEARCH_PACKAGE_DIGEST_BYTES: usize = 32; const PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL: i32 = 3; const MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES: usize = 256 * 1024 * 1024; @@ -403,6 +403,8 @@ pub struct BindingPreparedSearchConfig { #[serde(default)] pub threshold: f64, #[serde(default)] + pub confidence_boost: bool, + #[serde(default)] pub slices: BindingPreparedSearchSlices, #[serde(default)] pub regex_meta: Vec, @@ -467,6 +469,7 @@ struct BinaryPreparedSearchConfig { literal_patterns_from_deny_list_data: bool, allowed_labels: Vec, threshold: f64, + confidence_boost: bool, slices: BindingPreparedSearchSlices, regex_meta: Vec, custom_regex_meta: Vec, @@ -665,6 +668,7 @@ impl From for BinaryPreparedSearchConfig { .literal_patterns_from_deny_list_data, allowed_labels: config.allowed_labels, threshold: config.threshold, + confidence_boost: config.confidence_boost, slices: config.slices, regex_meta: 
config.regex_meta, custom_regex_meta: config.custom_regex_meta, @@ -693,6 +697,7 @@ impl From for BindingPreparedSearchConfig { .literal_patterns_from_deny_list_data, allowed_labels: config.allowed_labels, threshold: config.threshold, + confidence_boost: config.confidence_boost, slices: config.slices, regex_meta: config.regex_meta, custom_regex_meta: config.custom_regex_meta, @@ -1129,6 +1134,7 @@ pub fn prepared_search_config_from_binding( literal_options: search_options_from_binding(config.literal_options), allowed_labels: config.allowed_labels, threshold: config.threshold, + confidence_boost: config.confidence_boost, slices: slices_from_binding(&config.slices), regex_meta: regex_meta_from_binding(config.regex_meta)?, custom_regex_meta: regex_meta_from_binding(config.custom_regex_meta)?, diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index bcc65df4..eb7df871 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -37,6 +37,10 @@ use crate::types::{ const PREPARED_SEARCH_ARTIFACTS_HEADER: [u8; 8] = *b"ANONPSR1"; const PREPARED_SEARCH_ARTIFACTS_VERSION: u32 = 1; +const NEAR_MISS_BAND: f64 = 0.15; +const BOOST_PER_NEIGHBOUR: f64 = 0.05; +const CONTEXT_WINDOW_CHARS: f64 = 150.0; +const HIGH_CONFIDENCE_FLOOR: f64 = 0.9; pub struct PreparedSearch { regex: SearchIndex, @@ -46,6 +50,7 @@ pub struct PreparedSearch { literals: SearchIndex, allowed_labels: Vec, threshold: f64, + confidence_boost: bool, slices: PreparedSearchSlices, regex_meta: Vec, custom_regex_meta: Vec, @@ -85,6 +90,8 @@ pub struct PreparedSearchConfig { pub allowed_labels: Vec, #[serde(default)] pub threshold: f64, + #[serde(default)] + pub confidence_boost: bool, pub slices: PreparedSearchSlices, pub regex_meta: Vec, pub custom_regex_meta: Vec, @@ -336,6 +343,7 @@ impl PreparedSearch { let slices = config.slices.clone(); let allowed_labels = config.allowed_labels.clone(); let threshold = config.threshold; + let 
confidence_boost = config.confidence_boost; let regex_groups = split_regex_patterns(config.regex_patterns, &slices)?; let regex_len = regex_groups.regex.len(); let custom_regex_len = config.custom_regex_patterns.len(); @@ -413,6 +421,7 @@ impl PreparedSearch { literals, allowed_labels, threshold, + confidence_boost, slices, regex_meta: config.regex_meta, custom_regex_meta: config.custom_regex_meta, @@ -819,9 +828,10 @@ impl PreparedSearch { ) -> Result { let detections = self .detect_static_entities_inner(full_text, diagnostics.as_deref_mut())?; - let raw_entities = filter_entities_for_config( + let raw_entities = filter_entities_for_redaction( detections.all_entities(), self.threshold, + self.confidence_boost, &self.allowed_labels, ); let merge_start = Instant::now(); @@ -901,10 +911,32 @@ fn filter_entities_for_config( entities: Vec, threshold: f64, allowed_labels: &[String], +) -> Vec { + filter_entities_for_threshold( + filter_entities_for_labels(entities, allowed_labels), + threshold, + ) +} + +fn filter_entities_for_redaction( + entities: Vec, + threshold: f64, + confidence_boost: bool, + allowed_labels: &[String], +) -> Vec { + let entities = filter_entities_for_labels(entities, allowed_labels); + if confidence_boost { + return boost_near_miss_entities(entities, threshold); + } + filter_entities_for_threshold(entities, threshold) +} + +fn filter_entities_for_labels( + entities: Vec, + allowed_labels: &[String], ) -> Vec { entities .into_iter() - .filter(|entity| entity.score >= threshold) .filter(|entity| { allowed_labels.is_empty() || allowed_labels.iter().any(|label| label == &entity.label) @@ -912,6 +944,59 @@ fn filter_entities_for_config( .collect() } +fn filter_entities_for_threshold( + entities: Vec, + threshold: f64, +) -> Vec { + entities + .into_iter() + .filter(|entity| entity.score >= threshold) + .collect() +} + +fn boost_near_miss_entities( + entities: Vec, + threshold: f64, +) -> Vec { + let near_miss_floor = f64::max(0.0, threshold - 
NEAR_MISS_BAND); + let anchors = entities + .iter() + .filter(|entity| entity.score >= HIGH_CONFIDENCE_FLOOR) + .map(entity_midpoint) + .collect::>(); + + entities + .into_iter() + .filter_map(|mut entity| { + if entity.score >= threshold { + return Some(entity); + } + if entity.score < near_miss_floor { + return None; + } + + let midpoint = entity_midpoint(&entity); + let neighbours = anchors + .iter() + .filter(|anchor| (midpoint - **anchor).abs() <= CONTEXT_WINDOW_CHARS) + .count(); + let neighbour_count = u32::try_from(neighbours).unwrap_or(u32::MAX); + let boosted_score = + f64::from(neighbour_count).mul_add(BOOST_PER_NEIGHBOUR, entity.score); + if boosted_score < threshold { + return None; + } + + entity.score = f64::min(1.0, boosted_score); + Some(entity) + }) + .collect() +} + +fn entity_midpoint(entity: &PipelineEntity) -> f64 { + f64::midpoint(f64::from(entity.start), f64::from(entity.end)) +} + fn record_static_entity_diagnostics( diagnostics: &mut StaticRedactionDiagnostics, full_text: &str, diff --git a/crates/anonymize-core/tests/address_seed_parity.rs b/crates/anonymize-core/tests/address_seed_parity.rs index f280c220..9bada913 100644 --- a/crates/anonymize-core/tests/address_seed_parity.rs +++ b/crates/anonymize-core/tests/address_seed_parity.rs @@ -16,6 +16,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { literal_options: SearchOptions::default(), allowed_labels: vec![], threshold: 0.0, + confidence_boost: false, slices, regex_meta: vec![], custom_regex_meta: vec![], diff --git a/crates/anonymize-core/tests/false_positive_parity.rs b/crates/anonymize-core/tests/false_positive_parity.rs index ead8c86f..f985aa76 100644 --- a/crates/anonymize-core/tests/false_positive_parity.rs +++ b/crates/anonymize-core/tests/false_positive_parity.rs @@ -18,6 +18,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { literal_options: SearchOptions::default(), allowed_labels: vec![], threshold: 0.0, + 
confidence_boost: false, slices, regex_meta: vec![], custom_regex_meta: vec![], diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 34a66463..484a7d45 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -23,6 +23,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { literal_options: SearchOptions::default(), allowed_labels: vec![], threshold: 0.0, + confidence_boost: false, slices, regex_meta: vec![], custom_regex_meta: vec![], @@ -130,6 +131,7 @@ fn prepared_search_runs_normalized_literal_pass() { literal_options: SearchOptions::default(), allowed_labels: vec![], threshold: 0.0, + confidence_boost: false, slices: PreparedSearchSlices { gazetteer: PatternSlice { start: 0, end: 1 }, ..PreparedSearchSlices::default() @@ -173,6 +175,7 @@ fn prepared_search_artifacts_match_direct_prepare() { literal_options: SearchOptions::default(), allowed_labels: vec![], threshold: 0.0, + confidence_boost: false, slices: PreparedSearchSlices { regex: PatternSlice { start: 0, end: 1 }, gazetteer: PatternSlice { start: 0, end: 1 }, @@ -306,6 +309,7 @@ fn prepared_search_emits_static_detector_entities() { }, allowed_labels: vec![], threshold: 0.0, + confidence_boost: false, slices: PreparedSearchSlices { regex: PatternSlice { start: 0, end: 1 }, custom_regex: PatternSlice { start: 0, end: 1 }, @@ -946,6 +950,7 @@ fn prepared_search_redacts_static_entities_end_to_end() { }, allowed_labels: vec![], threshold: 0.0, + confidence_boost: false, slices: PreparedSearchSlices { regex: PatternSlice { start: 0, end: 1 }, gazetteer: PatternSlice { start: 0, end: 1 }, @@ -1024,6 +1029,44 @@ fn prepared_search_applies_threshold_before_merge() { assert_eq!(result.resolved_entities[0].text, "Acme"); } +#[test] +fn prepared_search_boosts_near_miss_entities_when_enabled() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![ + 
SearchPattern::Regex(String::from(r"\bANCHOR-\d+\b")), + SearchPattern::Regex(String::from(r"\bNEAR-\d+\b")), + ], + threshold: 0.5, + confidence_boost: true, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 2 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![ + RegexMatchMeta::new("registration number", 0.95), + RegexMatchMeta::new("matter id", 0.45), + ], + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "ANCHOR-123 signed with NEAR-456.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!(result.resolved_entities.len(), 2); + assert_eq!(result.resolved_entities[0].text, "ANCHOR-123"); + assert_eq!(result.resolved_entities[1].text, "NEAR-456"); + assert!((result.resolved_entities[1].score - 0.5).abs() < f64::EPSILON); + assert_eq!( + result.redaction.redacted_text, + "[REGISTRATION_NUMBER_1] signed with [MATTER_ID_1]." + ); +} + #[test] fn prepared_search_applies_allowed_labels_before_redaction() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -1141,6 +1184,7 @@ fn prepared_search_reports_static_redaction_diagnostics() { }, allowed_labels: vec![], threshold: 0.0, + confidence_boost: false, slices: PreparedSearchSlices { regex: PatternSlice { start: 0, end: 1 }, gazetteer: PatternSlice { start: 0, end: 1 }, @@ -1219,6 +1263,7 @@ fn prepared_search_redacts_custom_deny_list_entities() { }, allowed_labels: vec![], threshold: 0.0, + confidence_boost: false, slices: PreparedSearchSlices { deny_list: PatternSlice { start: 0, end: 1 }, ..PreparedSearchSlices::default() diff --git a/crates/anonymize-core/tests/trigger_parity.rs b/crates/anonymize-core/tests/trigger_parity.rs index 05a70776..3e400251 100644 --- a/crates/anonymize-core/tests/trigger_parity.rs +++ b/crates/anonymize-core/tests/trigger_parity.rs @@ -16,6 +16,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { literal_options: SearchOptions::default(), 
allowed_labels: vec![], threshold: 0.0, + confidence_boost: false, slices, regex_meta: vec![], custom_regex_meta: vec![], diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index da4fd3ec..c73f4dd1 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -908,6 +908,72 @@ describe("native adapter parity", () => { }); }); + test("native pipeline package matches TS confidence boost redaction", async () => { + const adapters = getAdapters(); + const fullText = "ANCHOR-123 signed with NEAR-456."; + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: true, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + customRegexes: [ + { + pattern: "\\bANCHOR-\\d+\\b", + label: "registration number", + score: 0.95, + }, + { pattern: "\\bNEAR-\\d+\\b", label: "matter id", score: 0.45 }, + ], + labels: [...DEFAULT_ENTITY_LABELS, "matter id"], + workspaceId: "native-pipeline-confidence-boost-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + 
const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities.some(({ text }) => text === "NEAR-456")).toBe(true); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + test("native pipeline compatibility rejects TS-only contextual passes", () => { const config: PipelineConfig = { threshold: 0.3, @@ -931,7 +997,6 @@ describe("native adapter parity", () => { unsupportedFeatures: [ "enableNer", "enableNameCorpus", - "enableConfidenceBoost", "enableCoreference", "enableZoneClassification", "enableHotwordRules", diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index ef03b01e..e002cd08 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -117,6 +117,7 @@ describe("pipeline config semantics", () => { { ...BASE_CONFIG, enableRegex: true, + enableConfidenceBoost: true, labels: ["person"], threshold: 0.93, }, @@ -126,6 +127,7 @@ describe("pipeline config semantics", () => { expect(search.nativeStaticConfig.allowed_labels).toEqual(["person"]); expect(search.nativeStaticConfig.threshold).toBe(0.93); + expect(search.nativeStaticConfig.confidence_boost).toBe(true); }); test("native config keeps unsupported validator regexes fail-fast", async () => { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 156affc4..649a920b 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -237,6 +237,7 @@ export type NativePreparedSearchConfig = { literal_patterns_from_deny_list_data?: boolean; allowed_labels: string[]; threshold: number; + confidence_boost: boolean; slices: { regex: PatternSlice; custom_regex: 
PatternSlice; @@ -343,6 +344,7 @@ type UnifiedSearchSources = { nativeSigningPatternRange: PatternSlice; nativeAllowedLabels: readonly string[]; threshold: number; + confidenceBoost: boolean; slices: UnifiedSearchInstance["slices"]; literalAllPatterns: PatternEntry[] | string[]; canUseGlobalWholeWordLiterals: boolean; @@ -686,6 +688,7 @@ const buildUnifiedSearchSources = async ( nativeSigningPatternRange, nativeAllowedLabels: config.labels, threshold: config.threshold, + confidenceBoost: config.enableConfidenceBoost, slices: { regex: regexSlice, customRegex: customRegexSlice, @@ -742,6 +745,7 @@ export const buildNativeStaticSearchBundle = async ( customDenyListNeedsWholeWords: sources.customDenyListNeedsWholeWords, allowedLabels: sources.nativeAllowedLabels, threshold: sources.threshold, + confidenceBoost: sources.confidenceBoost, }), slices: sources.slices, regexMeta: sources.regexMeta, @@ -823,6 +827,7 @@ export const buildUnifiedSearch = async ( customDenyListNeedsWholeWords: sources.customDenyListNeedsWholeWords, allowedLabels: sources.nativeAllowedLabels, threshold: sources.threshold, + confidenceBoost: sources.confidenceBoost, }); return { @@ -865,6 +870,7 @@ type BuildNativeStaticConfigArgs = { customDenyListNeedsWholeWords: (pattern: string) => boolean; allowedLabels: readonly string[]; threshold: number; + confidenceBoost: boolean; }; const buildNativeStaticConfig = ({ @@ -892,6 +898,7 @@ const buildNativeStaticConfig = ({ customDenyListNeedsWholeWords, allowedLabels, threshold, + confidenceBoost, }: BuildNativeStaticConfigArgs): NativePreparedSearchConfig => { const nativeRegexPatterns: NativeSearchPattern[] = []; const nativeRegexMeta: NativeRegexMatchMeta[] = []; @@ -1006,6 +1013,7 @@ const buildNativeStaticConfig = ({ literal_patterns_from_deny_list_data: denyListPatternsFromData, allowed_labels: [...allowedLabels], threshold, + confidence_boost: confidenceBoost, slices: { regex: { start: 0, end: nativeRegexPatterns.length }, custom_regex: { start: 0, 
end: nativeCustomRegexPatterns.length }, diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts index 44743c2a..1812e33d 100644 --- a/packages/anonymize/src/native-pipeline.ts +++ b/packages/anonymize/src/native-pipeline.ts @@ -18,7 +18,6 @@ import { export type NativePipelineUnsupportedFeature = | "enableNer" | "enableNameCorpus" - | "enableConfidenceBoost" | "enableCoreference" | "enableZoneClassification" | "enableHotwordRules"; @@ -82,9 +81,6 @@ export const getNativePipelineCompatibility = ( if (config.enableNer) unsupportedFeatures.push("enableNer"); if (config.enableNameCorpus) unsupportedFeatures.push("enableNameCorpus"); - if (config.enableConfidenceBoost) { - unsupportedFeatures.push("enableConfidenceBoost"); - } if (config.enableCoreference) unsupportedFeatures.push("enableCoreference"); if (config.enableZoneClassification === true) { unsupportedFeatures.push("enableZoneClassification"); From 7ee8fc80efe6fb6c486ee4152cbdcf7cdbea42d8 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 00:00:48 +0200 Subject: [PATCH 054/130] feat: support native hotword rules --- crates/anonymize-adapter-contract/src/lib.rs | 62 +++++- crates/anonymize-core/src/hotwords.rs | 210 ++++++++++++++++++ crates/anonymize-core/src/lib.rs | 2 + crates/anonymize-core/src/prepared.rs | 73 +++++- .../tests/address_seed_parity.rs | 1 + .../tests/false_positive_parity.rs | 1 + crates/anonymize-core/tests/prepared.rs | 69 +++++- crates/anonymize-core/tests/trigger_parity.rs | 1 + .../__test__/native-adapter-parity.test.ts | 66 +++++- .../src/__test__/pipeline-config.test.ts | 27 +++ .../anonymize/src/build-unified-search.ts | 72 +++++- .../anonymize/src/filters/hotword-rules.ts | 32 ++- packages/anonymize/src/native-pipeline.ts | 6 +- 13 files changed, 593 insertions(+), 29 deletions(-) create mode 100644 crates/anonymize-core/src/hotwords.rs diff --git a/crates/anonymize-adapter-contract/src/lib.rs 
b/crates/anonymize-adapter-contract/src/lib.rs index e2ac920b..c9733c3a 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -6,11 +6,12 @@ use stella_anonymize_core::{ AddressSeedData, AmountWordsData, CountryMatchData, CurrencyData, DateData, DenyListFilterData, DenyListMatchData, DetectionSource, DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, FuzzySearchOptions, GazetteerMatchData, - LegalFormData, LiteralSearchOptions, MagnitudeSuffixData, MonetaryData, - OperatorConfig, OperatorType, PatternSlice, PreparedSearchConfig, - PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchEngine, - SearchOptions, SearchPattern, ShareQuantityTermData, SigningPlaceGuardData, - SourceDetail, StaticRedactionDiagnosticResult, StaticRedactionDiagnostics, + HotwordRule, HotwordRuleData, LegalFormData, LiteralSearchOptions, + MagnitudeSuffixData, MonetaryData, OperatorConfig, OperatorType, + PatternSlice, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, + RegexSearchOptions, SearchEngine, SearchOptions, SearchPattern, + ShareQuantityTermData, SigningPlaceGuardData, SourceDetail, + StaticRedactionDiagnosticResult, StaticRedactionDiagnostics, StaticRedactionResult, StringGroups, TriggerData, TriggerRule, TriggerStrategy, TriggerValidation, WrittenAmountPatternData, }; @@ -18,13 +19,13 @@ use stella_anonymize_core::{ pub type Result = std::result::Result; const PREPARED_SEARCH_PACKAGE_HEADER: [u8; 8] = *b"ANONPKG1"; -const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 5; +const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 6; const PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONPKZ1"; -const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 3; +const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 4; const PREPARED_SEARCH_CORE_PACKAGE_HEADER: [u8; 8] = *b"ANONCPK1"; -const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 4; +const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 5; const 
PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONCPZ1"; -const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 4; +const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 5; const PREPARED_SEARCH_PACKAGE_DIGEST_BYTES: usize = 32; const PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL: i32 = 3; const MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES: usize = 256 * 1024 * 1024; @@ -131,6 +132,7 @@ pub struct BindingPreparedSearchSlices { pub street_types: Option, pub gazetteer: Option, pub countries: Option, + pub hotwords: Option, } #[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] @@ -155,6 +157,24 @@ pub struct BindingCountryMatchData { pub labels: Vec, } +#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] +pub struct BindingHotwordRuleData { + #[serde(default)] + pub rules: Vec, + #[serde(default)] + pub pattern_rule_indices: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] +pub struct BindingHotwordRule { + #[serde(default)] + pub target_labels: Vec, + pub score_adjustment: f64, + pub reclassify_to: Option, + pub proximity_before: u32, + pub proximity_after: u32, +} + #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] pub struct BindingTriggerData { pub rules: Vec, @@ -417,6 +437,8 @@ pub struct BindingPreparedSearchConfig { #[serde(default)] pub country_data: Option, #[serde(default)] + pub hotword_data: Option, + #[serde(default)] pub trigger_data: Option, #[serde(default)] pub legal_form_data: Option, @@ -476,6 +498,7 @@ struct BinaryPreparedSearchConfig { deny_list_data: Option, gazetteer_data: Option, country_data: Option, + hotword_data: Option, trigger_data: Option, legal_form_data: Option, address_seed_data: Option, @@ -675,6 +698,7 @@ impl From for BinaryPreparedSearchConfig { deny_list_data: config.deny_list_data, gazetteer_data: config.gazetteer_data, country_data: config.country_data, + hotword_data: config.hotword_data, trigger_data: 
config.trigger_data.map(BinaryTriggerData::from), legal_form_data: config.legal_form_data, address_seed_data: config.address_seed_data, @@ -704,6 +728,7 @@ impl From for BindingPreparedSearchConfig { deny_list_data: config.deny_list_data, gazetteer_data: config.gazetteer_data, country_data: config.country_data, + hotword_data: config.hotword_data, trigger_data: config.trigger_data.map(BindingTriggerData::from), legal_form_data: config.legal_form_data, address_seed_data: config.address_seed_data, @@ -1148,6 +1173,7 @@ pub fn prepared_search_config_from_binding( country_data: config.country_data.map(|data| CountryMatchData { labels: data.labels, }), + hotword_data: config.hotword_data.map(hotword_data_from_binding), trigger_data: config .trigger_data .map(|data| trigger_data_from_binding(data, legal_form_suffixes)), @@ -1474,6 +1500,23 @@ fn monetary_data_from_binding(data: BindingMonetaryData) -> MonetaryData { } } +fn hotword_data_from_binding(data: BindingHotwordRuleData) -> HotwordRuleData { + HotwordRuleData { + rules: data + .rules + .into_iter() + .map(|rule| HotwordRule { + target_labels: rule.target_labels, + score_adjustment: rule.score_adjustment, + reclassify_to: rule.reclassify_to, + proximity_before: rule.proximity_before, + proximity_after: rule.proximity_after, + }) + .collect(), + pattern_rule_indices: data.pattern_rule_indices, + } +} + pub fn operator_config_from_binding( config: Option, ) -> Result { @@ -1932,6 +1975,7 @@ fn slices_from_binding( street_types: slice_from_binding(slices.street_types), gazetteer: slice_from_binding(slices.gazetteer), countries: slice_from_binding(slices.countries), + hotwords: slice_from_binding(slices.hotwords), } } diff --git a/crates/anonymize-core/src/hotwords.rs b/crates/anonymize-core/src/hotwords.rs new file mode 100644 index 00000000..69e2972d --- /dev/null +++ b/crates/anonymize-core/src/hotwords.rs @@ -0,0 +1,210 @@ +use crate::byte_offsets::ByteOffsets; +use crate::processors::PatternSlice; +use 
crate::resolution::{PipelineEntity, SourceDetail}; +use crate::types::{Error, Result, SearchMatch}; + +#[derive(Clone, Debug, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct HotwordRuleData { + pub rules: Vec, + pub pattern_rule_indices: Vec, +} + +#[derive(Clone, Debug, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct HotwordRule { + pub target_labels: Vec, + pub score_adjustment: f64, + pub reclassify_to: Option, + pub proximity_before: u32, + pub proximity_after: u32, +} + +pub(crate) fn apply_hotword_rules( + entities: Vec, + full_text: &str, + matches: &[SearchMatch], + slice: PatternSlice, + data: &HotwordRuleData, + allowed_labels: &[String], +) -> Result> { + let hits_by_rule = collect_hits_by_rule(matches, slice, data)?; + let offsets = ByteOffsets::new(full_text); + let mut result = Vec::with_capacity(entities.len()); + + for entity in entities { + if caller_owned(&entity) { + result.push(entity); + continue; + } + + let adjusted = + apply_entity_rules(entity, full_text, &offsets, data, &hits_by_rule)?; + if label_allowed(&adjusted.label, allowed_labels) { + result.push(adjusted); + } + } + + Ok(result) +} + +fn collect_hits_by_rule( + matches: &[SearchMatch], + slice: PatternSlice, + data: &HotwordRuleData, +) -> Result>> { + let mut hits_by_rule = vec![Vec::new(); data.rules.len()]; + + for found in matches { + let Some(local_index) = slice.local_index(found.pattern()) else { + continue; + }; + let Some(rule_index) = data.pattern_rule_indices.get(local_index) else { + continue; + }; + let Ok(rule_index) = usize::try_from(*rule_index) else { + return Err(Error::InvalidStaticData { + field: "hotword_data.pattern_rule_indices", + reason: String::from("rule index exceeds usize range"), + }); + }; + let Some(bucket) = hits_by_rule.get_mut(rule_index) else { + return Err(Error::InvalidStaticData { + field: "hotword_data.pattern_rule_indices", + reason: String::from("rule index out of range"), + }); + }; + bucket.push(*found); + } + 
+ Ok(hits_by_rule) +} + +fn apply_entity_rules( + mut entity: PipelineEntity, + full_text: &str, + offsets: &ByteOffsets<'_>, + data: &HotwordRuleData, + hits_by_rule: &[Vec], +) -> Result { + let mut best = None::; + + for (rule_index, rule) in data.rules.iter().enumerate() { + if !rule + .target_labels + .iter() + .any(|label| label == &entity.label) + { + continue; + } + let Some(rule_hits) = hits_by_rule.get(rule_index) else { + continue; + }; + for hit in rule_hits { + let Some((distance, max_distance)) = + hotword_distance(full_text, offsets, &entity, hit, rule)? + else { + continue; + }; + let decay = if max_distance == 0 { + 1.0 + } else { + 1.0 - (f64::from(distance) / f64::from(max_distance)) + }; + let adjustment = rule.score_adjustment * decay; + if adjustment.abs() <= f64::EPSILON { + continue; + } + if best + .as_ref() + .is_some_and(|best| adjustment.abs() <= best.score.abs()) + { + continue; + } + + best = Some(HotwordAdjustment { + score: adjustment, + reclassify_to: if adjustment.is_sign_positive() { + rule.reclassify_to.clone() + } else { + None + }, + }); + } + } + + let Some(best) = best else { + return Ok(entity); + }; + + entity.score = (entity.score + best.score).clamp(0.0, 1.0); + if let Some(label) = best.reclassify_to { + entity.label = label; + } + Ok(entity) +} + +fn hotword_distance( + full_text: &str, + offsets: &ByteOffsets<'_>, + entity: &PipelineEntity, + hit: &SearchMatch, + rule: &HotwordRule, +) -> Result> { + let (distance, max_distance) = if hit.end() <= entity.start { + ( + char_distance(full_text, offsets, hit.end(), entity.start)?, + rule.proximity_before, + ) + } else if hit.start() >= entity.end { + ( + char_distance(full_text, offsets, entity.end, hit.start())?, + rule.proximity_after, + ) + } else { + (0, u32::max(rule.proximity_before, rule.proximity_after)) + }; + + if distance > max_distance { + return Ok(None); + } + Ok(Some((distance, max_distance))) +} + +fn char_distance( + full_text: &str, + offsets: 
&ByteOffsets<'_>, + start: u32, + end: u32, +) -> Result { + if start > end { + return Err(Error::InvalidSpan { start, end }); + } + let start = offsets.validate_offset(start)?; + let end = offsets.validate_offset(end)?; + let distance = full_text + .get(start..end) + .ok_or_else(|| Error::InvalidSpan { + start: u32::try_from(start).unwrap_or(u32::MAX), + end: u32::try_from(end).unwrap_or(u32::MAX), + })? + .chars() + .count(); + u32::try_from(distance) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }) +} + +const fn caller_owned(entity: &PipelineEntity) -> bool { + matches!( + entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) +} + +fn label_allowed(label: &str, allowed_labels: &[String]) -> bool { + allowed_labels.is_empty() + || allowed_labels.iter().any(|allowed| allowed == label) +} + +struct HotwordAdjustment { + score: f64, + reclassify_to: Option, +} diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index 5f8f077a..8116c5ce 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -9,6 +9,7 @@ pub(crate) mod byte_offsets; mod dates; mod diagnostics; mod false_positives; +mod hotwords; mod legal_forms; mod money; pub(crate) mod normalize; @@ -29,6 +30,7 @@ pub use diagnostics::{ DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, StaticRedactionDiagnostics, }; +pub use hotwords::{HotwordRule, HotwordRuleData}; pub use legal_forms::LegalFormData; pub use money::{ AmountWordsData, CurrencyData, MagnitudeSuffixData, MonetaryData, diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index eb7df871..0e8835ab 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -5,6 +5,7 @@ use crate::artifact_bytes::{ArtifactReader, ArtifactWriter}; use crate::dates::{DateData, PreparedDateData}; use crate::diagnostics::{DiagnosticStage, StaticRedactionDiagnostics}; use 
crate::false_positives::filter_entity_false_positives; +use crate::hotwords::{HotwordRuleData, apply_hotword_rules}; use crate::legal_forms::{ LegalFormData, PreparedLegalFormData, process_legal_form_matches, }; @@ -57,6 +58,7 @@ pub struct PreparedSearch { deny_list_data: Option, gazetteer_data: Option, country_data: Option, + hotword_data: Option, trigger_data: Option, legal_form_data: Option, address_seed_data: Option, @@ -76,6 +78,7 @@ pub struct PreparedSearchSlices { pub street_types: PatternSlice, pub gazetteer: PatternSlice, pub countries: PatternSlice, + pub hotwords: PatternSlice, } #[derive(Clone, Debug, PartialEq, serde::Deserialize, serde::Serialize)] @@ -98,6 +101,8 @@ pub struct PreparedSearchConfig { pub deny_list_data: Option, pub gazetteer_data: Option, pub country_data: Option, + #[serde(default)] + pub hotword_data: Option, pub trigger_data: Option, pub legal_form_data: Option, pub address_seed_data: Option, @@ -428,6 +433,7 @@ impl PreparedSearch { deny_list_data: config.deny_list_data, gazetteer_data: config.gazetteer_data, country_data: config.country_data, + hotword_data: config.hotword_data, trigger_data: config .trigger_data .map(PreparedTriggerData::new) @@ -828,8 +834,13 @@ impl PreparedSearch { ) -> Result { let detections = self .detect_static_entities_inner(full_text, diagnostics.as_deref_mut())?; - let raw_entities = filter_entities_for_redaction( + let pre_threshold_entities = self.apply_hotword_entities( detections.all_entities(), + full_text, + &detections.matches.literal, + )?; + let raw_entities = filter_entities_for_redaction( + pre_threshold_entities, self.threshold, self.confidence_boost, &self.allowed_labels, @@ -897,6 +908,25 @@ impl PreparedSearch { redaction, }) } + + fn apply_hotword_entities( + &self, + entities: Vec, + full_text: &str, + literal_matches: &[SearchMatch], + ) -> Result> { + let Some(data) = &self.hotword_data else { + return Ok(entities); + }; + apply_hotword_rules( + entities, + full_text, + 
literal_matches, + self.slices.hotwords, + data, + &self.allowed_labels, + ) + } } fn process_signature_entities(full_text: &str) -> TimedEntities { @@ -1420,6 +1450,7 @@ fn validate_supported_config( validate_deny_list_config(config)?; validate_gazetteer_config(config)?; validate_country_config(config)?; + validate_hotword_config(config)?; validate_address_seed_config(config) } @@ -1468,6 +1499,11 @@ fn validate_search_config( config.slices.countries, config.literal_patterns.len(), )?; + validate_slice_bounds( + "slices.hotwords", + config.slices.hotwords, + config.literal_patterns.len(), + )?; } validate_static_data_length( "regex_meta", @@ -1598,6 +1634,41 @@ fn validate_country_config(config: &PreparedSearchConfig) -> Result<()> { ) } +fn validate_hotword_config(config: &PreparedSearchConfig) -> Result<()> { + if config.slices.hotwords.is_empty() { + return Ok(()); + } + + let Some(data) = &config.hotword_data else { + return Err(Error::MissingStaticData { + field: "hotword_data", + }); + }; + + validate_static_data_length( + "hotword_data.pattern_rule_indices", + config.slices.hotwords, + data.pattern_rule_indices.len(), + )?; + + for rule_index in &data.pattern_rule_indices { + let Ok(rule_index) = usize::try_from(*rule_index) else { + return Err(Error::InvalidStaticData { + field: "hotword_data.pattern_rule_indices", + reason: String::from("rule index exceeds usize range"), + }); + }; + if rule_index >= data.rules.len() { + return Err(Error::InvalidStaticData { + field: "hotword_data.pattern_rule_indices", + reason: String::from("rule index out of range"), + }); + } + } + + Ok(()) +} + const fn validate_address_seed_config( config: &PreparedSearchConfig, ) -> Result<()> { diff --git a/crates/anonymize-core/tests/address_seed_parity.rs b/crates/anonymize-core/tests/address_seed_parity.rs index 9bada913..37bf3032 100644 --- a/crates/anonymize-core/tests/address_seed_parity.rs +++ b/crates/anonymize-core/tests/address_seed_parity.rs @@ -23,6 +23,7 @@ fn 
empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { deny_list_data: None, gazetteer_data: None, country_data: None, + hotword_data: None, trigger_data: None, legal_form_data: None, address_seed_data: None, diff --git a/crates/anonymize-core/tests/false_positive_parity.rs b/crates/anonymize-core/tests/false_positive_parity.rs index f985aa76..86f6fa71 100644 --- a/crates/anonymize-core/tests/false_positive_parity.rs +++ b/crates/anonymize-core/tests/false_positive_parity.rs @@ -25,6 +25,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { deny_list_data: None, gazetteer_data: None, country_data: None, + hotword_data: None, trigger_data: None, legal_form_data: None, address_seed_data: None, diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 484a7d45..db0f196c 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -5,12 +5,13 @@ use std::collections::{BTreeMap, BTreeSet}; use stella_anonymize_core::{ AddressSeedData, AmountWordsData, CountryMatchData, CurrencyData, DateData, DenyListFilterData, DenyListMatchData, DetectionSource, DiagnosticEventKind, - DiagnosticStage, Error, FuzzySearchOptions, GazetteerMatchData, - LegalFormData, LiteralSearchOptions, MagnitudeSuffixData, MonetaryData, - OperatorConfig, PatternSlice, PreparedSearch, PreparedSearchArtifacts, - PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, - RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, TriggerData, - TriggerRule, TriggerStrategy, TriggerValidation, WrittenAmountPatternData, + DiagnosticStage, Error, FuzzySearchOptions, GazetteerMatchData, HotwordRule, + HotwordRuleData, LegalFormData, LiteralSearchOptions, MagnitudeSuffixData, + MonetaryData, OperatorConfig, PatternSlice, PreparedSearch, + PreparedSearchArtifacts, PreparedSearchConfig, PreparedSearchSlices, + RegexMatchMeta, RegexSearchOptions, SearchOptions, SearchPattern, + 
SourceDetail, TriggerData, TriggerRule, TriggerStrategy, TriggerValidation, + WrittenAmountPatternData, }; fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { @@ -30,6 +31,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { deny_list_data: None, gazetteer_data: None, country_data: None, + hotword_data: None, trigger_data: None, legal_form_data: None, address_seed_data: None, @@ -144,6 +146,7 @@ fn prepared_search_runs_normalized_literal_pass() { is_fuzzy: vec![false], }), country_data: None, + hotword_data: None, trigger_data: None, legal_form_data: None, address_seed_data: None, @@ -189,6 +192,7 @@ fn prepared_search_artifacts_match_direct_prepare() { is_fuzzy: vec![false], }), country_data: None, + hotword_data: None, trigger_data: None, legal_form_data: None, address_seed_data: None, @@ -335,6 +339,7 @@ fn prepared_search_emits_static_detector_entities() { country_data: Some(CountryMatchData { labels: vec![String::from("country")], }), + hotword_data: None, trigger_data: None, legal_form_data: None, address_seed_data: None, @@ -967,6 +972,7 @@ fn prepared_search_redacts_static_entities_end_to_end() { country_data: Some(CountryMatchData { labels: vec![String::from("country")], }), + hotword_data: None, trigger_data: None, legal_form_data: None, address_seed_data: None, @@ -1067,6 +1073,55 @@ fn prepared_search_boosts_near_miss_entities_when_enabled() { ); } +#[test] +fn prepared_search_applies_hotword_reclassification_before_threshold() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\b\d{2}\.\d{2}\.\d{4}\b", + ))], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("narozen"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + allowed_labels: vec![String::from("date of birth")], + threshold: 0.8, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + hotwords: 
PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("date", 0.7)], + hotword_data: Some(HotwordRuleData { + rules: vec![HotwordRule { + target_labels: vec![String::from("date")], + score_adjustment: 0.15, + reclassify_to: Some(String::from("date of birth")), + proximity_before: 60, + proximity_after: 60, + }], + pattern_rule_indices: vec![0], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "narozen dne 12.03.1990 v Praze", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!(result.resolved_entities.len(), 1); + assert_eq!(result.resolved_entities[0].label, "date of birth"); + assert_eq!(result.resolved_entities[0].text, "12.03.1990"); + assert_eq!( + result.redaction.redacted_text, + "narozen dne [DATE_OF_BIRTH_1] v Praze" + ); +} + #[test] fn prepared_search_applies_allowed_labels_before_redaction() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -1198,6 +1253,7 @@ fn prepared_search_reports_static_redaction_diagnostics() { is_fuzzy: vec![false], }), country_data: None, + hotword_data: None, trigger_data: None, legal_form_data: None, address_seed_data: None, @@ -1279,6 +1335,7 @@ fn prepared_search_redacts_custom_deny_list_entities() { }), gazetteer_data: None, country_data: None, + hotword_data: None, trigger_data: None, legal_form_data: None, address_seed_data: None, diff --git a/crates/anonymize-core/tests/trigger_parity.rs b/crates/anonymize-core/tests/trigger_parity.rs index 3e400251..75c1bffc 100644 --- a/crates/anonymize-core/tests/trigger_parity.rs +++ b/crates/anonymize-core/tests/trigger_parity.rs @@ -23,6 +23,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { deny_list_data: None, gazetteer_data: None, country_data: None, + hotword_data: None, trigger_data: None, legal_form_data: None, address_seed_data: None, diff --git 
a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index c73f4dd1..4b8d61ee 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -974,6 +974,71 @@ describe("native adapter parity", () => { }); }); + test("native pipeline package matches TS hotword reclassification", async () => { + const adapters = getAdapters(); + const fullText = "narozen dne 12.03.1990 v Praze"; + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: true, + enableZoneClassification: false, + labels: ["date of birth"], + workspaceId: "native-pipeline-hotword-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + label: "date of birth", + text: "12.03.1990", + }), + ]), + ); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: 
tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + test("native pipeline compatibility rejects TS-only contextual passes", () => { const config: PipelineConfig = { threshold: 0.3, @@ -999,7 +1064,6 @@ describe("native adapter parity", () => { "enableNameCorpus", "enableCoreference", "enableZoneClassification", - "enableHotwordRules", ], }); }); diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index e002cd08..6b6c7bca 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -130,6 +130,33 @@ describe("pipeline config semantics", () => { expect(search.nativeStaticConfig.confidence_boost).toBe(true); }); + test("native config carries hotword rule metadata", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: true, + enableHotwordRules: true, + labels: ["date of birth"], + }, + [], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.allowed_labels).toEqual(["date of birth"]); + expect(search.nativeStaticConfig.slices.hotwords?.end).toBeGreaterThan( + search.nativeStaticConfig.slices.hotwords?.start ?? 0, + ); + expect( + search.nativeStaticConfig.hotword_data?.rules.length, + ).toBeGreaterThan(0); + expect( + search.nativeStaticConfig.hotword_data?.pattern_rule_indices.length, + ).toBe( + (search.nativeStaticConfig.slices.hotwords?.end ?? 0) - + (search.nativeStaticConfig.slices.hotwords?.start ?? 
0), + ); + }); + test("native config keeps unsupported validator regexes fail-fast", async () => { const search = await buildUnifiedSearch( { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 649a920b..d86c45bc 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -65,7 +65,11 @@ import { } from "./detectors/address-seeds"; import { buildGazetteerPatterns } from "./detectors/gazetteer"; import { buildCountryPatterns, type CountryData } from "./detectors/countries"; -import { expandLabelsForHotwordRules } from "./filters/hotword-rules"; +import { + expandLabelsForHotwordRuleSet, + loadHotwordRuleSet, + type HotwordRule, +} from "./filters/hotword-rules"; import { getClauseNounHeadsSync, getConnectorProseHeadsSync, @@ -227,6 +231,19 @@ export type NativeGazetteerData = { is_fuzzy: boolean[]; }; +export type NativeHotwordRule = { + target_labels: string[]; + score_adjustment: number; + reclassify_to?: string; + proximity_before: number; + proximity_after: number; +}; + +export type NativeHotwordRuleData = { + rules: NativeHotwordRule[]; + pattern_rule_indices: number[]; +}; + export type NativePreparedSearchConfig = { regex_patterns: NativeSearchPattern[]; custom_regex_patterns: NativeSearchPattern[]; @@ -247,12 +264,14 @@ export type NativePreparedSearchConfig = { street_types?: PatternSlice; gazetteer: PatternSlice; countries: PatternSlice; + hotwords?: PatternSlice; }; regex_meta: NativeRegexMatchMeta[]; custom_regex_meta: NativeRegexMatchMeta[]; deny_list_data?: NativeDenyListMatchData; gazetteer_data?: NativeGazetteerData; country_data?: CountryData; + hotword_data?: NativeHotwordRuleData; trigger_data?: NativeTriggerData; legal_form_data?: NativeLegalFormData; address_seed_data?: NativeAddressSeedData; @@ -339,6 +358,7 @@ type UnifiedSearchSources = { nativeAddressSeedData: NativeAddressSeedData | null; nativeSigningPatterns: readonly 
string[]; partyPositionTerms: string[]; + hotwordRules: readonly HotwordRule[]; nativeCurrencyPatternRange: PatternSlice; nativeDatePatternRange: PatternSlice; nativeSigningPatternRange: PatternSlice; @@ -366,9 +386,11 @@ const buildUnifiedSearchSources = async ( ): Promise => { config = applyPipelineLanguageScope(config); const legalFormsEnabled = isLegalFormsEnabled(config); + const hotwordRules = + config.enableHotwordRules === true ? await loadHotwordRuleSet() : []; const searchLabels = config.enableHotwordRules === true - ? expandLabelsForHotwordRules(config.labels) + ? expandLabelsForHotwordRuleSet(config.labels, hotwordRules) : config.labels; const allowedLabels = createAllowedLabelSet(searchLabels); const customRegexes = config.enableRegex @@ -683,6 +705,7 @@ const buildUnifiedSearchSources = async ( nativeAddressSeedData: addressSeedData, nativeSigningPatterns, partyPositionTerms, + hotwordRules, nativeCurrencyPatternRange, nativeDatePatternRange, nativeSigningPatternRange, @@ -731,6 +754,7 @@ export const buildNativeStaticSearchBundle = async ( addressSeedData: sources.nativeAddressSeedData, nativeSigningPatterns: sources.nativeSigningPatterns, partyPositionTerms: sources.partyPositionTerms, + hotwordRules: sources.hotwordRules, streetTypes: sources.streetTypes, omitRegexRanges: [ sources.nativeCurrencyPatternRange, @@ -813,6 +837,7 @@ export const buildUnifiedSearch = async ( addressSeedData: sources.nativeAddressSeedData, nativeSigningPatterns: sources.nativeSigningPatterns, partyPositionTerms: sources.partyPositionTerms, + hotwordRules: sources.hotwordRules, streetTypes: sources.streetTypes, omitRegexRanges: [ sources.nativeCurrencyPatternRange, @@ -860,6 +885,7 @@ type BuildNativeStaticConfigArgs = { addressSeedData: NativeAddressSeedData | null; nativeSigningPatterns: readonly string[]; partyPositionTerms: readonly string[]; + hotwordRules: readonly HotwordRule[]; omitRegexRanges?: readonly PatternSlice[]; streetTypes: readonly string[]; 
gazetteerPatterns: readonly PatternEntry[]; @@ -888,6 +914,7 @@ const buildNativeStaticConfig = ({ addressSeedData, nativeSigningPatterns, partyPositionTerms, + hotwordRules, omitRegexRanges, streetTypes, gazetteerPatterns, @@ -960,6 +987,14 @@ const buildNativeStaticConfig = ({ ? toNativeGlobalLiteralPattern(patternEntryText(pattern)) : toNativeLiteralPattern(pattern), ); + const nativeHotwordPatterns: NativeSearchPattern[] = []; + const nativeHotwordPatternRuleIndices: number[] = []; + for (const [ruleIndex, rule] of hotwordRules.entries()) { + for (const hotword of rule.hotwords) { + nativeHotwordPatterns.push(toNativeHotwordPattern(hotword)); + nativeHotwordPatternRuleIndices.push(ruleIndex); + } + } let literalOffset = 0; const denyListPatternCount = denyListPatternsFromData @@ -984,6 +1019,11 @@ const buildNativeStaticConfig = ({ start: literalOffset, end: literalOffset + countryNativePatterns.length, }; + literalOffset = countriesSlice.end; + const hotwordsSlice = { + start: literalOffset, + end: literalOffset + nativeHotwordPatterns.length, + }; const nativeConfig: NativePreparedSearchConfig = { regex_patterns: nativeRegexPatterns, @@ -993,6 +1033,7 @@ const buildNativeStaticConfig = ({ ...streetTypeNativePatterns, ...gazetteerNativePatterns, ...countryNativePatterns, + ...nativeHotwordPatterns, ], regex_options: { literal_case_insensitive: true, @@ -1032,6 +1073,7 @@ const buildNativeStaticConfig = ({ street_types: streetTypesSlice, gazetteer: gazetteerSlice, countries: countriesSlice, + hotwords: hotwordsSlice, }, regex_meta: nativeRegexMeta, custom_regex_meta: nativeCustomRegexMeta, @@ -1049,6 +1091,12 @@ const buildNativeStaticConfig = ({ if (countryData) { nativeConfig.country_data = countryData; } + if (hotwordRules.length > 0) { + nativeConfig.hotword_data = { + rules: hotwordRules.map(toNativeHotwordRule), + pattern_rule_indices: nativeHotwordPatternRuleIndices, + }; + } if (triggerRules.length > 0) { nativeConfig.trigger_data = { rules: 
triggerRules.map(toNativeTriggerRule), @@ -1089,6 +1137,26 @@ const toNativeTriggerPattern = (pattern: string): NativeSearchPattern => ({ case_insensitive: true, }); +const toNativeHotwordPattern = (pattern: string): NativeSearchPattern => ({ + kind: "literal-with-options", + pattern, + case_insensitive: true, + whole_words: true, +}); + +const toNativeHotwordRule = (rule: HotwordRule): NativeHotwordRule => { + const result: NativeHotwordRule = { + target_labels: [...rule.targetLabels], + score_adjustment: rule.scoreAdjustment, + proximity_before: rule.proximityBefore, + proximity_after: rule.proximityAfter, + }; + if (rule.reclassifyTo !== undefined) { + result.reclassify_to = rule.reclassifyTo; + } + return result; +}; + const toNativeTriggerRule = (rule: TriggerRule): NativeTriggerRule => ({ trigger: rule.trigger, label: rule.label, diff --git a/packages/anonymize/src/filters/hotword-rules.ts b/packages/anonymize/src/filters/hotword-rules.ts index e4640579..b80e53c9 100644 --- a/packages/anonymize/src/filters/hotword-rules.ts +++ b/packages/anonymize/src/filters/hotword-rules.ts @@ -20,7 +20,7 @@ type HotwordRulesConfig = { // ── Lazy-loaded state ─────────────────────────────── -let rules: HotwordRule[] | null = null; +let rules: readonly HotwordRule[] | null = null; let search: { findIter: (text: string) => Match[] } | null = null; /** * Maps each TextSearch pattern index back to the @@ -28,14 +28,27 @@ let search: { findIter: (text: string) => Match[] } | null = null; * resolves all hotword hits to their rule. */ let patternToRule: number[] | null = null; +let ruleSetPromise: Promise | null = null; let initPromise: Promise | null = null; // ── Init ──────────────────────────────────────────── +export const loadHotwordRuleSet = (): Promise => { + if (ruleSetPromise !== null) return ruleSetPromise; + ruleSetPromise = import("../data/hotword-rules.json") + .then((mod) => { + const data: HotwordRulesConfig = mod.default ?? 
mod; + return data.rules; + }) + .catch((err) => { + ruleSetPromise = null; + throw err; + }); + return ruleSetPromise; +}; + const loadRules = async (): Promise => { - const mod = await import("../data/hotword-rules.json"); - const data: HotwordRulesConfig = mod.default ?? mod; - const loaded = data.rules; + const loaded = await loadHotwordRuleSet(); // Build a flat pattern list and the reverse map. const patterns: PatternEntry[] = []; @@ -102,11 +115,20 @@ export const expandLabelsForHotwordRules = ( if (rules === null || requestedLabels.length === 0) { return requestedLabels; } + return expandLabelsForHotwordRuleSet(requestedLabels, rules); +}; +export const expandLabelsForHotwordRuleSet = ( + requestedLabels: readonly string[], + ruleSet: readonly HotwordRule[], +): readonly string[] => { + if (requestedLabels.length === 0) { + return requestedLabels; + } const requested = new Set(requestedLabels); const expanded = new Set(requestedLabels); - for (const rule of rules) { + for (const rule of ruleSet) { if (rule.reclassifyTo === undefined || !requested.has(rule.reclassifyTo)) { continue; } diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts index 1812e33d..22824285 100644 --- a/packages/anonymize/src/native-pipeline.ts +++ b/packages/anonymize/src/native-pipeline.ts @@ -19,8 +19,7 @@ export type NativePipelineUnsupportedFeature = | "enableNer" | "enableNameCorpus" | "enableCoreference" - | "enableZoneClassification" - | "enableHotwordRules"; + | "enableZoneClassification"; export type NativePipelineCompatibility = | { status: "supported" } @@ -85,9 +84,6 @@ export const getNativePipelineCompatibility = ( if (config.enableZoneClassification === true) { unsupportedFeatures.push("enableZoneClassification"); } - if (config.enableHotwordRules === true) { - unsupportedFeatures.push("enableHotwordRules"); - } if (unsupportedFeatures.length === 0) { return { status: "supported" }; From 
feb9faeee42b912bc0588993abdd4b97f9cbee4e Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 00:17:58 +0200 Subject: [PATCH 055/130] fix: tighten native pipeline parity --- crates/anonymize-core/src/prepared.rs | 113 ++++++++++++------ crates/anonymize-core/tests/prepared.rs | 32 +++++ .../__test__/native-adapter-parity.test.ts | 95 ++++++++++++++- .../src/__test__/pipeline-config.test.ts | 23 ++++ .../anonymize/src/build-unified-search.ts | 10 +- packages/anonymize/src/native-pipeline.ts | 15 ++- 6 files changed, 247 insertions(+), 41 deletions(-) diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 0e8835ab..c44889f2 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -2,6 +2,7 @@ use std::time::Instant; use crate::address_seeds::{AddressSeedData, PreparedAddressSeedData}; use crate::artifact_bytes::{ArtifactReader, ArtifactWriter}; +use crate::byte_offsets::ByteOffsets; use crate::dates::{DateData, PreparedDateData}; use crate::diagnostics::{DiagnosticStage, StaticRedactionDiagnostics}; use crate::false_positives::filter_entity_false_positives; @@ -841,10 +842,11 @@ impl PreparedSearch { )?; let raw_entities = filter_entities_for_redaction( pre_threshold_entities, + full_text, self.threshold, self.confidence_boost, &self.allowed_labels, - ); + )?; let merge_start = Instant::now(); let merged = merge_and_dedup(&raw_entities); if let Some(diagnostics) = &mut diagnostics { @@ -950,15 +952,16 @@ fn filter_entities_for_config( fn filter_entities_for_redaction( entities: Vec, + full_text: &str, threshold: f64, confidence_boost: bool, allowed_labels: &[String], -) -> Vec { +) -> Result> { let entities = filter_entities_for_labels(entities, allowed_labels); if confidence_boost { - return boost_near_miss_entities(entities, threshold); + return boost_near_miss_entities(entities, full_text, threshold); } - filter_entities_for_threshold(entities, threshold) + 
Ok(filter_entities_for_threshold(entities, threshold)) } fn filter_entities_for_labels( @@ -986,45 +989,85 @@ fn filter_entities_for_threshold( fn boost_near_miss_entities( entities: Vec, + full_text: &str, threshold: f64, -) -> Vec { +) -> Result> { let near_miss_floor = f64::max(0.0, threshold - NEAR_MISS_BAND); + let byte_offsets = ByteOffsets::new(full_text); + let text_offsets = TextOffsetMap::new(full_text); let anchors = entities .iter() .filter(|entity| entity.score >= HIGH_CONFIDENCE_FLOOR) - .map(entity_midpoint) - .collect::>(); + .map(|entity| entity_midpoint(entity, &byte_offsets, &text_offsets)) + .collect::>>()?; - entities - .into_iter() - .filter_map(|mut entity| { - if entity.score >= threshold { - return Some(entity); - } - if entity.score < near_miss_floor { - return None; - } - - let midpoint = entity_midpoint(&entity); - let neighbours = anchors - .iter() - .filter(|anchor| (midpoint - **anchor).abs() <= CONTEXT_WINDOW_CHARS) - .count(); - let neighbour_count = u32::try_from(neighbours).unwrap_or(u32::MAX); - let boosted_score = - f64::from(neighbour_count).mul_add(BOOST_PER_NEIGHBOUR, entity.score); - if boosted_score < threshold { - return None; - } - - entity.score = f64::min(1.0, boosted_score); - Some(entity) - }) - .collect() + let mut boosted = Vec::with_capacity(entities.len()); + for mut entity in entities { + if entity.score >= threshold { + boosted.push(entity); + continue; + } + if entity.score < near_miss_floor { + continue; + } + + let midpoint = entity_midpoint(&entity, &byte_offsets, &text_offsets)?; + let neighbours = anchors + .iter() + .filter(|anchor| (midpoint - **anchor).abs() <= CONTEXT_WINDOW_CHARS) + .count(); + let neighbour_count = u32::try_from(neighbours).unwrap_or(u32::MAX); + let boosted_score = + f64::from(neighbour_count).mul_add(BOOST_PER_NEIGHBOUR, entity.score); + if boosted_score < threshold { + continue; + } + + entity.score = f64::min(1.0, boosted_score); + boosted.push(entity); + } + + Ok(boosted) +} + 
+fn entity_midpoint( + entity: &PipelineEntity, + byte_offsets: &ByteOffsets<'_>, + text_offsets: &TextOffsetMap, +) -> Result { + let start = text_offsets.offset_for(byte_offsets, entity.start)?; + let end = text_offsets.offset_for(byte_offsets, entity.end)?; + Ok(f64::midpoint(start, end)) } -fn entity_midpoint(entity: &PipelineEntity) -> f64 { - f64::midpoint(f64::from(entity.start), f64::from(entity.end)) +struct TextOffsetMap { + byte_offsets: Vec, +} + +impl TextOffsetMap { + fn new(full_text: &str) -> Self { + let mut byte_offsets = full_text + .char_indices() + .map(|(byte_offset, _)| byte_offset) + .collect::>(); + byte_offsets.push(full_text.len()); + Self { byte_offsets } + } + + fn offset_for( + &self, + byte_offsets: &ByteOffsets<'_>, + offset: u32, + ) -> Result { + let byte_offset = byte_offsets.validate_offset(offset)?; + let index = self + .byte_offsets + .binary_search(&byte_offset) + .map_err(|_| Error::ByteOffsetInsideCodepoint { offset })?; + let index = u32::try_from(index) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX })?; + Ok(f64::from(index)) + } } fn record_static_entity_diagnostics( diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index db0f196c..3962f323 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -1073,6 +1073,38 @@ fn prepared_search_boosts_near_miss_entities_when_enabled() { ); } +#[test] +fn prepared_search_boost_counts_text_offsets_not_bytes() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![ + SearchPattern::Regex(String::from(r"\bANCHOR-\d+\b")), + SearchPattern::Regex(String::from(r"\bNEAR-\d+\b")), + ], + threshold: 0.5, + confidence_boost: true, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 2 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![ + RegexMatchMeta::new("registration number", 0.95), + RegexMatchMeta::new("matter id", 0.45), 
+ ], + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + let full_text = format!("ANCHOR-123 {} NEAR-456.", "á".repeat(120)); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert_eq!(result.resolved_entities.len(), 2); + assert_eq!(result.resolved_entities[0].text, "ANCHOR-123"); + assert_eq!(result.resolved_entities[1].text, "NEAR-456"); + assert!((result.resolved_entities[1].score - 0.5).abs() < f64::EPSILON); +} + #[test] fn prepared_search_applies_hotword_reclassification_before_threshold() { let prepared = PreparedSearch::new(PreparedSearchConfig { diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 4b8d61ee..f4f9249b 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -857,7 +857,7 @@ describe("native adapter parity", () => { enableCoreference: false, enableHotwordRules: false, enableZoneClassification: false, - labels: [...DEFAULT_ENTITY_LABELS, "matter id"], + labels: ["organization", "date", "person", "matter id"], workspaceId: "native-pipeline-static-test", }; const gazetteerEntries = [ @@ -933,7 +933,7 @@ describe("native adapter parity", () => { }, { pattern: "\\bNEAR-\\d+\\b", label: "matter id", score: 0.45 }, ], - labels: [...DEFAULT_ENTITY_LABELS, "matter id"], + labels: ["registration number", "matter id"], workspaceId: "native-pipeline-confidence-boost-test", }; @@ -974,6 +974,72 @@ describe("native adapter parity", () => { }); }); + test("native pipeline package matches trigger-only legal suffix reclassification", async () => { + const adapters = getAdapters(); + const fullText = "jednatelem Novák Partners s.r.o. 
na základě plné moci."; + const config: PipelineConfig = { + threshold: 0.3, + enableTriggerPhrases: true, + enableRegex: false, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["organization"], + workspaceId: "native-pipeline-trigger-suffix-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + label: "organization", + text: expect.stringContaining("s.r.o."), + source: "trigger", + }), + ]), + ); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + test("native pipeline package matches TS hotword reclassification", async () => { const adapters = getAdapters(); const fullText = "narozen dne 12.03.1990 v Praze"; @@ -1064,10 +1130,35 @@ describe("native adapter parity", () => { "enableNameCorpus", "enableCoreference", "enableZoneClassification", + "addressContextPasses", ], }); }); + test("native pipeline 
compatibility rejects address context passes", () => { + const config: PipelineConfig = { + threshold: 0.85, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["address"], + workspaceId: "native-pipeline-address-context-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "unsupported", + unsupportedFeatures: ["addressContextPasses"], + }); + }); + test("native facade and Python match on contract fixture packages", async () => { const adapters = getAdapters(); for (const language of CONTRACT_FIXTURE_LANGUAGES) { diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index 6b6c7bca..0c5b6a12 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -157,6 +157,29 @@ describe("pipeline config semantics", () => { ); }); + test("native trigger config carries legal suffix data without legal-form search", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableTriggerPhrases: true, + enableLegalForms: false, + labels: ["organization"], + }, + [], + createPipelineContext(), + ); + + const legalFormsSlice = search.nativeStaticConfig.slices.legal_forms; + expect(legalFormsSlice).toBeDefined(); + expect(legalFormsSlice?.end).toBe(legalFormsSlice?.start); + expect( + search.nativeStaticConfig.legal_form_data?.suffixes.length, + ).toBeGreaterThan(0); + expect( + search.nativeStaticConfig.trigger_data?.rules.length, + ).toBeGreaterThan(0); + }); + test("native config keeps unsupported validator regexes fail-fast", async () => { const search = await buildUnifiedSearch( { diff --git 
a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index d86c45bc..1d28624a 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -464,17 +464,21 @@ const buildUnifiedSearchSources = async ( const nativeLegalFormPatterns = legalFormsEnabled ? [...getKnownLegalSuffixes()] : []; + const nativeLegalFormSuffixes = + legalFormsEnabled || config.enableTriggerPhrases + ? [...getKnownLegalSuffixes()] + : []; const nativeLegalFormData = - nativeLegalFormPatterns.length > 0 + nativeLegalFormSuffixes.length > 0 ? { - suffixes: nativeLegalFormPatterns, + suffixes: nativeLegalFormSuffixes, normalized_boundary_suffixes: [ ...getNormalizedLegalBoundarySuffixesSync(), ], normalized_in_name_words: [ ...getNormalizedInNameLegalFormWordsSync(), ], - normalized_suffix_words: nativeLegalFormPatterns + normalized_suffix_words: nativeLegalFormSuffixes .map((suffix) => suffix.replaceAll(/[.,\s]/g, "").toLowerCase()) .filter((suffix) => suffix.length > 0), role_heads: [...getLegalRoleHeadsSync()], diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts index 22824285..e2cbcd30 100644 --- a/packages/anonymize/src/native-pipeline.ts +++ b/packages/anonymize/src/native-pipeline.ts @@ -19,7 +19,8 @@ export type NativePipelineUnsupportedFeature = | "enableNer" | "enableNameCorpus" | "enableCoreference" - | "enableZoneClassification"; + | "enableZoneClassification" + | "addressContextPasses"; export type NativePipelineCompatibility = | { status: "supported" } @@ -84,6 +85,9 @@ export const getNativePipelineCompatibility = ( if (config.enableZoneClassification === true) { unsupportedFeatures.push("enableZoneClassification"); } + if (addressContextPassesCanAffectOutput(config)) { + unsupportedFeatures.push("addressContextPasses"); + } if (unsupportedFeatures.length === 0) { return { status: "supported" }; @@ -91,6 +95,15 @@ export const 
getNativePipelineCompatibility = ( return { status: "unsupported", unsupportedFeatures }; }; +const addressContextPassesCanAffectOutput = (config: PipelineConfig): boolean => + labelIsEnabled(config.labels, "address") && config.threshold <= 0.95; + +const labelIsEnabled = ( + labels: readonly string[] | undefined, + label: string, +): boolean => + labels === undefined || labels.length === 0 || labels.includes(label); + export const assertNativePipelineSupported = (config: PipelineConfig): void => { const compatibility = getNativePipelineCompatibility(config); if (compatibility.status === "supported") { From 5d5f014014b8a9195dc3c76479b1bd1576f064e6 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 00:26:08 +0200 Subject: [PATCH 056/130] fix: cover native cache and address seeds --- crates/anonymize-core/src/address_seeds.rs | 44 ++++++++++++- .../tests/address_seed_parity.rs | 64 ++++++++++++++++++- .../src/__test__/pipeline-config.test.ts | 35 ++++++++++ packages/anonymize/src/pipeline.ts | 3 + 4 files changed, 141 insertions(+), 5 deletions(-) diff --git a/crates/anonymize-core/src/address_seeds.rs b/crates/anonymize-core/src/address_seeds.rs index a08d749c..278c5713 100644 --- a/crates/anonymize-core/src/address_seeds.rs +++ b/crates/anonymize-core/src/address_seeds.rs @@ -366,7 +366,12 @@ impl PreparedAddressSeedData { cluster: &SeedCluster, existing_entities: &[PipelineEntity], ) -> Span { - let left_bound = nearest_left_non_address(cluster.start, existing_entities); + let left_bound = nearest_left_non_address( + full_text, + cluster.start, + existing_entities, + cluster_starts_with_street_type_word(cluster), + ); let left_pos = expand_left(full_text, cluster.start, left_bound); if !cluster.has_expandable_address_context() { return Span { @@ -861,14 +866,24 @@ fn score_cluster(cluster: &SeedCluster) -> f64 { } fn nearest_left_non_address( + full_text: &str, start: usize, existing_entities: &[PipelineEntity], + ignore_date_prefix: bool, ) -> usize 
{ existing_entities .iter() - .filter(|entity| non_address_label(&entity.label)) .filter_map(|entity| { + if !non_address_label(&entity.label) { + return None; + } let end = usize::try_from(entity.end).ok()?; + if ignore_date_prefix + && date_label(&entity.label) + && date_can_prefix_street_name(full_text, end, start) + { + return None; + } (end <= start).then_some(end) }) .max() @@ -910,6 +925,31 @@ fn non_address_label(label: &str) -> bool { ) } +fn date_label(label: &str) -> bool { + matches!(label, "date" | "date of birth") +} + +fn cluster_starts_with_street_type_word(cluster: &SeedCluster) -> bool { + cluster.seeds.iter().any(|seed| { + seed.start == cluster.start + && seed.kind == SeedType::StreetWord + && !seed.text.chars().any(|ch| ch.is_ascii_digit()) + }) +} + +fn date_can_prefix_street_name( + full_text: &str, + date_end: usize, + street_start: usize, +) -> bool { + if date_end > street_start { + return false; + } + full_text.get(date_end..street_start).is_some_and(|gap| { + !gap.contains('\n') && gap.chars().all(char::is_whitespace) + }) +} + fn expand_left(full_text: &str, start: usize, left_bound: usize) -> usize { let mut left_pos = start; while left_pos > left_bound { diff --git a/crates/anonymize-core/tests/address_seed_parity.rs b/crates/anonymize-core/tests/address_seed_parity.rs index 37bf3032..b22e01d6 100644 --- a/crates/anonymize-core/tests/address_seed_parity.rs +++ b/crates/anonymize-core/tests/address_seed_parity.rs @@ -1,9 +1,9 @@ #![allow(clippy::expect_used)] use stella_anonymize_core::{ - AddressSeedData, LiteralSearchOptions, OperatorConfig, PatternSlice, - PreparedSearch, PreparedSearchConfig, PreparedSearchSlices, SearchOptions, - SearchPattern, + AddressSeedData, DenyListFilterData, DenyListMatchData, LiteralSearchOptions, + OperatorConfig, PatternSlice, PreparedSearch, PreparedSearchConfig, + PreparedSearchSlices, RegexMatchMeta, SearchOptions, SearchPattern, }; fn empty_config(slices: PreparedSearchSlices) -> 
PreparedSearchConfig { @@ -107,3 +107,61 @@ fn detects_cue_gated_br_cep_address_seed() { ); assert!(!result.redaction.redacted_text.contains("01001-000")); } + +#[test] +fn keeps_date_like_street_name_in_address_seed_span() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from("May 15"))], + regex_meta: vec![RegexMatchMeta::new("date", 0.9)], + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("London"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Street"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + deny_list: PatternSlice { start: 0, end: 1 }, + street_types: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("London")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .expect("address seed data should prepare"); + + let result = prepared + .redact_static_entities( + "Notices go to May 15 Street, London 12345.", + &OperatorConfig::default(), + ) + .expect("static redaction should succeed"); + + assert!( + address_texts(&result).contains(&"May 15 Street, London 12345"), + "resolved address entities: {:?}; address seed entities: {:?}", + result.resolved_entities, + result.detections.address_seed_entities, + ); + assert!(!result.redaction.redacted_text.contains("May 15 
Street")); +} diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index 0c5b6a12..6ca7df5a 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -386,6 +386,41 @@ describe("pipeline config semantics", () => { expect(second).not.toBe(first); }); + test("preparePipelineSearch cache keys native redaction options", async () => { + const context = createPipelineContext(); + const baseConfig = { + ...BASE_CONFIG, + enableRegex: true, + labels: ["date of birth"], + }; + + const first = await preparePipelineSearch({ + config: { + ...baseConfig, + threshold: 0.5, + enableConfidenceBoost: false, + enableHotwordRules: false, + }, + context, + }); + const second = await preparePipelineSearch({ + config: { + ...baseConfig, + threshold: 0.93, + enableConfidenceBoost: true, + enableHotwordRules: true, + }, + context, + }); + + expect(second).not.toBe(first); + expect(second.nativeStaticConfig.threshold).toBe(0.93); + expect(second.nativeStaticConfig.confidence_boost).toBe(true); + expect( + second.nativeStaticConfig.hotword_data?.rules.length, + ).toBeGreaterThan(0); + }); + test("enableLegalForms flag gates legal-form detection", async () => { const withFlag = await detect("Acme s.r.o.", { enableLegalForms: true, diff --git a/packages/anonymize/src/pipeline.ts b/packages/anonymize/src/pipeline.ts index 0dd0bc53..c857d7ba 100644 --- a/packages/anonymize/src/pipeline.ts +++ b/packages/anonymize/src/pipeline.ts @@ -895,6 +895,9 @@ const configKey = ( `${config.enableNameCorpus}:` + `${config.nameCorpusLanguages?.toSorted().join(",") ?? ""}:` + `${config.enableRegex}:` + + `${config.threshold}:` + + `${config.enableConfidenceBoost}:` + + `${config.enableHotwordRules === true}:` + `${config.labels.toSorted().join(",")}:` + `${config.denyListCountries?.toSorted().join(",") ?? 
""}:` + `${config.denyListRegions?.toSorted().join(",") ?? ""}:` + From 1e126bd1c396facf54b6b7bdfbd23da387d13aa5 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 00:51:38 +0200 Subject: [PATCH 057/130] fix: port address context to native core --- crates/anonymize-adapter-contract/src/lib.rs | 52 +- crates/anonymize-core/src/address_context.rs | 478 ++++++++++++++++++ crates/anonymize-core/src/diagnostics.rs | 1 + crates/anonymize-core/src/lib.rs | 2 + crates/anonymize-core/src/prepared.rs | 92 +++- .../tests/address_seed_parity.rs | 1 + .../tests/false_positive_parity.rs | 1 + crates/anonymize-core/tests/prepared.rs | 101 +++- crates/anonymize-core/tests/trigger_parity.rs | 1 + .../__test__/native-adapter-parity.test.ts | 71 ++- .../anonymize/src/build-unified-search.ts | 19 + .../anonymize/src/data/address-context.json | 72 +++ .../anonymize/src/filters/confidence-boost.ts | 153 +++--- packages/anonymize/src/native-pipeline.ts | 16 +- 14 files changed, 913 insertions(+), 147 deletions(-) create mode 100644 crates/anonymize-core/src/address_context.rs create mode 100644 packages/anonymize/src/data/address-context.json diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index c9733c3a..631ae214 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -3,15 +3,15 @@ use std::collections::{BTreeMap, BTreeSet}; use serde::{Deserialize, Serialize}; use stella_anonymize_core::{ - AddressSeedData, AmountWordsData, CountryMatchData, CurrencyData, DateData, - DenyListFilterData, DenyListMatchData, DetectionSource, DiagnosticEvent, - DiagnosticEventKind, DiagnosticStage, FuzzySearchOptions, GazetteerMatchData, - HotwordRule, HotwordRuleData, LegalFormData, LiteralSearchOptions, - MagnitudeSuffixData, MonetaryData, OperatorConfig, OperatorType, - PatternSlice, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, - RegexSearchOptions, SearchEngine, 
SearchOptions, SearchPattern, - ShareQuantityTermData, SigningPlaceGuardData, SourceDetail, - StaticRedactionDiagnosticResult, StaticRedactionDiagnostics, + AddressContextData, AddressSeedData, AmountWordsData, CountryMatchData, + CurrencyData, DateData, DenyListFilterData, DenyListMatchData, + DetectionSource, DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, + FuzzySearchOptions, GazetteerMatchData, HotwordRule, HotwordRuleData, + LegalFormData, LiteralSearchOptions, MagnitudeSuffixData, MonetaryData, + OperatorConfig, OperatorType, PatternSlice, PreparedSearchConfig, + PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchEngine, + SearchOptions, SearchPattern, ShareQuantityTermData, SigningPlaceGuardData, + SourceDetail, StaticRedactionDiagnosticResult, StaticRedactionDiagnostics, StaticRedactionResult, StringGroups, TriggerData, TriggerRule, TriggerStrategy, TriggerValidation, WrittenAmountPatternData, }; @@ -19,13 +19,13 @@ use stella_anonymize_core::{ pub type Result = std::result::Result; const PREPARED_SEARCH_PACKAGE_HEADER: [u8; 8] = *b"ANONPKG1"; -const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 6; +const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 7; const PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONPKZ1"; -const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 4; +const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 5; const PREPARED_SEARCH_CORE_PACKAGE_HEADER: [u8; 8] = *b"ANONCPK1"; -const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 5; +const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 6; const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONCPZ1"; -const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 5; +const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 6; const PREPARED_SEARCH_PACKAGE_DIGEST_BYTES: usize = 32; const PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL: i32 = 3; const MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES: usize = 256 * 1024 * 1024; @@ -342,6 +342,18 @@ pub struct 
BindingAddressSeedData { pub br_cep_cue_words: Vec, } +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingAddressContextData { + #[serde(default)] + pub address_prepositions: Vec, + #[serde(default)] + pub temporal_prepositions: Vec, + #[serde(default)] + pub street_abbreviations: Vec, + #[serde(default)] + pub bare_house_stopwords: Vec, +} + #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] pub struct BindingDenyListMatchData { #[serde(default)] @@ -445,6 +457,8 @@ pub struct BindingPreparedSearchConfig { #[serde(default)] pub address_seed_data: Option, #[serde(default)] + pub address_context_data: Option, + #[serde(default)] pub date_data: Option, #[serde(default)] pub monetary_data: Option, @@ -502,6 +516,7 @@ struct BinaryPreparedSearchConfig { trigger_data: Option, legal_form_data: Option, address_seed_data: Option, + address_context_data: Option, date_data: Option, monetary_data: Option, } @@ -702,6 +717,7 @@ impl From for BinaryPreparedSearchConfig { trigger_data: config.trigger_data.map(BinaryTriggerData::from), legal_form_data: config.legal_form_data, address_seed_data: config.address_seed_data, + address_context_data: config.address_context_data, date_data: config.date_data, monetary_data: config.monetary_data, } @@ -732,6 +748,7 @@ impl From for BindingPreparedSearchConfig { trigger_data: config.trigger_data.map(BindingTriggerData::from), legal_form_data: config.legal_form_data, address_seed_data: config.address_seed_data, + address_context_data: config.address_context_data, date_data: config.date_data, monetary_data: config.monetary_data, } @@ -1182,6 +1199,14 @@ pub fn prepared_search_config_from_binding( boundary_words: data.boundary_words, br_cep_cue_words: data.br_cep_cue_words, }), + address_context_data: config.address_context_data.map(|data| { + AddressContextData { + address_prepositions: data.address_prepositions, + temporal_prepositions: data.temporal_prepositions, + 
street_abbreviations: data.street_abbreviations, + bare_house_stopwords: data.bare_house_stopwords, + } + }), date_data: config.date_data.map(|data| DateData { month_names_by_language: data.month_names_by_language, year_words_by_language: data.year_words_by_language, @@ -2097,6 +2122,7 @@ fn diagnostic_stage_name(stage: DiagnosticStage) -> String { DiagnosticStage::EntitySignature => "entity.signature", DiagnosticStage::EntityLegalForm => "entity.legal-form", DiagnosticStage::EntityAddressSeed => "entity.address-seed", + DiagnosticStage::EntityAddressContext => "entity.address-context", DiagnosticStage::Merge => "resolution.merge", DiagnosticStage::Boundary => "resolution.boundary", DiagnosticStage::Sanitize => "resolution.sanitize", diff --git a/crates/anonymize-core/src/address_context.rs b/crates/anonymize-core/src/address_context.rs new file mode 100644 index 00000000..96c54e70 --- /dev/null +++ b/crates/anonymize-core/src/address_context.rs @@ -0,0 +1,478 @@ +use std::collections::BTreeSet; + +use regex::Regex; + +use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; +use crate::types::{Error, Result}; + +const HEADER_ZONE_PERCENT: u32 = 15; +const STREET_CONTEXT_WINDOW: u32 = 200; +const BARE_HOUSE_CONTEXT_WINDOW: u32 = 50; +const MAX_BACKWARD_WORDS: usize = 5; + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct AddressContextData { + #[serde(default)] + pub address_prepositions: Vec, + #[serde(default)] + pub temporal_prepositions: Vec, + #[serde(default)] + pub street_abbreviations: Vec, + #[serde(default)] + pub bare_house_stopwords: Vec, +} + +pub(crate) struct PreparedAddressContextData { + address_prepositions: BTreeSet, + temporal_prepositions: BTreeSet, + street_abbreviations: BTreeSet, + bare_house_stopwords: BTreeSet, + slash_house_number: Regex, + bare_house_number: Regex, + orphan_street_line: Regex, +} + +struct WordBefore { + start: usize, + raw: String, + normalized: 
String, + has_dot: bool, +} + +impl PreparedAddressContextData { + pub(crate) fn new(data: AddressContextData) -> Result { + Ok(Self { + address_prepositions: lowercased_set(data.address_prepositions), + temporal_prepositions: lowercased_set(data.temporal_prepositions), + street_abbreviations: lowercased_set(data.street_abbreviations), + bare_house_stopwords: data.bare_house_stopwords.into_iter().collect(), + slash_house_number: compile_regex( + "address_context.slash_house_number", + r"(?u)\b(?:\d{1,4}/\d+[A-Za-z]\b|\d{3,4}/\d+\b|(?:1[3-9]|[2-9]\d)/\d{3,}\b)", + )?, + bare_house_number: compile_regex( + "address_context.bare_house_number", + r"(?u)(?:^|\s)(?P\p{Lu}\p{Ll}[\p{Ll}\p{Lu}]+\s+\d{1,3})\b", + )?, + orphan_street_line: compile_regex( + "address_context.orphan_street_line", + r"(?um)^[^\S\n]*(?P\p{Lu}[\p{Ll}\p{Lu}]+(?:[^\S\n]+[\p{Lu}\p{Ll}][\p{Ll}]+)*[^\S\n]+\d{2,4}[A-Za-z]?)[^\S\n]*$", + )?, + }) + } + + pub(crate) fn process( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + let mut results = self + .detect_street_patterns_near_addresses(full_text, existing_entities)?; + let mut orphan_context = + Vec::with_capacity(existing_entities.len().saturating_add(results.len())); + orphan_context.extend_from_slice(existing_entities); + orphan_context.extend(results.iter().cloned()); + results + .extend(self.detect_orphan_street_lines(full_text, &orphan_context)?); + Ok(results) + } + + fn detect_street_patterns_near_addresses( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + let mut results = Vec::new(); + let address_entities = existing_entities + .iter() + .filter(|entity| entity.label == "address") + .filter(|entity| !is_caller_owned_entity(entity)) + .collect::>(); + let header_end = header_end(full_text); + + for found in self.slash_house_number.find_iter(full_text) { + let num_start = usize_to_u32("address_context.num_start", found.start())?; + let num_end = 
usize_to_u32("address_context.num_end", found.end())?; + if covered_by(existing_entities, num_start, num_end) { + continue; + } + + let in_header = num_start < header_end; + let near_address = address_entities.iter().any(|entity| { + entity.start.abs_diff(num_end) < STREET_CONTEXT_WINDOW + || entity.end.abs_diff(num_start) < STREET_CONTEXT_WINDOW + }); + if !in_header && !near_address { + continue; + } + + let Some(scan_start) = skip_whitespace_back(full_text, found.start()) + else { + continue; + }; + let Some((street_start, has_temporal_prep)) = + self.scan_street_start(full_text, scan_start)? + else { + continue; + }; + let street_start_u32 = + usize_to_u32("address_context.street_start", street_start)?; + if has_temporal_prep { + continue; + } + if covered_by(existing_entities, street_start_u32, num_end) { + continue; + } + + let street_text = text_slice(full_text, street_start_u32, num_end)?; + if street_text.len() < 4 { + continue; + } + let score = address_context_score(full_text, street_start, in_header); + results.push(PipelineEntity::detected( + street_start_u32, + num_end, + "address", + street_text, + score, + DetectionSource::Regex, + )); + } + + self.detect_bare_house_numbers( + full_text, + existing_entities, + &mut results, + )?; + Ok(results) + } + + fn scan_street_start( + &self, + full_text: &str, + mut scan_pos: usize, + ) -> Result> { + let mut has_temporal_prep = false; + let mut street_start = scan_pos; + let mut word_count = 0usize; + + while word_count < MAX_BACKWARD_WORDS { + let Some(word) = word_before(full_text, scan_pos)? 
else { + break; + }; + if word.normalized.is_empty() { + break; + } + + let is_street_abbrev = word.has_dot + && self.street_abbreviations.contains(&word.raw.to_lowercase()); + let lower_word = word.normalized.to_lowercase(); + let is_prep = self.address_prepositions.contains(&lower_word); + let is_upper = word + .normalized + .chars() + .next() + .is_some_and(char::is_uppercase); + let is_digit_token = is_short_ascii_digit_token(&word.normalized); + if !is_upper && !is_prep && !is_street_abbrev && !is_digit_token { + break; + } + if is_prep && self.temporal_prepositions.contains(&lower_word) { + has_temporal_prep = true; + } + + street_start = word.start; + word_count = word_count.saturating_add(1); + + let before_word = skip_whitespace_back(full_text, word.start); + let Some(next_scan_pos) = before_word else { + break; + }; + let Some((_, previous)) = previous_char(full_text, next_scan_pos) else { + break; + }; + if matches!(previous, '\n' | '\t' | ';' | ',') { + break; + } + scan_pos = next_scan_pos; + } + + if word_count == 0 { + return Ok(None); + } + Ok(Some((street_start, has_temporal_prep))) + } + + fn detect_bare_house_numbers( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + results: &mut Vec, + ) -> Result<()> { + for captures in self.bare_house_number.captures_iter(full_text) { + let Some(captured) = captures.name("value") else { + continue; + }; + let start = usize_to_u32("address_context.bare_start", captured.start())?; + let end = usize_to_u32("address_context.bare_end", captured.end())?; + if !near_confirmed_address_same_line( + full_text, + existing_entities, + results, + start, + end, + )? 
{ + continue; + } + + let word = captured.as_str().split_whitespace().next().unwrap_or(""); + if self.bare_house_stopwords.contains(word) { + continue; + } + if overlaps_any(existing_entities, start, end) + || overlaps_any(results, start, end) + { + continue; + } + + results.push(PipelineEntity::detected( + start, + end, + "address", + captured.as_str(), + 0.75, + DetectionSource::Regex, + )); + } + Ok(()) + } + + fn detect_orphan_street_lines( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + let header_end = header_end(full_text); + let context_entities = existing_entities + .iter() + .filter(|entity| { + !(entity.label == "address" && is_caller_owned_entity(entity)) + }) + .collect::>(); + let mut results = Vec::new(); + + for captures in self.orphan_street_line.captures_iter(full_text) { + let Some(captured) = captures.name("value") else { + continue; + }; + let start = + usize_to_u32("address_context.orphan_start", captured.start())?; + let end = usize_to_u32("address_context.orphan_end", captured.end())?; + if start >= header_end || covered_by(existing_entities, start, end) { + continue; + } + let has_context = context_entities.iter().any(|entity| { + entity.start.abs_diff(end) < STREET_CONTEXT_WINDOW + || entity.end.abs_diff(start) < STREET_CONTEXT_WINDOW + }); + if !has_context { + continue; + } + + results.push(PipelineEntity::detected( + start, + end, + "address", + captured.as_str(), + 0.85, + DetectionSource::Regex, + )); + } + Ok(results) + } +} + +fn lowercased_set(values: Vec) -> BTreeSet { + values + .into_iter() + .map(|value| value.to_lowercase()) + .collect() +} + +fn compile_regex(field: &'static str, pattern: &str) -> Result { + Regex::new(pattern).map_err(|error| Error::InvalidStaticData { + field, + reason: error.to_string(), + }) +} + +fn header_end(full_text: &str) -> u32 { + let len = u32::try_from(full_text.len()).unwrap_or(u32::MAX); + len.saturating_mul(HEADER_ZONE_PERCENT).div_euclid(100) +} + 
+const fn is_caller_owned_entity(entity: &PipelineEntity) -> bool { + matches!( + entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) +} + +fn covered_by(entities: &[PipelineEntity], start: u32, end: u32) -> bool { + entities + .iter() + .any(|entity| entity.start <= start && entity.end >= end) +} + +fn overlaps_any(entities: &[PipelineEntity], start: u32, end: u32) -> bool { + entities + .iter() + .any(|entity| entity.start < end && entity.end > start) +} + +fn skip_whitespace_back(full_text: &str, mut pos: usize) -> Option { + while let Some((index, ch)) = previous_char(full_text, pos) { + if !is_space(ch) { + return Some(pos); + } + pos = index; + } + None +} + +fn previous_char(full_text: &str, pos: usize) -> Option<(usize, char)> { + full_text.get(..pos)?.char_indices().next_back() +} + +fn word_before(full_text: &str, pos: usize) -> Result> { + let Some((last_index, last_ch)) = previous_char(full_text, pos) else { + return Ok(None); + }; + let mut scan_pos = pos; + let has_dot = last_ch == '.'; + if has_dot { + scan_pos = last_index; + } + + let mut word_start = scan_pos; + while let Some((previous_index, previous_ch)) = + previous_char(full_text, word_start) + { + if !is_word_char(previous_ch) { + break; + } + word_start = previous_index; + } + + let raw = full_text + .get(word_start..pos) + .ok_or_else(|| Error::InvalidSpan { + start: u32::try_from(word_start).unwrap_or(u32::MAX), + end: u32::try_from(pos).unwrap_or(u32::MAX), + })? + .to_owned(); + let normalized = full_text + .get(word_start..scan_pos) + .ok_or_else(|| Error::InvalidSpan { + start: u32::try_from(word_start).unwrap_or(u32::MAX), + end: u32::try_from(scan_pos).unwrap_or(u32::MAX), + })? 
+ .to_owned(); + Ok(Some(WordBefore { + start: word_start, + raw, + normalized, + has_dot, + })) +} + +fn is_word_char(ch: char) -> bool { + ch.is_alphabetic() || ch.is_ascii_digit() || is_combining_mark(ch) +} + +const fn is_combining_mark(ch: char) -> bool { + matches!( + ch, + '\u{0300}'..='\u{036f}' + | '\u{1ab0}'..='\u{1aff}' + | '\u{1dc0}'..='\u{1dff}' + | '\u{20d0}'..='\u{20ff}' + | '\u{fe20}'..='\u{fe2f}' + ) +} + +const fn is_space(ch: char) -> bool { + ch.is_whitespace() || ch == '\u{00a0}' +} + +fn near_confirmed_address_same_line( + full_text: &str, + existing_entities: &[PipelineEntity], + results: &[PipelineEntity], + start: u32, + end: u32, +) -> Result { + for entity in existing_entities.iter().chain(results.iter()) { + if entity.label != "address" { + continue; + } + let dist = entity.start.abs_diff(end).min(entity.end.abs_diff(start)); + if dist > BARE_HOUSE_CONTEXT_WINDOW { + continue; + } + let lo = entity.start.min(start); + let hi = entity.end.max(end); + if !text_slice(full_text, lo, hi)?.contains('\n') { + return Ok(true); + } + } + Ok(false) +} + +fn is_short_ascii_digit_token(value: &str) -> bool { + let mut count = 0usize; + for ch in value.chars() { + if !ch.is_ascii_digit() { + return false; + } + count = count.saturating_add(1); + } + (1..=2).contains(&count) +} + +fn address_context_score( + full_text: &str, + street_start: usize, + in_header: bool, +) -> f64 { + let before_start = street_start.saturating_sub(5); + let has_colon = full_text + .get(before_start..street_start) + .is_some_and(|before| before.contains(':')); + if has_colon { + return 0.95; + } + if in_header { + return 0.85; + } + 0.8 +} + +fn text_slice(full_text: &str, start: u32, end: u32) -> Result<&str> { + let start_usize = usize::try_from(start) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: start })?; + let end_usize = usize::try_from(end) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: end })?; + full_text + .get(start_usize..end_usize) + 
.ok_or(Error::InvalidSpan { start, end }) +} + +fn usize_to_u32(field: &'static str, value: usize) -> Result { + u32::try_from(value).map_err(|_| Error::InvalidStaticData { + field, + reason: "span offset exceeds u32 range".to_owned(), + }) +} diff --git a/crates/anonymize-core/src/diagnostics.rs b/crates/anonymize-core/src/diagnostics.rs index 267bef8c..421a4f19 100644 --- a/crates/anonymize-core/src/diagnostics.rs +++ b/crates/anonymize-core/src/diagnostics.rs @@ -37,6 +37,7 @@ pub enum DiagnosticStage { EntitySignature, EntityLegalForm, EntityAddressSeed, + EntityAddressContext, Merge, Boundary, Sanitize, diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index 8116c5ce..6bac2206 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -2,6 +2,7 @@ //! Core anonymization contracts shared by host-language bindings. +mod address_context; mod address_seeds; mod anchored; mod artifact_bytes; @@ -24,6 +25,7 @@ mod triggers; mod types; mod validators; +pub use address_context::AddressContextData; pub use address_seeds::AddressSeedData; pub use dates::DateData; pub use diagnostics::{ diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index c44889f2..680c24b8 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -1,5 +1,6 @@ use std::time::Instant; +use crate::address_context::{AddressContextData, PreparedAddressContextData}; use crate::address_seeds::{AddressSeedData, PreparedAddressSeedData}; use crate::artifact_bytes::{ArtifactReader, ArtifactWriter}; use crate::byte_offsets::ByteOffsets; @@ -63,6 +64,7 @@ pub struct PreparedSearch { trigger_data: Option, legal_form_data: Option, address_seed_data: Option, + address_context_data: Option, date_data: Option, monetary_data: Option, } @@ -107,6 +109,8 @@ pub struct PreparedSearchConfig { pub trigger_data: Option, pub legal_form_data: Option, pub address_seed_data: Option, + 
#[serde(default)] + pub address_context_data: Option, pub date_data: Option, pub monetary_data: Option, } @@ -367,19 +371,13 @@ impl PreparedSearch { diagnostics.as_deref_mut(), )?; - let indexes = build_search_indexes( - SearchIndexBuildInputs { - regex_patterns: regex_groups.regex, - regex_options: config.regex_options, - custom_regex_patterns: config.custom_regex_patterns, - custom_regex_options: config.custom_regex_options, - legal_form_patterns: regex_groups.legal_forms, - trigger_patterns: promote_case_insensitive_literals( - regex_groups.triggers, - ), - literal_patterns: config.literal_patterns, - literal_options: config.literal_options, - }, + let indexes = build_search_indexes_for_config( + regex_groups, + config.regex_options, + config.custom_regex_patterns, + config.custom_regex_options, + config.literal_patterns, + config.literal_options, artifacts, )?; let ( @@ -441,6 +439,9 @@ impl PreparedSearch { .transpose()?, legal_form_data: config.legal_form_data.map(PreparedLegalFormData::new), address_seed_data: prepare_address_seed_data(config.address_seed_data)?, + address_context_data: prepare_address_context_data( + config.address_context_data, + )?, date_data, monetary_data, }) @@ -840,13 +841,25 @@ impl PreparedSearch { full_text, &detections.matches.literal, )?; - let raw_entities = filter_entities_for_redaction( + let mut raw_entities = filter_entities_for_redaction( pre_threshold_entities, full_text, self.threshold, self.confidence_boost, &self.allowed_labels, )?; + let address_context_start = Instant::now(); + let address_context_entities = + self.process_address_context_entities(full_text, &raw_entities)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_entities( + DiagnosticStage::EntityAddressContext, + &address_context_entities, + full_text, + Some(elapsed_us(address_context_start)), + ); + } + raw_entities.extend(address_context_entities); let merge_start = Instant::now(); let merged = merge_and_dedup(&raw_entities); if let 
Some(diagnostics) = &mut diagnostics { @@ -929,6 +942,20 @@ impl PreparedSearch { &self.allowed_labels, ) } + + fn process_address_context_entities( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + if !label_is_allowed("address", &self.allowed_labels) { + return Ok(Vec::new()); + } + let Some(data) = &self.address_context_data else { + return Ok(Vec::new()); + }; + data.process(full_text, existing_entities) + } } fn process_signature_entities(full_text: &str) -> TimedEntities { @@ -977,6 +1004,11 @@ fn filter_entities_for_labels( .collect() } +fn label_is_allowed(label: &str, allowed_labels: &[String]) -> bool { + allowed_labels.is_empty() + || allowed_labels.iter().any(|allowed| allowed == label) +} + fn filter_entities_for_threshold( entities: Vec, threshold: f64, @@ -1154,6 +1186,32 @@ fn elapsed_us(start: Instant) -> u64 { u64::try_from(micros).unwrap_or(u64::MAX) } +fn build_search_indexes_for_config( + regex_groups: RegexPatternGroups, + regex_options: SearchOptions, + custom_regex_patterns: Vec, + custom_regex_options: SearchOptions, + literal_patterns: Vec, + literal_options: SearchOptions, + artifacts: Option<&PreparedSearchArtifacts>, +) -> Result { + build_search_indexes( + SearchIndexBuildInputs { + regex_patterns: regex_groups.regex, + regex_options, + custom_regex_patterns, + custom_regex_options, + legal_form_patterns: regex_groups.legal_forms, + trigger_patterns: promote_case_insensitive_literals( + regex_groups.triggers, + ), + literal_patterns, + literal_options, + }, + artifacts, + ) +} + fn build_search_indexes( inputs: SearchIndexBuildInputs, artifacts: Option<&PreparedSearchArtifacts>, @@ -1336,6 +1394,12 @@ fn prepare_address_seed_data( data.map(PreparedAddressSeedData::new).transpose() } +fn prepare_address_context_data( + data: Option, +) -> Result> { + data.map(PreparedAddressContextData::new).transpose() +} + fn split_regex_patterns( patterns: Vec, slices: &PreparedSearchSlices, diff --git 
a/crates/anonymize-core/tests/address_seed_parity.rs b/crates/anonymize-core/tests/address_seed_parity.rs index b22e01d6..63d797fe 100644 --- a/crates/anonymize-core/tests/address_seed_parity.rs +++ b/crates/anonymize-core/tests/address_seed_parity.rs @@ -27,6 +27,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { trigger_data: None, legal_form_data: None, address_seed_data: None, + address_context_data: None, date_data: None, monetary_data: None, } diff --git a/crates/anonymize-core/tests/false_positive_parity.rs b/crates/anonymize-core/tests/false_positive_parity.rs index 86f6fa71..77de3cce 100644 --- a/crates/anonymize-core/tests/false_positive_parity.rs +++ b/crates/anonymize-core/tests/false_positive_parity.rs @@ -29,6 +29,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { trigger_data: None, legal_form_data: None, address_seed_data: None, + address_context_data: None, date_data: None, monetary_data: None, } diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 3962f323..5ec674c5 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -3,15 +3,15 @@ use std::collections::{BTreeMap, BTreeSet}; use stella_anonymize_core::{ - AddressSeedData, AmountWordsData, CountryMatchData, CurrencyData, DateData, - DenyListFilterData, DenyListMatchData, DetectionSource, DiagnosticEventKind, - DiagnosticStage, Error, FuzzySearchOptions, GazetteerMatchData, HotwordRule, - HotwordRuleData, LegalFormData, LiteralSearchOptions, MagnitudeSuffixData, - MonetaryData, OperatorConfig, PatternSlice, PreparedSearch, - PreparedSearchArtifacts, PreparedSearchConfig, PreparedSearchSlices, - RegexMatchMeta, RegexSearchOptions, SearchOptions, SearchPattern, - SourceDetail, TriggerData, TriggerRule, TriggerStrategy, TriggerValidation, - WrittenAmountPatternData, + AddressContextData, AddressSeedData, AmountWordsData, CountryMatchData, + CurrencyData, 
DateData, DenyListFilterData, DenyListMatchData, + DetectionSource, DiagnosticEventKind, DiagnosticStage, Error, + FuzzySearchOptions, GazetteerMatchData, HotwordRule, HotwordRuleData, + LegalFormData, LiteralSearchOptions, MagnitudeSuffixData, MonetaryData, + OperatorConfig, PatternSlice, PreparedSearch, PreparedSearchArtifacts, + PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, + RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, TriggerData, + TriggerRule, TriggerStrategy, TriggerValidation, WrittenAmountPatternData, }; fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { @@ -35,6 +35,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { trigger_data: None, legal_form_data: None, address_seed_data: None, + address_context_data: None, date_data: None, monetary_data: None, } @@ -107,6 +108,15 @@ fn legal_form_prepared_search(suffixes: Vec<&str>) -> PreparedSearch { .unwrap() } +fn address_context_data() -> AddressContextData { + AddressContextData { + address_prepositions: vec![String::from("na"), String::from("mezi")], + temporal_prepositions: vec![String::from("od"), String::from("do")], + street_abbreviations: vec![String::from("ul.")], + bare_house_stopwords: vec![String::from("Section")], + } +} + #[test] fn prepared_search_runs_legal_form_pass_on_normalized_text() { let prepared = legal_form_prepared_search(vec!["Pty Ltd"]); @@ -150,6 +160,7 @@ fn prepared_search_runs_normalized_literal_pass() { trigger_data: None, legal_form_data: None, address_seed_data: None, + address_context_data: None, date_data: None, monetary_data: None, }) @@ -163,6 +174,73 @@ fn prepared_search_runs_normalized_literal_pass() { assert_eq!(result.gazetteer_entities[0].text, "Acme\u{00a0}Corp"); } +#[test] +fn prepared_search_adds_slash_house_number_address_context() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bPraha 2\b"))], + regex_meta: 
vec![RegexMatchMeta::new("address", 1.0)], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Sídlo: Praha 2, Vinohradská 2512/2a", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + entity.label == "address" && entity.text.contains("Vinohradská 2512/2a") + })); +} + +#[test] +fn prepared_search_adds_orphan_header_street_line_context() { + let full_text = format!( + "ACME s.r.o.\nEvropská 710\n160 00 Praha\n{}", + "body ".repeat(200) + ); + let prepared = PreparedSearch::new(PreparedSearchConfig { + custom_regex_patterns: vec![SearchPattern::Regex(String::from( + r"ACME s\.r\.o\.", + ))], + custom_regex_meta: vec![RegexMatchMeta::new("organization", 1.0)], + slices: PreparedSearchSlices { + custom_regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("organization"), String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities_with_diagnostics( + &full_text, + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.result.resolved_entities.iter().any(|entity| { + entity.label == "address" && entity.text == "Evropská 710" + })); + assert!(result.diagnostics.events.iter().any(|event| { + event.stage == DiagnosticStage::EntityAddressContext + && event.kind == DiagnosticEventKind::StageSummary + && event.count == Some(1) + })); +} + #[test] fn prepared_search_artifacts_match_direct_prepare() { let config = PreparedSearchConfig { @@ -196,6 +274,7 @@ fn 
prepared_search_artifacts_match_direct_prepare() { trigger_data: None, legal_form_data: None, address_seed_data: None, + address_context_data: None, date_data: None, monetary_data: None, }; @@ -343,6 +422,7 @@ fn prepared_search_emits_static_detector_entities() { trigger_data: None, legal_form_data: None, address_seed_data: None, + address_context_data: None, date_data: None, monetary_data: None, }) @@ -976,6 +1056,7 @@ fn prepared_search_redacts_static_entities_end_to_end() { trigger_data: None, legal_form_data: None, address_seed_data: None, + address_context_data: None, date_data: None, monetary_data: None, }) @@ -1289,6 +1370,7 @@ fn prepared_search_reports_static_redaction_diagnostics() { trigger_data: None, legal_form_data: None, address_seed_data: None, + address_context_data: None, date_data: None, monetary_data: None, }) @@ -1371,6 +1453,7 @@ fn prepared_search_redacts_custom_deny_list_entities() { trigger_data: None, legal_form_data: None, address_seed_data: None, + address_context_data: None, date_data: None, monetary_data: None, }) diff --git a/crates/anonymize-core/tests/trigger_parity.rs b/crates/anonymize-core/tests/trigger_parity.rs index 75c1bffc..8c4517b3 100644 --- a/crates/anonymize-core/tests/trigger_parity.rs +++ b/crates/anonymize-core/tests/trigger_parity.rs @@ -27,6 +27,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { trigger_data: None, legal_form_data: None, address_seed_data: None, + address_context_data: None, date_data: None, monetary_data: None, } diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index f4f9249b..bf4deab9 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -908,6 +908,71 @@ describe("native adapter parity", () => { }); }); + test("native pipeline package matches TS address context redaction", async () => { + 
const adapters = getAdapters(); + const fullText = + "ACME s.r.o.\nEvropska 710\n160 00 Praha\n" + "body ".repeat(200); + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + customRegexes: [ + { pattern: "ACME s\\.r\\.o\\.", label: "organization", score: 1 }, + ], + labels: ["organization", "address"], + workspaceId: "native-pipeline-address-context-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context: createPipelineContext(), + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsContext = createPipelineContext(); + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ label: "address", text: "Evropska 710" }), + ]), + ); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + test("native pipeline package matches TS confidence boost redaction", async () => { const adapters = getAdapters(); const fullText = "ANCHOR-123 signed with NEAR-456."; @@ -1130,12 +1195,11 @@ describe("native adapter parity", () => { 
"enableNameCorpus", "enableCoreference", "enableZoneClassification", - "addressContextPasses", ], }); }); - test("native pipeline compatibility rejects address context passes", () => { + test("native pipeline compatibility accepts address context passes", () => { const config: PipelineConfig = { threshold: 0.85, enableTriggerPhrases: false, @@ -1154,8 +1218,7 @@ describe("native adapter parity", () => { }; expect(getNativePipelineCompatibility(config)).toEqual({ - status: "unsupported", - unsupportedFeatures: ["addressContextPasses"], + status: "supported", }); }); diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 1d28624a..ac96023c 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -70,6 +70,10 @@ import { loadHotwordRuleSet, type HotwordRule, } from "./filters/hotword-rules"; +import { + getAddressContextData, + type AddressContextData, +} from "./filters/confidence-boost"; import { getClauseNounHeadsSync, getConnectorProseHeadsSync, @@ -226,6 +230,7 @@ export type NativeDateData = { export type NativeMonetaryData = MonetaryData; export type NativeAddressSeedData = AddressSeedData; +export type NativeAddressContextData = AddressContextData; export type NativeGazetteerData = { labels: string[]; is_fuzzy: boolean[]; @@ -275,6 +280,7 @@ export type NativePreparedSearchConfig = { trigger_data?: NativeTriggerData; legal_form_data?: NativeLegalFormData; address_seed_data?: NativeAddressSeedData; + address_context_data?: NativeAddressContextData; date_data?: NativeDateData; monetary_data?: NativeMonetaryData; }; @@ -356,6 +362,7 @@ type UnifiedSearchSources = { nativeDateData: NativeDateData | null; nativeMonetaryData: NativeMonetaryData | null; nativeAddressSeedData: NativeAddressSeedData | null; + nativeAddressContextData: NativeAddressContextData | null; nativeSigningPatterns: readonly string[]; partyPositionTerms: string[]; hotwordRules: 
readonly HotwordRule[]; @@ -416,6 +423,7 @@ const buildUnifiedSearchSources = async ( yearWordData, monetaryData, addressSeedData, + addressContextData, ] = await Promise.all([ legalFormsEnabled || config.enableTriggerPhrases ? warmLegalRoleHeads() @@ -453,6 +461,9 @@ const buildUnifiedSearchSources = async ( labelIsAllowed("address", allowedLabels) ? getAddressSeedData() : Promise.resolve(null), + labelIsAllowed("address", allowedLabels) + ? Promise.resolve(getAddressContextData()) + : Promise.resolve(null), ]); // Read but never populated: the legal-form slice in the unified // search is permanently empty after the v2 rewrite. Tracking it @@ -707,6 +718,7 @@ const buildUnifiedSearchSources = async ( nativeDateData, nativeMonetaryData, nativeAddressSeedData: addressSeedData, + nativeAddressContextData: addressContextData, nativeSigningPatterns, partyPositionTerms, hotwordRules, @@ -756,6 +768,7 @@ export const buildNativeStaticSearchBundle = async ( dateData: sources.nativeDateData, monetaryData: sources.nativeMonetaryData, addressSeedData: sources.nativeAddressSeedData, + addressContextData: sources.nativeAddressContextData, nativeSigningPatterns: sources.nativeSigningPatterns, partyPositionTerms: sources.partyPositionTerms, hotwordRules: sources.hotwordRules, @@ -839,6 +852,7 @@ export const buildUnifiedSearch = async ( dateData: sources.nativeDateData, monetaryData: sources.nativeMonetaryData, addressSeedData: sources.nativeAddressSeedData, + addressContextData: sources.nativeAddressContextData, nativeSigningPatterns: sources.nativeSigningPatterns, partyPositionTerms: sources.partyPositionTerms, hotwordRules: sources.hotwordRules, @@ -887,6 +901,7 @@ type BuildNativeStaticConfigArgs = { dateData: NativeDateData | null; monetaryData: NativeMonetaryData | null; addressSeedData: NativeAddressSeedData | null; + addressContextData: NativeAddressContextData | null; nativeSigningPatterns: readonly string[]; partyPositionTerms: readonly string[]; hotwordRules: readonly 
HotwordRule[]; @@ -916,6 +931,7 @@ const buildNativeStaticConfig = ({ dateData, monetaryData, addressSeedData, + addressContextData, nativeSigningPatterns, partyPositionTerms, hotwordRules, @@ -1116,6 +1132,9 @@ const buildNativeStaticConfig = ({ if (addressSeedData) { nativeConfig.address_seed_data = addressSeedData; } + if (addressContextData) { + nativeConfig.address_context_data = addressContextData; + } if (dateData) { nativeConfig.date_data = dateData; } diff --git a/packages/anonymize/src/data/address-context.json b/packages/anonymize/src/data/address-context.json new file mode 100644 index 00000000..393a7981 --- /dev/null +++ b/packages/anonymize/src/data/address-context.json @@ -0,0 +1,72 @@ +{ + "_comment": "Address context guard words by language. These words can appear as '[Word] [number]' near address-like text in legal documents, but usually denote structure, payments, dates, or references rather than a bare street and house number.", + "bareHouseStopwords": { + "cs": [ + "Příloha", + "Smlouva", + "Článek", + "Dodatek", + "Celkem", + "Strana", + "Faktura", + "Částka", + "Položka", + "Kapitola", + "Zákon", + "Vyhláška", + "Nařízení", + "Usnesení", + "Rozsudek", + "Bod", + "Odstavec", + "Záloha", + "Zbývá", + "Dne", + "Platba", + "Datum", + "Splatnost", + "Variabilní", + "Konstantní", + "Specifický" + ], + "en": [ + "Section", + "Sections", + "Article", + "Articles", + "Schedule", + "Schedules", + "Exhibit", + "Exhibits", + "Annex", + "Annexes", + "Appendix", + "Appendices", + "Clause", + "Clauses", + "Chapter", + "Chapters", + "Paragraph", + "Paragraphs", + "Subsection", + "Subsections", + "Form", + "Page", + "Pages", + "Item", + "Items", + "Note", + "Notes", + "Rule", + "Rules", + "Attachment", + "Attachments", + "Volume", + "Volumes", + "Book", + "Books", + "Part", + "Parts" + ] + } +} diff --git a/packages/anonymize/src/filters/confidence-boost.ts b/packages/anonymize/src/filters/confidence-boost.ts index 9e383e2e..1cda53fe 100644 --- 
a/packages/anonymize/src/filters/confidence-boost.ts +++ b/packages/anonymize/src/filters/confidence-boost.ts @@ -1,82 +1,9 @@ +import addressContextJson from "../data/address-context.json"; +import addressPrepositionsJson from "../data/address-prepositions.json"; import addressStreetTypesJson from "../data/address-street-types.json"; import type { Entity } from "../types"; import { isCallerOwnedEntity } from "../util/entity-source"; -// Capitalised words that look like the start of an -// `[Uppercase] [number]` address (Czech: "Vinohradská 12") -// but in contract prose introduce a section, clause, or -// document reference instead ("Section 6", "Article 9"). -// Listed here so the `bareHouseRe` near-address scan does -// not promote them to address spans. Module-level to avoid -// allocation in a hot loop. -const BARE_STOPWORDS = new Set([ - // ── Czech ──────────────────────────────────────── - "Příloha", - "Smlouva", - "Článek", - "Dodatek", - "Celkem", - "Strana", - "Faktura", - "Částka", - "Položka", - "Kapitola", - "Zákon", - "Vyhláška", - "Nařízení", - "Usnesení", - "Rozsudek", - "Bod", - "Odstavec", - "Záloha", - "Zbývá", - "Dne", - "Platba", - "Datum", - "Splatnost", - "Variabilní", - "Konstantní", - "Specifický", - // ── English ────────────────────────────────────── - "Section", - "Sections", - "Article", - "Articles", - "Schedule", - "Schedules", - "Exhibit", - "Exhibits", - "Annex", - "Annexes", - "Appendix", - "Appendices", - "Clause", - "Clauses", - "Chapter", - "Chapters", - "Paragraph", - "Paragraphs", - "Subsection", - "Subsections", - "Form", - "Page", - "Pages", - "Item", - "Items", - "Note", - "Notes", - "Rule", - "Rules", - "Attachment", - "Attachments", - "Volume", - "Volumes", - "Book", - "Books", - "Part", - "Parts", -]); - const NEAR_MISS_BAND = 0.15; const BOOST_PER_NEIGHBOUR = 0.05; const CONTEXT_WINDOW_CHARS = 150; @@ -155,29 +82,61 @@ type PrepositionData = { temporal: Record; }; +type AddressContextJson = { + bareHouseStopwords: 
Record; +}; + +export type AddressContextData = { + address_prepositions: string[]; + temporal_prepositions: string[]; + street_abbreviations: string[]; + bare_house_stopwords: string[]; +}; + +const languageRecordValues = ( + record: Record, + transform: (value: string) => string = (value) => value, +): string[] => { + const values: string[] = []; + for (const [language, words] of Object.entries(record)) { + if (language.startsWith("_") || !Array.isArray(words)) { + continue; + } + for (const word of words) { + values.push(transform(word)); + } + } + return values; +}; + +const buildPrepositionSets = ( + data: PrepositionData, +): { + address: ReadonlySet; + temporal: ReadonlySet; +} => ({ + address: new Set( + languageRecordValues(data.address, (word) => word.toLowerCase()), + ), + temporal: new Set( + languageRecordValues(data.temporal, (word) => word.toLowerCase()), + ), +}); + +const buildBareStopwords = (data: AddressContextJson): ReadonlySet => + new Set(languageRecordValues(data.bareHouseStopwords)); + +const BARE_STOPWORDS = buildBareStopwords(addressContextJson); + let _addressPreps: ReadonlySet | null = null; let _temporalPreps: ReadonlySet | null = null; let _prepsPromise: Promise | null = null; const loadPrepositions = async (): Promise => { try { - const mod = await import("../data/address-prepositions.json"); - const data: PrepositionData = mod.default ?? 
mod; - // Merge all languages into flat sets - const addr = new Set(); - const temp = new Set(); - for (const words of Object.values(data.address)) { - if (Array.isArray(words)) { - for (const w of words) addr.add(w.toLowerCase()); - } - } - for (const words of Object.values(data.temporal)) { - if (Array.isArray(words)) { - for (const w of words) temp.add(w.toLowerCase()); - } - } - _addressPreps = addr; - _temporalPreps = temp; + const prepositions = buildPrepositionSets(addressPrepositionsJson); + _addressPreps = prepositions.address; + _temporalPreps = prepositions.temporal; } catch { _addressPreps = new Set(); _temporalPreps = new Set(); @@ -240,6 +199,16 @@ export const initStreetAbbrevs = (): Promise => { export const getStreetAbbrevs = (): ReadonlySet => _streetAbbrevs ?? new Set(); +export const getAddressContextData = (): AddressContextData => { + const prepositions = buildPrepositionSets(addressPrepositionsJson); + return { + address_prepositions: [...prepositions.address], + temporal_prepositions: [...prepositions.temporal], + street_abbreviations: [...buildStreetAbbrevs(addressStreetTypesJson)], + bare_house_stopwords: [...buildBareStopwords(addressContextJson)], + }; +}; + /** * Scan backwards from known address entities and * house number patterns to find street names. 
diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts index e2cbcd30..0b7f98ba 100644 --- a/packages/anonymize/src/native-pipeline.ts +++ b/packages/anonymize/src/native-pipeline.ts @@ -19,8 +19,7 @@ export type NativePipelineUnsupportedFeature = | "enableNer" | "enableNameCorpus" | "enableCoreference" - | "enableZoneClassification" - | "addressContextPasses"; + | "enableZoneClassification"; export type NativePipelineCompatibility = | { status: "supported" } @@ -85,25 +84,12 @@ export const getNativePipelineCompatibility = ( if (config.enableZoneClassification === true) { unsupportedFeatures.push("enableZoneClassification"); } - if (addressContextPassesCanAffectOutput(config)) { - unsupportedFeatures.push("addressContextPasses"); - } - if (unsupportedFeatures.length === 0) { return { status: "supported" }; } return { status: "unsupported", unsupportedFeatures }; }; -const addressContextPassesCanAffectOutput = (config: PipelineConfig): boolean => - labelIsEnabled(config.labels, "address") && config.threshold <= 0.95; - -const labelIsEnabled = ( - labels: readonly string[] | undefined, - label: string, -): boolean => - labels === undefined || labels.length === 0 || labels.includes(label); - export const assertNativePipelineSupported = (config: PipelineConfig): void => { const compatibility = getNativePipelineCompatibility(config); if (compatibility.status === "supported") { From 7e7be1344e3d16490018fc22e0cda521521a2cab Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 00:54:53 +0200 Subject: [PATCH 058/130] fix: mirror address context data --- packages/data/config/address-context.json | 72 +++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 packages/data/config/address-context.json diff --git a/packages/data/config/address-context.json b/packages/data/config/address-context.json new file mode 100644 index 00000000..393a7981 --- /dev/null +++ b/packages/data/config/address-context.json 
@@ -0,0 +1,72 @@ +{ + "_comment": "Address context guard words by language. These words can appear as '[Word] [number]' near address-like text in legal documents, but usually denote structure, payments, dates, or references rather than a bare street and house number.", + "bareHouseStopwords": { + "cs": [ + "Příloha", + "Smlouva", + "Článek", + "Dodatek", + "Celkem", + "Strana", + "Faktura", + "Částka", + "Položka", + "Kapitola", + "Zákon", + "Vyhláška", + "Nařízení", + "Usnesení", + "Rozsudek", + "Bod", + "Odstavec", + "Záloha", + "Zbývá", + "Dne", + "Platba", + "Datum", + "Splatnost", + "Variabilní", + "Konstantní", + "Specifický" + ], + "en": [ + "Section", + "Sections", + "Article", + "Articles", + "Schedule", + "Schedules", + "Exhibit", + "Exhibits", + "Annex", + "Annexes", + "Appendix", + "Appendices", + "Clause", + "Clauses", + "Chapter", + "Chapters", + "Paragraph", + "Paragraphs", + "Subsection", + "Subsections", + "Form", + "Page", + "Pages", + "Item", + "Items", + "Note", + "Notes", + "Rule", + "Rules", + "Attachment", + "Attachments", + "Volume", + "Volumes", + "Book", + "Books", + "Part", + "Parts" + ] + } +} From 7e16361a95c74c5cb01eb2f3ac7abc3c5ad693e9 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 05:23:01 +0200 Subject: [PATCH 059/130] chore: bump search core revisions --- Cargo.lock | 4 ++-- crates/anonymize-core/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6fef9ddb..0eb37809 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -562,7 +562,7 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "stella-aho-corasick-core" version = "1.0.4" -source = "git+https://github.com/stella/aho-corasick?rev=38bdcbf11bfbe389c8f2b7b40eb03ac50371e1e1#38bdcbf11bfbe389c8f2b7b40eb03ac50371e1e1" +source = "git+https://github.com/stella/aho-corasick?rev=28226295ca5df514cd915e7c26af6fd605348b81#28226295ca5df514cd915e7c26af6fd605348b81" dependencies = 
[ "daachorse", "unicode-case-mapping", @@ -645,7 +645,7 @@ source = "git+https://github.com/stella/stdnum?rev=b4949ece8981b84c53a21c26f7a50 [[package]] name = "stella-text-search-core" version = "1.0.6" -source = "git+https://github.com/stella/text-search?rev=0cfaad48a3df24f918cf52a2d5aaf32f5a031148#0cfaad48a3df24f918cf52a2d5aaf32f5a031148" +source = "git+https://github.com/stella/text-search?rev=8a42c28a8e7c5a32c838ae9dd443c21deab391ed#8a42c28a8e7c5a32c838ae9dd443c21deab391ed" dependencies = [ "stella-aho-corasick-core", "stella-fuzzy-search-core", diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index 0c8573ec..ebe00d77 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -14,7 +14,7 @@ fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "b4949ece8981b84c53a21c26f7a5068dba553142" } -stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "0cfaad48a3df24f918cf52a2d5aaf32f5a031148" } +stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "8a42c28a8e7c5a32c838ae9dd443c21deab391ed" } [lints] workspace = true From 0a32a1784a2cc94067fc9e74703522e2a079d30d Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 05:23:23 +0200 Subject: [PATCH 060/130] fix: cache native pipeline packages --- .../src/__test__/pipeline-config.test.ts | 151 +++++++++++++++++ packages/anonymize/src/context.ts | 6 + packages/anonymize/src/native-pipeline.ts | 153 ++++++++++++++++-- packages/anonymize/src/pipeline-cache-key.ts | 74 +++++++++ packages/anonymize/src/pipeline.ts | 71 +------- 5 files changed, 372 insertions(+), 83 deletions(-) create mode 100644 packages/anonymize/src/pipeline-cache-key.ts diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts 
b/packages/anonymize/src/__test__/pipeline-config.test.ts index 6ca7df5a..5ddc71ab 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -3,13 +3,16 @@ import { describe, expect, test } from "bun:test"; import { createPipelineContext, DEFAULT_ENTITY_LABELS, + createNativePipelineFromConfig, preparePipelineSearch, + prepareNativePipelinePackage, redactText, runPipeline, } from "../index"; import { buildUnifiedSearch } from "../build-unified-search"; import { REGEX_META } from "../detectors/regex"; import { applyPipelineLanguageScope } from "../language-scope"; +import type { NativeAnonymizeBinding } from "../native"; import type { Dictionaries, PipelineConfig } from "../types"; import { loadTestDictionaries } from "./load-dictionaries"; @@ -51,6 +54,52 @@ const detect = async (fullText: string, config: Partial) => context: getCtx(), }); +const createCountingNativeBinding = (version: string) => { + let compressedPrepare = 0; + let rawPrepare = 0; + let fromPackage = 0; + const binding = { + nativePackageVersion: () => version, + NativePreparedSearch: { + fromConfigJsonBytes: () => { + throw new Error("native package cache test should use package bytes"); + }, + fromPreparedPackageBytes: () => { + fromPackage += 1; + return { + prepareDiagnosticsJson: () => JSON.stringify({ events: [] }), + redactStaticEntities: (fullText: string) => ({ + resolvedEntities: [], + redaction: { + redactedText: fullText, + redactionMap: [], + operatorMap: [], + entityCount: 0, + }, + }), + }; + }, + }, + prepareStaticSearchPackageBytes: (configJson: Uint8Array) => { + rawPrepare += 1; + return new Uint8Array([rawPrepare, configJson.byteLength % 256]); + }, + prepareStaticSearchCompressedPackageBytes: (configJson: Uint8Array) => { + compressedPrepare += 1; + return new Uint8Array([compressedPrepare, configJson.byteLength % 256]); + }, + } satisfies NativeAnonymizeBinding; + + return { + binding, + counts: () => 
({ + compressedPrepare, + fromPackage, + rawPrepare, + }), + }; +}; + describe("pipeline config semantics", () => { test("content language derives dictionary scopes", () => { expect( @@ -421,6 +470,108 @@ describe("pipeline config semantics", () => { ).toBeGreaterThan(0); }); + test("native pipeline package cache reuses exact configs", async () => { + const { binding, counts } = createCountingNativeBinding( + "native-cache-context", + ); + const context = createPipelineContext(); + const config = { + ...BASE_CONFIG, + enableCountries: false, + labels: ["person"], + }; + + const first = await prepareNativePipelinePackage({ + binding, + config, + context, + }); + first[0] = 99; + const second = await prepareNativePipelinePackage({ + binding, + config, + context, + }); + await createNativePipelineFromConfig({ binding, config, context }); + + expect(counts().compressedPrepare).toBe(1); + expect(second[0]).toBe(1); + }); + + test("native pipeline package cache is scoped by dictionary identity", async () => { + const { binding, counts } = createCountingNativeBinding( + "native-cache-dictionaries", + ); + const cacheDictionaries = { + firstNames: { + en: ["Ada"], + }, + } satisfies Dictionaries; + const config = { + ...BASE_CONFIG, + dictionaries: cacheDictionaries, + enableCountries: false, + labels: ["person"], + }; + + await prepareNativePipelinePackage({ + binding, + config, + context: createPipelineContext(), + }); + await prepareNativePipelinePackage({ + binding, + config, + context: createPipelineContext(), + }); + await prepareNativePipelinePackage({ + binding, + config: { + ...config, + dictionaries: { ...cacheDictionaries }, + }, + context: createPipelineContext(), + }); + + expect(counts().compressedPrepare).toBe(2); + }); + + test("native pipeline package cache keys caller data", async () => { + const { binding, counts } = createCountingNativeBinding( + "native-cache-caller-data", + ); + const context = createPipelineContext(); + const config = { + 
...BASE_CONFIG, + customRegexes: [ + { + label: "matter id", + pattern: "MAT-[0-9]+", + }, + ], + enableCountries: false, + enableRegex: true, + labels: ["matter id"], + }; + + await prepareNativePipelinePackage({ binding, config, context }); + await prepareNativePipelinePackage({ + binding, + config: { + ...config, + customRegexes: [ + { + label: "matter id", + pattern: "REF-[0-9]+", + }, + ], + }, + context, + }); + + expect(counts().compressedPrepare).toBe(2); + }); + test("enableLegalForms flag gates legal-form detection", async () => { const withFlag = await detect("Acme s.r.o.", { enableLegalForms: true, diff --git a/packages/anonymize/src/context.ts b/packages/anonymize/src/context.ts index ca5ce8dd..7c1ed162 100644 --- a/packages/anonymize/src/context.ts +++ b/packages/anonymize/src/context.ts @@ -69,6 +69,9 @@ export type PipelineContext = { search: UnifiedSearchInstance | null; searchKey: string; searchPromise: Promise | null; + nativePipelinePackage: Uint8Array | null; + nativePipelinePackageKey: string; + nativePipelinePackagePromise: Promise | null; // ── Name corpus ─────────────────────────────── nameCorpus: NameCorpusData | null; @@ -112,6 +115,9 @@ export const createPipelineContext = (): PipelineContext => ({ search: null, searchKey: "", searchPromise: null, + nativePipelinePackage: null, + nativePipelinePackageKey: "", + nativePipelinePackagePromise: null, nameCorpus: null, nameCorpusKey: "", diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts index 0b7f98ba..e256b390 100644 --- a/packages/anonymize/src/native-pipeline.ts +++ b/packages/anonymize/src/native-pipeline.ts @@ -4,9 +4,10 @@ import { } from "./build-unified-search"; import type { PipelineContext } from "./context"; import { defaultContext } from "./context"; -import type { GazetteerEntry, PipelineConfig } from "./types"; +import { applyPipelineLanguageScope } from "./language-scope"; +import { pipelineConfigKey } from 
"./pipeline-cache-key"; +import type { Dictionaries, GazetteerEntry, PipelineConfig } from "./types"; import { - createNativeAnonymizerFromConfig, createNativeAnonymizerFromPackage, prepareNativeSearchPackage, PreparedNativeAnonymizer, @@ -44,6 +45,32 @@ export type NativePipelineFromPackageOptions = { packageBytes: Uint8Array; }; +type NativePipelinePackageCacheValue = Promise | Uint8Array; + +const sharedPackageByDictionaries = new WeakMap< + Dictionaries, + Map +>(); +const sharedPackageWithoutDictionaries = new Map< + string, + NativePipelinePackageCacheValue +>(); + +const sharedPackageCacheFor = ( + dictionaries: Dictionaries | undefined, +): Map => { + if (dictionaries === undefined) { + return sharedPackageWithoutDictionaries; + } + const cached = sharedPackageByDictionaries.get(dictionaries); + if (cached !== undefined) { + return cached; + } + const created = new Map(); + sharedPackageByDictionaries.set(dictionaries, created); + return created; +}; + export class PreparedNativePipeline { readonly #anonymizer: PreparedNativeAnonymizer; @@ -124,16 +151,14 @@ export const prepareNativePipelinePackage = async ({ context, compressed = true, }: NativePipelinePackageOptions): Promise => { - const nativeConfig = await prepareNativePipelineConfig({ + const packageBytes = await getCachedNativePipelinePackage({ config, + binding, gazetteerEntries, ...(context ? { context } : {}), - }); - return prepareNativeSearchPackage({ - binding, - config: nativeConfig, compressed, }); + return packageBytes.slice(); }; export const createNativePipelineFromConfig = async ({ @@ -142,17 +167,13 @@ export const createNativePipelineFromConfig = async ({ gazetteerEntries = [], context, }: NativePipelineBuildOptions): Promise => { - const nativeConfig = await prepareNativePipelineConfig({ + const packageBytes = await getCachedNativePipelinePackage({ + binding, config, gazetteerEntries, ...(context ? 
{ context } : {}), }); - return new PreparedNativePipeline( - createNativeAnonymizerFromConfig({ - binding, - config: nativeConfig, - }), - ); + return createNativePipelineFromPackage({ binding, packageBytes }); }; export const createNativePipelineFromPackage = ({ @@ -162,3 +183,107 @@ export const createNativePipelineFromPackage = ({ new PreparedNativePipeline( createNativeAnonymizerFromPackage({ binding, packageBytes }), ); + +const getCachedNativePipelinePackage = async ({ + binding, + config, + gazetteerEntries = [], + context, + compressed = true, +}: NativePipelinePackageOptions): Promise => { + const scopedConfig = applyPipelineLanguageScope(config); + assertNativePipelineSupported(scopedConfig); + const ctx = context ?? defaultContext; + const key = nativePackageCacheKey({ + binding, + config: scopedConfig, + gazetteerEntries, + compressed, + }); + if (ctx.nativePipelinePackage && ctx.nativePipelinePackageKey === key) { + return ctx.nativePipelinePackage; + } + if ( + ctx.nativePipelinePackagePromise && + ctx.nativePipelinePackageKey === key + ) { + return ctx.nativePipelinePackagePromise; + } + + const sharedCache = sharedPackageCacheFor(scopedConfig.dictionaries); + const shared = sharedCache.get(key); + if (shared !== undefined) { + const packageBytes = await shared; + ctx.nativePipelinePackage = packageBytes; + ctx.nativePipelinePackageKey = key; + ctx.nativePipelinePackagePromise = null; + return packageBytes; + } + + ctx.nativePipelinePackage = null; + ctx.nativePipelinePackageKey = key; + const promise = buildNativePipelinePackage({ + binding, + config: scopedConfig, + gazetteerEntries, + context: ctx, + compressed, + }); + ctx.nativePipelinePackagePromise = promise; + sharedCache.set(key, promise); + let packageBytes: Uint8Array; + try { + packageBytes = await promise; + } catch (error) { + if (sharedCache.get(key) === promise) { + sharedCache.delete(key); + } + throw error; + } + if (sharedCache.get(key) === promise) { + sharedCache.set(key, 
packageBytes); + } + if (ctx.nativePipelinePackageKey === key) { + ctx.nativePipelinePackage = packageBytes; + ctx.nativePipelinePackagePromise = null; + } + return packageBytes; +}; + +const buildNativePipelinePackage = async ({ + binding, + config, + gazetteerEntries, + context, + compressed, +}: Required): Promise => { + const bundle = await buildNativeStaticSearchBundle( + config, + gazetteerEntries, + context, + ); + return prepareNativeSearchPackage({ + binding, + config: bundle.nativeStaticConfig, + compressed, + }); +}; + +type NativePackageCacheKeyOptions = { + binding: NativeAnonymizeBinding; + config: PipelineConfig; + gazetteerEntries: readonly GazetteerEntry[]; + compressed: boolean; +}; + +const nativePackageCacheKey = ({ + binding, + config, + gazetteerEntries, + compressed, +}: NativePackageCacheKeyOptions): string => + [ + binding.nativePackageVersion(), + compressed ? "compressed" : "raw", + pipelineConfigKey(config, gazetteerEntries), + ].join(":"); diff --git a/packages/anonymize/src/pipeline-cache-key.ts b/packages/anonymize/src/pipeline-cache-key.ts new file mode 100644 index 00000000..c25d2219 --- /dev/null +++ b/packages/anonymize/src/pipeline-cache-key.ts @@ -0,0 +1,74 @@ +import { + isLegalFormsEnabled, + type GazetteerEntry, + type PipelineConfig, +} from "./types"; + +const DEFAULT_CUSTOM_REGEX_SCORE = 0.9; + +export const pipelineConfigKey = ( + config: PipelineConfig, + gazetteerEntries: readonly GazetteerEntry[], +): string => { + const legalFormsEnabled = isLegalFormsEnabled(config); + const customDenyFingerprint = + config.enableDenyList && config.customDenyList + ? config.customDenyList + .map((entry) => + JSON.stringify({ + label: entry.label, + value: entry.value, + variants: [...(entry.variants ?? [])].sort(), + }), + ) + .sort() + .join("\n") + : ""; + const customRegexFingerprint = + config.enableRegex && config.customRegexes + ? 
config.customRegexes + .map((entry) => + JSON.stringify({ + label: entry.label, + pattern: entry.pattern, + score: entry.score ?? DEFAULT_CUSTOM_REGEX_SCORE, + }), + ) + .sort() + .join("\n") + : ""; + const gazFingerprint = + config.enableGazetteer && gazetteerEntries.length > 0 + ? gazetteerEntries + .map( + (entry) => + `${entry.id}:${entry.canonical}:${entry.label}:${[ + ...entry.variants, + ] + .sort() + .join(",")}`, + ) + .toSorted() + .join(";") + : ""; + + return ( + `${config.enableDenyList}:` + + `${config.enableTriggerPhrases}:` + + `${legalFormsEnabled}:` + + `${config.enableNameCorpus}:` + + `${config.nameCorpusLanguages?.toSorted().join(",") ?? ""}:` + + `${config.enableRegex}:` + + `${config.threshold}:` + + `${config.enableConfidenceBoost}:` + + `${config.enableHotwordRules === true}:` + + `${config.labels.toSorted().join(",")}:` + + `${config.denyListCountries?.toSorted().join(",") ?? ""}:` + + `${config.denyListRegions?.toSorted().join(",") ?? ""}:` + + `${config.denyListExcludeCategories?.toSorted().join(",") ?? ""}:` + + `${customDenyFingerprint}:` + + `${customRegexFingerprint}:` + + `${config.enableGazetteer}:${gazFingerprint}:` + + `${config.enableCountries !== false}` + ); +}; diff --git a/packages/anonymize/src/pipeline.ts b/packages/anonymize/src/pipeline.ts index c857d7ba..e3339f30 100644 --- a/packages/anonymize/src/pipeline.ts +++ b/packages/anonymize/src/pipeline.ts @@ -71,6 +71,7 @@ import { maskDetectedSpans, unmaskNerEntities } from "./util/entity-masking"; import type { PipelineContext } from "./context"; import { defaultContext } from "./context"; import { applyPipelineLanguageScope } from "./language-scope"; +import { pipelineConfigKey } from "./pipeline-cache-key"; /** * Sources backed by curated literal dictionaries. 
@@ -803,8 +804,6 @@ const createAllowedLabelSetFromLabels = ( const createAllowedLabelSet = (config: PipelineConfig): AllowedLabelSet => createAllowedLabelSetFromLabels(config.labels); -const DEFAULT_CUSTOM_REGEX_SCORE = 0.9; - const filterAllowedLabels = ( entities: Entity[], allowedLabels: AllowedLabelSet, @@ -843,72 +842,6 @@ const checkAbort = (signal?: AbortSignal): void => { } }; -const configKey = ( - config: PipelineConfig, - gazetteerEntries: GazetteerEntry[], -): string => { - const legalFormsEnabled = isLegalFormsEnabled(config); - const customDenyFingerprint = - config.enableDenyList && config.customDenyList - ? config.customDenyList - .map((entry) => - JSON.stringify({ - label: entry.label, - value: entry.value, - variants: [...(entry.variants ?? [])].sort(), - }), - ) - .sort() - .join("\n") - : ""; - const customRegexFingerprint = - config.enableRegex && config.customRegexes - ? config.customRegexes - .map((entry) => - JSON.stringify({ - label: entry.label, - pattern: entry.pattern, - score: entry.score ?? DEFAULT_CUSTOM_REGEX_SCORE, - }), - ) - .sort() - .join("\n") - : ""; - // Gazetteer fingerprint: sorted entry IDs, - // canonical forms, labels, and variants. - // Skip when gazetteer is disabled to avoid - // unnecessary cache misses. - const gazFingerprint = - config.enableGazetteer && gazetteerEntries.length > 0 - ? gazetteerEntries - .map( - (e) => - `${e.id}:${e.canonical}:${e.label}:${[...e.variants].sort().join(",")}`, - ) - .toSorted() - .join(";") - : ""; - return ( - `${config.enableDenyList}:` + - `${config.enableTriggerPhrases}:` + - `${legalFormsEnabled}:` + - `${config.enableNameCorpus}:` + - `${config.nameCorpusLanguages?.toSorted().join(",") ?? ""}:` + - `${config.enableRegex}:` + - `${config.threshold}:` + - `${config.enableConfidenceBoost}:` + - `${config.enableHotwordRules === true}:` + - `${config.labels.toSorted().join(",")}:` + - `${config.denyListCountries?.toSorted().join(",") ?? 
""}:` + - `${config.denyListRegions?.toSorted().join(",") ?? ""}:` + - `${config.denyListExcludeCategories?.toSorted().join(",") ?? ""}:` + - `${customDenyFingerprint}:` + - `${customRegexFingerprint}:` + - `${config.enableGazetteer}:${gazFingerprint}:` + - `${config.enableCountries !== false}` - ); -}; - type SharedSearchCacheValue = | Promise | UnifiedSearchInstance; @@ -968,7 +901,7 @@ const getCachedSearch = async ( gazetteerEntries: GazetteerEntry[], ctx: PipelineContext, ): Promise => { - const key = configKey(config, gazetteerEntries); + const key = pipelineConfigKey(config, gazetteerEntries); if (ctx.search && ctx.searchKey === key) { return ctx.search; } From 7580d03137b8575cf11ea0236604858e3323b7cf Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 05:23:30 +0200 Subject: [PATCH 061/130] fix: preserve native detection edges --- crates/anonymize-adapter-contract/src/lib.rs | 1 + crates/anonymize-core/src/diagnostics.rs | 30 +++ crates/anonymize-core/src/prepared.rs | 12 +- crates/anonymize-core/src/processors.rs | 57 ++++- crates/anonymize-core/src/resolution/merge.rs | 25 +++ crates/anonymize-core/src/search.rs | 63 +++++- crates/anonymize-core/src/triggers.rs | 204 +++++++++++++++++- crates/anonymize-core/tests/processors.rs | 77 +++++++ crates/anonymize-core/tests/resolution.rs | 57 +++++ crates/anonymize-core/tests/search.rs | 57 +++++ 10 files changed, 568 insertions(+), 15 deletions(-) diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 631ae214..2c3c8362 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -2136,6 +2136,7 @@ fn diagnostic_event_kind_name(kind: DiagnosticEventKind) -> String { DiagnosticEventKind::StageSummary => "stage-summary", DiagnosticEventKind::SearchMatch => "search-match", DiagnosticEventKind::Entity => "entity", + DiagnosticEventKind::Rejection => "rejection", } .to_owned() } diff --git 
a/crates/anonymize-core/src/diagnostics.rs b/crates/anonymize-core/src/diagnostics.rs index 421a4f19..1df82ed3 100644 --- a/crates/anonymize-core/src/diagnostics.rs +++ b/crates/anonymize-core/src/diagnostics.rs @@ -49,6 +49,7 @@ pub enum DiagnosticEventKind { StageSummary, SearchMatch, Entity, + Rejection, } #[derive(Clone, Debug, PartialEq)] @@ -178,6 +179,35 @@ impl StaticRedactionDiagnostics { }); } + pub(crate) fn record_rejection( + &mut self, + stage: DiagnosticStage, + pattern: Option, + label: Option<&str>, + start: Option, + end: Option, + reason: &'static str, + ) { + self.events.push(DiagnosticEvent { + stage, + kind: DiagnosticEventKind::Rejection, + count: None, + engine: None, + pattern, + source: None, + source_detail: None, + label: label.map(str::to_owned), + start, + end, + text: None, + score: None, + span_valid: None, + elapsed_us: None, + input_bytes: None, + reason: Some(String::from(reason)), + }); + } + pub(crate) fn record_stage( &mut self, stage: DiagnosticStage, diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 680c24b8..441109af 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -574,7 +574,11 @@ impl PreparedSearch { ) -> Result { let matches = self.find_matches_inner(full_text, diagnostics.as_deref_mut())?; - let passes = self.process_static_entity_passes(&matches, full_text)?; + let passes = self.process_static_entity_passes( + &matches, + full_text, + diagnostics.as_deref_mut(), + )?; if let Some(diagnostics) = &mut diagnostics { record_static_entity_diagnostics(diagnostics, full_text, &passes); @@ -599,6 +603,7 @@ impl PreparedSearch { &self, matches: &PreparedSearchMatches, full_text: &str, + diagnostics: Option<&mut StaticRedactionDiagnostics>, ) -> Result { let regex_start = Instant::now(); let regex = TimedEntities { @@ -656,7 +661,8 @@ impl PreparedSearch { let anchored = self.process_anchored_entities(full_text)?; - let trigger = 
self.process_trigger_entities(matches, full_text)?; + let trigger = + self.process_trigger_entities(matches, full_text, diagnostics)?; let signature = process_signature_entities(full_text); @@ -714,6 +720,7 @@ impl PreparedSearch { &self, matches: &PreparedSearchMatches, full_text: &str, + diagnostics: Option<&mut StaticRedactionDiagnostics>, ) -> Result { let start = Instant::now(); let entities = if let Some(data) = &self.trigger_data { @@ -722,6 +729,7 @@ impl PreparedSearch { self.slices.triggers, full_text, data, + diagnostics, )? } else { Vec::new() diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index 4480a4a2..1c4bd255 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -339,8 +339,9 @@ pub fn process_deny_list_matches( data: &DenyListMatchData, ) -> Result> { let offsets = ByteOffsets::new(full_text); - let matches_by_pattern = + let mut matches_by_pattern = collect_deny_list_matches(matches, slice, full_text, data, &offsets)?; + suppress_shorter_curated_contained_matches(&mut matches_by_pattern); let mut results = Vec::new(); let mut name_hits = Vec::new(); @@ -408,6 +409,60 @@ pub fn process_deny_list_matches( Ok(results) } +fn suppress_shorter_curated_contained_matches( + matches_by_pattern: &mut BTreeMap>, +) { + let mut ranges = Vec::<(u32, u32)>::new(); + for matches in matches_by_pattern.values() { + for found in matches { + if found.labels.is_empty() { + continue; + } + ranges.push((found.start, found.end)); + } + } + + ranges.sort_by(|left, right| { + left.0.cmp(&right.0).then_with(|| right.1.cmp(&left.1)) + }); + + let mut suppress = BTreeSet::<(u32, u32)>::new(); + let mut max_end = None::; + let mut max_end_start = None::; + for (start, end) in ranges { + if max_end.is_some_and(|container_end| { + container_end > end + || (container_end == end + && max_end_start + .is_some_and(|container_start| container_start < start)) + }) { + 
suppress.insert((start, end)); + } + if max_end.is_none_or(|current| end > current) { + max_end = Some(end); + max_end_start = Some(start); + } + } + + if suppress.is_empty() { + return; + } + + for matches in matches_by_pattern.values_mut() { + for found in matches.iter_mut() { + if found.labels.is_empty() { + continue; + } + if suppress.contains(&(found.start, found.end)) { + found.labels.clear(); + } + } + matches.retain(|found| { + !found.labels.is_empty() || !found.custom_labels.is_empty() + }); + } +} + fn collect_deny_list_matches( matches: &[SearchMatch], slice: PatternSlice, diff --git a/crates/anonymize-core/src/resolution/merge.rs b/crates/anonymize-core/src/resolution/merge.rs index d9897a8f..e36089d7 100644 --- a/crates/anonymize-core/src/resolution/merge.rs +++ b/crates/anonymize-core/src/resolution/merge.rs @@ -107,6 +107,17 @@ fn should_replace( return false; } + if curated_organization_contains_fragment(candidate, existing) + && candidate_len > existing_len + { + return true; + } + if curated_organization_contains_fragment(existing, candidate) + && existing_len > candidate_len + { + return false; + } + if address_contains_bare_postal(candidate, existing) && candidate_len > existing_len { @@ -307,6 +318,20 @@ fn literal_contains(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { && outer.end >= inner.end } +fn curated_organization_contains_fragment( + outer: &PipelineEntity, + inner: &PipelineEntity, +) -> bool { + matches!( + outer.source, + DetectionSource::DenyList | DetectionSource::Gazetteer + ) && outer.label == "organization" + && matches!(inner.label.as_str(), "address" | "country") + && !is_caller_owned(inner) + && outer.start <= inner.start + && outer.end >= inner.end +} + fn address_contains_bare_postal( outer: &PipelineEntity, inner: &PipelineEntity, diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs index f4653917..820ffaea 100644 --- a/crates/anonymize-core/src/search.rs +++ 
b/crates/anonymize-core/src/search.rs @@ -215,11 +215,7 @@ impl SearchIndex { parts.literals, literal_options(options.literal), )?; - capture_slot_artifacts( - &mut slots, - parts.regex, - regex_options(options.regex), - )?; + capture_regex_slot_artifacts(&mut slots, parts.regex, options.regex)?; capture_slot_artifacts( &mut slots, parts.fuzzy, @@ -433,14 +429,12 @@ fn build_search_index( literal_options(options.literal), literal_artifacts, )?; - let regex_artifacts = slot_artifacts(&parts.regex, &mut artifacts)?; - push_slot( + push_regex_slots( &mut slots, - SlotEngine::Regex, parts.regex, parts.regex_indexes, - regex_options(options.regex), - regex_artifacts, + options.regex, + &mut artifacts, )?; let fuzzy_artifacts = slot_artifacts(&parts.fuzzy, &mut artifacts)?; push_slot( @@ -468,6 +462,55 @@ fn slot_artifacts<'a>( cursor.next().map(Some) } +fn capture_regex_slot_artifacts( + slots: &mut Vec, + patterns: Vec, + options: RegexSearchOptions, +) -> Result<()> { + if !options.overlap_all { + return capture_slot_artifacts(slots, patterns, regex_options(options)); + } + + for pattern in patterns { + capture_slot_artifacts(slots, vec![pattern], regex_options(options))?; + } + Ok(()) +} + +fn push_regex_slots( + slots: &mut Vec, + patterns: Vec, + pattern_indexes: Vec, + options: RegexSearchOptions, + artifacts: &mut Option<&mut SearchIndexArtifactCursor<'_>>, +) -> Result<()> { + if !options.overlap_all { + let regex_artifacts = slot_artifacts(&patterns, artifacts)?; + return push_slot( + slots, + SlotEngine::Regex, + patterns, + pattern_indexes, + regex_options(options), + regex_artifacts, + ); + } + + for (pattern, pattern_index) in patterns.into_iter().zip(pattern_indexes) { + let regex_artifacts = + slot_artifacts(std::slice::from_ref(&pattern), artifacts)?; + push_slot( + slots, + SlotEngine::Regex, + vec![pattern], + vec![pattern_index], + regex_options(options), + regex_artifacts, + )?; + } + Ok(()) +} + fn push_slot( slots: &mut Vec, engine: SlotEngine, 
diff --git a/crates/anonymize-core/src/triggers.rs b/crates/anonymize-core/src/triggers.rs index 2e05af0f..684d7692 100644 --- a/crates/anonymize-core/src/triggers.rs +++ b/crates/anonymize-core/src/triggers.rs @@ -2,6 +2,7 @@ use fancy_regex::Regex as FancyRegex; use regex::{Regex, RegexBuilder}; use crate::byte_offsets::ByteOffsets; +use crate::diagnostics::{DiagnosticStage, StaticRedactionDiagnostics}; use crate::resolution::{DetectionSource, PipelineEntity}; use crate::types::{Error, Result, SearchMatch}; use crate::validators::validate_named_id; @@ -215,6 +216,7 @@ pub(crate) fn process_trigger_matches( slice: PatternSlice, full_text: &str, data: &PreparedTriggerData, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, ) -> Result> { let offsets = ByteOffsets::new(full_text); let mut results = Vec::new(); @@ -232,9 +234,11 @@ pub(crate) fn process_trigger_matches( continue; }; if !has_left_boundary(full_text, &offsets, found.start())? { + record_trigger_rejection(&mut diagnostics, found, rule, "left-boundary"); continue; } if !has_right_boundary(full_text, &offsets, found.end(), &rule.trigger)? { + record_trigger_rejection(&mut diagnostics, found, rule, "right-boundary"); continue; } let Some(raw_value) = extract_value( @@ -246,17 +250,26 @@ pub(crate) fn process_trigger_matches( &extraction_data, )? 
else { + record_trigger_rejection(&mut diagnostics, found, rule, "empty-value"); continue; }; let Some(mut value) = strip_quotes(&raw_value) else { + record_trigger_rejection( + &mut diagnostics, + found, + rule, + "empty-quoted-value", + ); continue; }; if !apply_validations(&value.text, &rule.validations) { + record_trigger_rejection(&mut diagnostics, found, rule, "validation"); continue; } if rule.label == "phone number" && !is_plausible_phone_trigger_value(&value.text) { + record_trigger_rejection(&mut diagnostics, found, rule, "phone-shape"); continue; } if rule.label == "phone number" @@ -307,6 +320,25 @@ pub(crate) fn process_trigger_matches( Ok(results) } +fn record_trigger_rejection( + diagnostics: &mut Option<&mut StaticRedactionDiagnostics>, + found: &SearchMatch, + rule: &PreparedTriggerRule, + reason: &'static str, +) { + let Some(diagnostics) = diagnostics.as_deref_mut() else { + return; + }; + diagnostics.record_rejection( + DiagnosticStage::EntityTrigger, + Some(found.pattern()), + Some(&rule.label), + Some(found.start()), + Some(found.end()), + reason, + ); +} + fn extract_value( text: &str, offsets: &ByteOffsets<'_>, @@ -317,8 +349,10 @@ fn extract_value( ) -> Result> { let trigger_end_byte = offsets.validate_offset(trigger_end)?; let lookahead = get_trigger_lookahead(strategy); - let lookahead_end = - text.len().min(trigger_end_byte.saturating_add(lookahead)); + let lookahead_end = floor_char_boundary( + text, + text.len().min(trigger_end_byte.saturating_add(lookahead)), + ); let remaining = text .get(trigger_end_byte..lookahead_end) .unwrap_or_default(); @@ -886,6 +920,13 @@ fn previous_char_boundary(text: &str, byte: usize) -> usize { .map_or(0, |(index, _)| index) } +const fn floor_char_boundary(text: &str, mut byte: usize) -> usize { + while byte > 0 && !text.is_char_boundary(byte) { + byte = byte.saturating_sub(1); + } + byte +} + fn is_word_byte(text: &str, byte: usize) -> bool { text .get(byte..) 
@@ -1469,3 +1510,162 @@ fn u32_len(text: &str) -> u32 { fn byte_to_offset(byte: usize) -> Option { u32::try_from(byte).ok() } + +#[cfg(test)] +#[allow(clippy::indexing_slicing, clippy::unwrap_used)] +mod tests { + use crate::search::{SearchIndex, SearchOptions, SearchPattern}; + + use super::*; + + #[test] + fn court_trigger_includes_trigger_span() { + let text = "zapsaná v obchodním rejstříku vedeném Krajským soudem v Ústí nad Labem, oddíl B"; + let start = text.find("Krajským soudem").unwrap(); + let end = start.saturating_add("Krajským soudem".len()); + let data = PreparedTriggerData::new(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("krajským soudem"), + label: String::from("organization"), + strategy: TriggerStrategy::ToNextComma { + stop_words: vec![String::from("oddíl")], + max_length: None, + }, + validations: vec![TriggerValidation::MinLength(3)], + include_trigger: true, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }) + .unwrap(); + + let entities = process_trigger_matches( + &[SearchMatch::Literal { + pattern: 0, + start: u32::try_from(start).unwrap(), + end: u32::try_from(end).unwrap(), + }], + PatternSlice { start: 0, end: 1 }, + text, + &data, + None, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].label, "organization"); + assert_eq!(entities[0].source, DetectionSource::Trigger); + assert_eq!(entities[0].text, "Krajským soudem v Ústí nad Labem"); + } + + #[test] + fn court_trigger_survives_generated_slice_shape() { + let text = "zapsaná v obchodním rejstříku vedeném Krajským soudem v Ústí nad Labem, oddíl B"; + let slice = PatternSlice { + start: 1372, + end: 2791, + }; + let mut patterns = Vec::new(); + for index in 0..slice.end { + let pattern = if index == slice.start.saturating_add(216) { + String::from("krajským soudem") + } else { + format!("needle-{index}") + }; + 
patterns.push(SearchPattern::LiteralWithOptions { + pattern, + case_insensitive: Some(true), + whole_words: Some(false), + }); + } + let search = SearchIndex::new(patterns, SearchOptions::default()).unwrap(); + let mut rules = Vec::new(); + for index in slice.start..slice.end { + let trigger = if index == slice.start.saturating_add(216) { + String::from("krajským soudem") + } else { + format!("needle-{index}") + }; + rules.push(TriggerRule { + trigger, + label: String::from("organization"), + strategy: TriggerStrategy::ToNextComma { + stop_words: vec![ + String::from("dne"), + String::from("v oddíle"), + String::from("oddíl"), + String::from("vložka"), + ], + max_length: None, + }, + validations: vec![TriggerValidation::MinLength(3)], + include_trigger: true, + }); + } + let data = PreparedTriggerData::new(TriggerData { + rules, + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }) + .unwrap(); + + let matches = search.find_iter(text).unwrap(); + let entities = + process_trigger_matches(&matches, slice, text, &data, None).unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].label, "organization"); + assert_eq!(entities[0].source, DetectionSource::Trigger); + assert_eq!(entities[0].text, "Krajským soudem v Ústí nad Labem"); + } + + #[test] + fn court_trigger_lookahead_can_end_inside_later_utf8_scalar() { + let prefix = "zapsaná v obchodním rejstříku vedeném Krajským soudem v Ústí nad Labem, oddíl B"; + let trigger_start = prefix.find("Krajským soudem").unwrap(); + let trigger_end = trigger_start.saturating_add("Krajským soudem".len()); + let lookahead_end = trigger_end + .saturating_add(MAX_TRIGGER_VALUE_LEN) + .saturating_add(TRIGGER_LOOKAHEAD_MARGIN); + let padding_len = + lookahead_end.saturating_sub(prefix.len()).saturating_sub(1); + let text = format!("{prefix}{}é trailing", "x".repeat(padding_len)); + let data = 
PreparedTriggerData::new(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("krajským soudem"), + label: String::from("organization"), + strategy: TriggerStrategy::ToNextComma { + stop_words: vec![String::from("oddíl")], + max_length: None, + }, + validations: vec![TriggerValidation::MinLength(3)], + include_trigger: true, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }) + .unwrap(); + + let entities = process_trigger_matches( + &[SearchMatch::Literal { + pattern: 0, + start: u32::try_from(trigger_start).unwrap(), + end: u32::try_from(trigger_end).unwrap(), + }], + PatternSlice { start: 0, end: 1 }, + &text, + &data, + None, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Krajským soudem v Ústí nad Labem"); + } +} diff --git a/crates/anonymize-core/tests/processors.rs b/crates/anonymize-core/tests/processors.rs index 58701793..c39081fb 100644 --- a/crates/anonymize-core/tests/processors.rs +++ b/crates/anonymize-core/tests/processors.rs @@ -278,6 +278,83 @@ fn deny_list_processor_emits_curated_non_person_labels() { assert_eq!(entities[0].source_detail, None); } +#[test] +fn deny_list_processor_suppresses_shorter_curated_same_start_matches() { + let matches = vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 7, + }, + SearchMatch::Literal { + pattern: 1, + start: 0, + end: 17, + }, + ]; + let data = DenyListMatchData { + labels: vec![vec![String::from("address")], vec![String::from("country")]] + .into(), + custom_labels: vec![vec![], vec![]].into(), + originals: vec![String::from("Česká"), String::from("Česká republika")], + sources: vec![vec![String::from("city")], vec![String::from("deny-list")]] + .into(), + filters: Some(DenyListFilterData::default()), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 2 }, + "Česká republika", + &data, + 
) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].label, "country"); + assert_eq!(entities[0].text, "Česká republika"); +} + +#[test] +fn deny_list_processor_suppresses_shorter_contained_curated_matches() { + let matches = vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 17, + }, + SearchMatch::Literal { + pattern: 1, + start: 10, + end: 17, + }, + ]; + let data = DenyListMatchData { + labels: vec![ + vec![String::from("organization")], + vec![String::from("address")], + ] + .into(), + custom_labels: vec![vec![], vec![]].into(), + originals: vec![String::from("Nemocnice Blansko"), String::from("Blansko")], + sources: vec![vec![String::from("deny-list")], vec![String::from("city")]] + .into(), + filters: Some(DenyListFilterData::default()), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 2 }, + "Nemocnice Blansko", + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].label, "organization"); + assert_eq!(entities[0].text, "Nemocnice Blansko"); +} + #[test] fn deny_list_processor_suppresses_signing_place_address() { let text = "Podepsano V Brně dne 1. 
ledna 2026."; diff --git a/crates/anonymize-core/tests/resolution.rs b/crates/anonymize-core/tests/resolution.rs index e1f51207..1eb1b4f3 100644 --- a/crates/anonymize-core/tests/resolution.rs +++ b/crates/anonymize-core/tests/resolution.rs @@ -227,6 +227,63 @@ fn literal_container_beats_shorter_same_label_match() { assert_eq!(kept.end, 11); } +#[test] +fn literal_container_survives_overlapping_shorter_fragment() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Regex, 0.7, 551, 557, "address"), + entity(DetectionSource::DenyList, 0.9, 501, 518, "organization"), + entity(DetectionSource::DenyList, 1.0, 511, 518, "address"), + entity(DetectionSource::DenyList, 1.0, 543, 550, "address"), + entity(DetectionSource::DenyList, 1.0, 527, 533, "address"), + entity(DetectionSource::Trigger, 0.95, 511, 518, "organization"), + entity(DetectionSource::Trigger, 0.95, 511, 518, "organization"), + entity(DetectionSource::Trigger, 1.0, 527, 557, "address"), + entity(DetectionSource::Trigger, 1.0, 527, 557, "address"), + entity(DetectionSource::Regex, 0.9, 527, 557, "address"), + ]); + + assert!( + result + .iter() + .any(|entry| entry.source == DetectionSource::DenyList + && entry.label == "organization" + && entry.start == 501 + && entry.end == 518), + "merged entities: {result:?}", + ); + assert!( + result + .iter() + .all(|entry| !(entry.source == DetectionSource::Trigger + && entry.label == "organization" + && entry.start == 511 + && entry.end == 518)), + "merged entities: {result:?}", + ); +} + +#[test] +fn address_component_beats_low_confidence_name_collision() { + let result = merge_and_dedup(&[ + entity(DetectionSource::DenyList, 0.5, 510, 521, "person"), + entity(DetectionSource::DenyList, 0.9, 515, 521, "address"), + entity(DetectionSource::DenyList, 0.9, 523, 531, "address"), + ]); + + assert!( + result.iter().any(|entry| entry.label == "address" + && entry.start == 515 + && entry.end == 521), + "merged entities: {result:?}", + ); + assert!( + 
result.iter().all(|entry| !(entry.label == "person" + && entry.start == 510 + && entry.end == 521)), + "merged entities: {result:?}", + ); +} + #[test] fn caller_owned_boundaries_win_overlap_resolution() { let mut custom = entity(DetectionSource::Regex, 0.5, 0, 8, "person"); diff --git a/crates/anonymize-core/tests/search.rs b/crates/anonymize-core/tests/search.rs index d231e10a..8dadd9b5 100644 --- a/crates/anonymize-core/tests/search.rs +++ b/crates/anonymize-core/tests/search.rs @@ -94,6 +94,63 @@ fn search_index_preserves_byte_offsets_from_primitive_engines() { ); } +#[test] +fn search_index_preserves_case_insensitive_literal_byte_offsets() { + let index = SearchIndex::new( + vec![SearchPattern::LiteralWithOptions { + pattern: String::from("krajským soudem"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + SearchOptions::default(), + ) + .unwrap(); + + let haystack = "zapsaná v obchodním rejstříku vedeném Krajským soudem"; + let start = haystack.find("Krajským").unwrap(); + let end = haystack.len(); + + assert_eq!( + index.find_iter(haystack).unwrap(), + vec![SearchMatch::Literal { + pattern: 0, + start: u32::try_from(start).unwrap(), + end: u32::try_from(end).unwrap(), + }] + ); +} + +#[test] +fn search_index_preserves_large_case_insensitive_literal_byte_offsets() { + let mut patterns = Vec::new(); + for index in 0..300 { + let pattern = if index == 216 { + String::from("krajským soudem") + } else { + format!("needle-{index}") + }; + patterns.push(SearchPattern::LiteralWithOptions { + pattern, + case_insensitive: Some(true), + whole_words: Some(false), + }); + } + let index = SearchIndex::new(patterns, SearchOptions::default()).unwrap(); + + let haystack = "zapsaná v obchodním rejstříku vedeném Krajským soudem v Ústí"; + let start = haystack.find("Krajským").unwrap(); + let end = start.saturating_add("Krajským soudem".len()); + + assert_eq!( + index.find_iter(haystack).unwrap(), + vec![SearchMatch::Literal { + pattern: 216, + start: 
u32::try_from(start).unwrap(), + end: u32::try_from(end).unwrap(), + }] + ); +} + #[test] fn search_index_returns_overlapping_literal_matches() { let index = SearchIndex::new( From d89ea157ebb57a3da120abead0769be1bd98b054 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 05:23:34 +0200 Subject: [PATCH 062/130] test: track native fixture deltas --- .../scripts/migration-fixture-perf.mjs | 111 ++++++++++++++++-- 1 file changed, 102 insertions(+), 9 deletions(-) diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index b9ea4a38..f945e0e8 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -26,6 +26,7 @@ const FIXTURES_DIR = join( ); const BASELINE_REF = process.env.ANONYMIZE_MIGRATION_BASELINE_REF ?? "origin/main"; +const WORKING_TREE_BASELINE_REF = "working-tree"; const COMPARE_BASELINE = process.env.ANONYMIZE_MIGRATION_COMPARE_BASELINE !== "0"; const REQUIRE_NATIVE_PIPELINE = @@ -70,6 +71,47 @@ const WRITE_NATIVE_PACKAGE_PATH = const USER_DATA_SCENARIO = process.env.ANONYMIZE_MIGRATION_USER_DATA_SCENARIO?.trim() ?? 
"none"; +const ACCEPTED_NATIVE_STATIC_DELTAS = new Map( + [ + { + fixture: "cs/asset-transfer-court-declensions.txt", + reason: "wider-address-span", + candidateExtra: [ + { start: 445, end: 485, label: "address", source: "regex" }, + ], + candidateMissing: [ + { start: 471, end: 485, label: "address", source: "deny-list" }, + ], + }, + { + fixture: "cs/nakit-legal-services-framework.txt", + reason: "role-heading-not-person", + candidateExtra: [], + candidateMissing: [ + { start: 49384, end: 49395, label: "person", source: "trigger" }, + ], + }, + { + fixture: "cs/vinci-donation-agreement.txt", + reason: "party-organization-retained", + candidateExtra: [ + { start: 542, end: 585, label: "organization", source: "deny-list" }, + ], + candidateMissing: [], + }, + { + fixture: "en/software-license-agreement.txt", + reason: "phone-leading-parenthesis", + candidateExtra: [ + { start: 1857, end: 1871, label: "phone number", source: "regex" }, + ], + candidateMissing: [ + { start: 1858, end: 1871, label: "phone number", source: "trigger" }, + ], + }, + ].map((entry) => [entry.fixture, entry]), +); + if (process.env.ANONYMIZE_MIGRATION_WORKER === "1") { await runWorker(); } else { @@ -91,8 +133,10 @@ async function runCoordinator() { try { let baseline = null; if (COMPARE_BASELINE) { - ensureGitRef(BASELINE_REF); - const baselineRoot = materializeGitRef(BASELINE_REF, tempRoot); + const baselineRoot = + BASELINE_REF === WORKING_TREE_BASELINE_REF + ? 
ROOT_DIR + : materializeBaselineRef(BASELINE_REF, tempRoot); baseline = runVariant({ name: `baseline:${BASELINE_REF}`, sourceRoot: baselineRoot, @@ -717,6 +761,9 @@ function compareSnapshots(baseline, candidate) { baseline: baseline.variant, candidate: candidate.variant, equal: mismatches.length === 0, + acceptedEqual: mismatches.every( + (mismatch) => mismatch.acceptedReason !== null, + ), mismatchSummary: mismatchSummary(mismatches), fixtureCount: fixtureNames.size, mismatches, @@ -733,15 +780,31 @@ function mismatchSummary(mismatches) { let materialMismatchCount = 0; let redactionMismatchCount = 0; let sourceOnlyMismatchCount = 0; + let acceptedMismatchCount = 0; + let unexplainedMismatchCount = 0; + let unexplainedMaterialMismatchCount = 0; + let unexplainedRedactionMismatchCount = 0; for (const mismatch of mismatches) { const category = mismatch.category ?? mismatch.kind; byCategory[category] = (byCategory[category] ?? 0) + 1; + const accepted = mismatch.acceptedReason !== null; + if (accepted) { + acceptedMismatchCount += 1; + } else { + unexplainedMismatchCount += 1; + } if (mismatch.sourceAgnosticEqual !== true) { materialMismatchCount += 1; + if (!accepted) { + unexplainedMaterialMismatchCount += 1; + } } if (mismatch.redactedTextEqual === false) { redactionMismatchCount += 1; + if (!accepted) { + unexplainedRedactionMismatchCount += 1; + } } if ( mismatch.redactedTextEqual && @@ -758,6 +821,10 @@ function mismatchSummary(mismatches) { materialMismatchCount, redactionMismatchCount, sourceOnlyMismatchCount, + acceptedMismatchCount, + unexplainedMismatchCount, + unexplainedMaterialMismatchCount, + unexplainedRedactionMismatchCount, byCategory, }; } @@ -801,7 +868,7 @@ function describeMismatch(fixture, expected, actual) { ); const category = mismatchCategory(expected, actual); - return { + const mismatch = { fixture, kind: "snapshot-mismatch", category: category.kind, @@ -842,6 +909,31 @@ function describeMismatch(fixture, expected, actual) { 
actualByteSnapshot.entities.at(firstByteEntityDiff) ?? null, }, }; + return { + ...mismatch, + acceptedReason: acceptedMismatchReason(mismatch), + }; +} + +function acceptedMismatchReason(mismatch) { + if (mismatch.sourceAgnosticEqual === true) { + return "source-only"; + } + const accepted = ACCEPTED_NATIVE_STATIC_DELTAS.get(mismatch.fixture); + if (accepted === undefined) { + return null; + } + if ( + entitySummariesEqual(mismatch.candidateExtra, accepted.candidateExtra) && + entitySummariesEqual(mismatch.candidateMissing, accepted.candidateMissing) + ) { + return accepted.reason; + } + return null; +} + +function entitySummariesEqual(left, right) { + return JSON.stringify(left ?? []) === JSON.stringify(right ?? []); } function mismatchCategory(expected, actual) { @@ -1252,15 +1344,12 @@ function describeUnsupportedPipelineStages( config.enableDenyList ? "name-corpus-supplemental" : "name-corpus", ); } - if (config.enableHotwordRules) { - stages.push("hotword-rules"); + if (config.enableNer) { + stages.push("ner"); } if (config.enableZoneClassification) { stages.push("zone-classification"); } - if (config.enableConfidenceBoost) { - stages.push("confidence-boost"); - } if (config.enableCoreference) { stages.push("coreference"); } @@ -1271,7 +1360,6 @@ function describeUnsupportedPipelineStages( if (!nativeRuntime) { stages.push("signatures"); } - stages.push("false-positive-filters", "final-extensions"); return stages; } @@ -1422,6 +1510,11 @@ function ensureGitRef(ref) { throw new Error(`Cannot resolve baseline ref: ${ref}`); } +function materializeBaselineRef(ref, tempRoot) { + ensureGitRef(ref); + return materializeGitRef(ref, tempRoot); +} + function materializeGitRef(ref, tempRoot) { const outputDir = join(tempRoot, "baseline"); mkdirSync(outputDir, { recursive: true }); From 5b198005ac5461e3f6a5cba886a8756667db5e23 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 05:58:18 +0200 Subject: [PATCH 063/130] fix: align native prepared edge cases 
--- crates/anonymize-adapter-contract/src/lib.rs | 21 +++- crates/anonymize-core/src/address_context.rs | 36 ++++-- crates/anonymize-core/src/address_seeds.rs | 46 ++++++- crates/anonymize-core/src/prepared.rs | 41 +++++-- crates/anonymize-core/src/resolution/types.rs | 1 + .../tests/address_seed_parity.rs | 62 ++++++++++ .../tests/false_positive_parity.rs | 27 +++++ crates/anonymize-core/tests/prepared.rs | 113 ++++++++++++++++++ crates/anonymize-core/tests/trigger_parity.rs | 1 + .../src/__test__/pipeline-config.test.ts | 18 +++ .../anonymize/src/build-unified-search.ts | 28 ++++- .../src/data/address-unit-abbreviations.json | 4 + .../anonymize/src/detectors/address-seeds.ts | 23 +++- packages/anonymize/src/detectors/deny-list.ts | 2 +- 14 files changed, 392 insertions(+), 31 deletions(-) create mode 100644 packages/anonymize/src/data/address-unit-abbreviations.json diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 2c3c8362..5a10358f 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -19,13 +19,13 @@ use stella_anonymize_core::{ pub type Result = std::result::Result; const PREPARED_SEARCH_PACKAGE_HEADER: [u8; 8] = *b"ANONPKG1"; -const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 7; +const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 8; const PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONPKZ1"; -const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 5; +const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 6; const PREPARED_SEARCH_CORE_PACKAGE_HEADER: [u8; 8] = *b"ANONCPK1"; -const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 6; +const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 7; const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONCPZ1"; -const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 6; +const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 7; const PREPARED_SEARCH_PACKAGE_DIGEST_BYTES: usize = 32; 
const PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL: i32 = 3; const MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES: usize = 256 * 1024 * 1024; @@ -340,6 +340,8 @@ pub struct BindingAddressSeedData { pub boundary_words: Vec, #[serde(default)] pub br_cep_cue_words: Vec, + #[serde(default)] + pub unit_abbreviations: Vec, } #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] @@ -445,6 +447,8 @@ pub struct BindingPreparedSearchConfig { #[serde(default)] pub deny_list_data: Option, #[serde(default)] + pub false_positive_filters: Option, + #[serde(default)] pub gazetteer_data: Option, #[serde(default)] pub country_data: Option, @@ -510,6 +514,7 @@ struct BinaryPreparedSearchConfig { regex_meta: Vec, custom_regex_meta: Vec, deny_list_data: Option, + false_positive_filters: Option, gazetteer_data: Option, country_data: Option, hotword_data: Option, @@ -711,6 +716,7 @@ impl From for BinaryPreparedSearchConfig { regex_meta: config.regex_meta, custom_regex_meta: config.custom_regex_meta, deny_list_data: config.deny_list_data, + false_positive_filters: config.false_positive_filters, gazetteer_data: config.gazetteer_data, country_data: config.country_data, hotword_data: config.hotword_data, @@ -742,6 +748,7 @@ impl From for BindingPreparedSearchConfig { regex_meta: config.regex_meta, custom_regex_meta: config.custom_regex_meta, deny_list_data: config.deny_list_data, + false_positive_filters: config.false_positive_filters, gazetteer_data: config.gazetteer_data, country_data: config.country_data, hotword_data: config.hotword_data, @@ -1183,6 +1190,9 @@ pub fn prepared_search_config_from_binding( deny_list_data: deny_list_data .map(deny_list_data_from_binding) .transpose()?, + false_positive_filters: config + .false_positive_filters + .map(deny_list_filters_from_binding), gazetteer_data: config.gazetteer_data.map(|data| GazetteerMatchData { labels: data.labels, is_fuzzy: data.is_fuzzy, @@ -1198,6 +1208,7 @@ pub fn prepared_search_config_from_binding( address_seed_data: 
config.address_seed_data.map(|data| AddressSeedData { boundary_words: data.boundary_words, br_cep_cue_words: data.br_cep_cue_words, + unit_abbreviations: data.unit_abbreviations, }), address_context_data: config.address_context_data.map(|data| { AddressContextData { @@ -2038,6 +2049,7 @@ fn source_detail_from_binding(value: &str) -> Result { "custom-deny-list" => Ok(SourceDetail::CustomDenyList), "custom-regex" => Ok(SourceDetail::CustomRegex), "gazetteer-extension" => Ok(SourceDetail::GazetteerExtension), + "address-context" => Ok(SourceDetail::AddressContext), _ => Err(ContractError::UnsupportedSourceDetail { value: value.to_owned(), }), @@ -2073,6 +2085,7 @@ fn source_detail_name(detail: SourceDetail) -> String { SourceDetail::CustomDenyList => "custom-deny-list", SourceDetail::CustomRegex => "custom-regex", SourceDetail::GazetteerExtension => "gazetteer-extension", + SourceDetail::AddressContext => "address-context", } .to_owned() } diff --git a/crates/anonymize-core/src/address_context.rs b/crates/anonymize-core/src/address_context.rs index 96c54e70..45e826ef 100644 --- a/crates/anonymize-core/src/address_context.rs +++ b/crates/anonymize-core/src/address_context.rs @@ -5,7 +5,7 @@ use regex::Regex; use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; use crate::types::{Error, Result}; -const HEADER_ZONE_PERCENT: u32 = 15; +const HEADER_ZONE_PERCENT: usize = 15; const STREET_CONTEXT_WINDOW: u32 = 200; const BARE_HOUSE_CONTEXT_WINDOW: u32 = 50; const MAX_BACKWARD_WORDS: usize = 5; @@ -131,7 +131,7 @@ impl PreparedAddressContextData { continue; } let score = address_context_score(full_text, street_start, in_header); - results.push(PipelineEntity::detected( + results.push(address_context_entity( street_start_u32, num_end, "address", @@ -237,7 +237,7 @@ impl PreparedAddressContextData { continue; } - results.push(PipelineEntity::detected( + results.push(address_context_entity( start, end, "address", @@ -281,7 +281,7 @@ impl 
PreparedAddressContextData { continue; } - results.push(PipelineEntity::detected( + results.push(address_context_entity( start, end, "address", @@ -309,8 +309,16 @@ fn compile_regex(field: &'static str, pattern: &str) -> Result { } fn header_end(full_text: &str) -> u32 { - let len = u32::try_from(full_text.len()).unwrap_or(u32::MAX); - len.saturating_mul(HEADER_ZONE_PERCENT).div_euclid(100) + let text_len = full_text.chars().map(char::len_utf16).sum::(); + let cutoff = text_len.saturating_mul(HEADER_ZONE_PERCENT).div_euclid(100); + let mut units = 0usize; + for (byte, ch) in full_text.char_indices() { + if units >= cutoff { + return u32::try_from(byte).unwrap_or(u32::MAX); + } + units = units.saturating_add(ch.len_utf16()); + } + u32::try_from(full_text.len()).unwrap_or(u32::MAX) } const fn is_caller_owned_entity(entity: &PipelineEntity) -> bool { @@ -332,6 +340,20 @@ fn overlaps_any(entities: &[PipelineEntity], start: u32, end: u32) -> bool { .any(|entity| entity.start < end && entity.end > start) } +fn address_context_entity( + start: u32, + end: u32, + label: impl Into, + text: impl Into, + score: f64, + source: DetectionSource, +) -> PipelineEntity { + let mut entity = + PipelineEntity::detected(start, end, label, text, score, source); + entity.source_detail = Some(SourceDetail::AddressContext); + entity +} + fn skip_whitespace_back(full_text: &str, mut pos: usize) -> Option { while let Some((index, ch)) = previous_char(full_text, pos) { if !is_space(ch) { @@ -415,7 +437,7 @@ fn near_confirmed_address_same_line( end: u32, ) -> Result { for entity in existing_entities.iter().chain(results.iter()) { - if entity.label != "address" { + if entity.label != "address" || is_caller_owned_entity(entity) { continue; } let dist = entity.start.abs_diff(end).min(entity.end.abs_diff(start)); diff --git a/crates/anonymize-core/src/address_seeds.rs b/crates/anonymize-core/src/address_seeds.rs index 278c5713..26c3e591 100644 --- a/crates/anonymize-core/src/address_seeds.rs +++ 
b/crates/anonymize-core/src/address_seeds.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeSet; + use regex::Regex; use crate::processors::PatternSlice; @@ -19,11 +21,13 @@ const US_ZIP_CONTEXT_WINDOW: usize = 120; pub struct AddressSeedData { pub boundary_words: Vec, pub br_cep_cue_words: Vec, + pub unit_abbreviations: Vec, } pub(crate) struct PreparedAddressSeedData { boundary_search: Option, br_cep_cue_search: Option, + unit_abbreviations: BTreeSet, postal_code_re: Regex, br_cep_shape_re: Regex, us_zip_plus_four_shape_re: Regex, @@ -39,6 +43,7 @@ impl PreparedAddressSeedData { Ok(Self { boundary_search: literal_search(data.boundary_words)?, br_cep_cue_search: literal_search(data.br_cep_cue_words)?, + unit_abbreviations: lowercased_set(data.unit_abbreviations), postal_code_re: compile_regex( r"(?u)(?:\d{5}[-‐‑‒–—―]\d{4}|\d{5}[-‐‑‒–—―]\d{3}|\d{3}\s\d{2}|\d{2}[-‐‑‒–—―]\d{3}|\d{5})", )?, @@ -409,7 +414,9 @@ impl PreparedAddressSeedData { if let Some(double_newline) = remaining.find("\n\n") { nearest_boundary = nearest_boundary.min(double_newline); } - if let Some(sentence_boundary) = sentence_boundary(remaining) { + if let Some(sentence_boundary) = + sentence_boundary(remaining, &self.unit_abbreviations) + { nearest_boundary = nearest_boundary.min(sentence_boundary); } @@ -505,6 +512,13 @@ fn literal_search(patterns: Vec) -> Result> { Ok(Some(SearchIndex::new(patterns, SearchOptions::default())?)) } +fn lowercased_set(values: Vec) -> BTreeSet { + values + .into_iter() + .map(|value| value.to_lowercase()) + .collect() +} + fn compile_regex(pattern: &str) -> Result { Regex::new(pattern).map_err(|error| Error::Search { engine: SearchEngine::Regex, @@ -1037,12 +1051,18 @@ fn trim_address_tail(full_text: &str, start: usize, mut end: usize) -> usize { end } -fn sentence_boundary(text: &str) -> Option { +fn sentence_boundary( + text: &str, + unit_abbreviations: &BTreeSet, +) -> Option { let mut iter = text.char_indices().peekable(); while let Some((index, ch)) = iter.next() 
{ if !matches!(ch, '.' | '!' | '?') { continue; } + if ch == '.' && is_unit_abbreviation(text, index, unit_abbreviations) { + continue; + } let mut saw_whitespace = false; while let Some((_, next)) = iter.peek().copied() { if !next.is_whitespace() { @@ -1061,6 +1081,27 @@ fn sentence_boundary(text: &str) -> Option { None } +fn is_unit_abbreviation( + text: &str, + dot_index: usize, + unit_abbreviations: &BTreeSet, +) -> bool { + let mut start = dot_index; + while let Some((previous_start, ch)) = previous_char(text, start) { + if ch.is_alphanumeric() || ch == '.' { + start = previous_start; + continue; + } + break; + } + if start == dot_index { + return false; + } + text + .get(start..dot_index.saturating_add(1)) + .is_some_and(|token| unit_abbreviations.contains(&token.to_lowercase())) +} + const fn is_address_trailing_trim(ch: char) -> bool { ch.is_whitespace() || matches!( @@ -1202,6 +1243,7 @@ mod tests { let data = PreparedAddressSeedData::new(AddressSeedData { boundary_words: vec![String::from("steuer-id")], br_cep_cue_words: Vec::new(), + unit_abbreviations: Vec::new(), })?; let full_text = concat!( "(2) Frau Karoline M. 
Brentano,\n", diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 441109af..25d8bf5f 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -16,13 +16,14 @@ use crate::normalize::{ NormalizedSearchText, normalize_for_search_with_byte_map, }; use crate::processors::{ - CountryMatchData, DenyListMatchData, GazetteerMatchData, PatternSlice, - RegexMatchMeta, ensure_supported_deny_list_sources, process_country_matches, - process_deny_list_matches, process_gazetteer_matches, process_regex_matches, + CountryMatchData, DenyListFilterData, DenyListMatchData, GazetteerMatchData, + PatternSlice, RegexMatchMeta, ensure_supported_deny_list_sources, + process_country_matches, process_deny_list_matches, + process_gazetteer_matches, process_regex_matches, }; use crate::redact::redact_text; use crate::resolution::{ - PipelineEntity, enforce_boundary_consistency, merge_and_dedup, + PipelineEntity, SourceDetail, enforce_boundary_consistency, merge_and_dedup, sanitize_entities_with_source, }; use crate::search::{ @@ -58,6 +59,7 @@ pub struct PreparedSearch { regex_meta: Vec, custom_regex_meta: Vec, deny_list_data: Option, + false_positive_filters: Option, gazetteer_data: Option, country_data: Option, hotword_data: Option, @@ -102,6 +104,8 @@ pub struct PreparedSearchConfig { pub regex_meta: Vec, pub custom_regex_meta: Vec, pub deny_list_data: Option, + #[serde(default)] + pub false_positive_filters: Option, pub gazetteer_data: Option, pub country_data: Option, #[serde(default)] @@ -430,6 +434,7 @@ impl PreparedSearch { regex_meta: config.regex_meta, custom_regex_meta: config.custom_regex_meta, deny_list_data: config.deny_list_data, + false_positive_filters: config.false_positive_filters, gazetteer_data: config.gazetteer_data, country_data: config.country_data, hotword_data: config.hotword_data, @@ -891,18 +896,23 @@ impl PreparedSearch { let sanitize_start = Instant::now(); let sanitized_entities 
= sanitize_entities_with_source(&consistent, full_text)?; - let resolved_entities = filter_entities_for_config( - filter_entity_false_positives( - sanitized_entities, - full_text, + let false_positive_filters = + self.false_positive_filters.as_ref().or_else(|| { self .deny_list_data .as_ref() - .and_then(|data| data.filters.as_ref()), + .and_then(|data| data.filters.as_ref()) + }); + let mut resolved_entities = filter_entities_for_config( + filter_entity_false_positives( + sanitized_entities, + full_text, + false_positive_filters, )?, self.threshold, &self.allowed_labels, ); + clear_internal_source_details(&mut resolved_entities); if let Some(diagnostics) = &mut diagnostics { diagnostics.record_entities( DiagnosticStage::Sanitize, @@ -1023,10 +1033,21 @@ fn filter_entities_for_threshold( ) -> Vec { entities .into_iter() - .filter(|entity| entity.score >= threshold) + .filter(|entity| { + entity.score >= threshold + || entity.source_detail == Some(SourceDetail::AddressContext) + }) .collect() } +fn clear_internal_source_details(entities: &mut [PipelineEntity]) { + for entity in entities { + if entity.source_detail == Some(SourceDetail::AddressContext) { + entity.source_detail = None; + } + } +} + fn boost_near_miss_entities( entities: Vec, full_text: &str, diff --git a/crates/anonymize-core/src/resolution/types.rs b/crates/anonymize-core/src/resolution/types.rs index 903d33e9..bd838074 100644 --- a/crates/anonymize-core/src/resolution/types.rs +++ b/crates/anonymize-core/src/resolution/types.rs @@ -31,6 +31,7 @@ pub enum SourceDetail { CustomDenyList, CustomRegex, GazetteerExtension, + AddressContext, } #[derive(Clone, Debug, PartialEq)] diff --git a/crates/anonymize-core/tests/address_seed_parity.rs b/crates/anonymize-core/tests/address_seed_parity.rs index 63d797fe..f884b4f6 100644 --- a/crates/anonymize-core/tests/address_seed_parity.rs +++ b/crates/anonymize-core/tests/address_seed_parity.rs @@ -21,6 +21,7 @@ fn empty_config(slices: PreparedSearchSlices) -> 
PreparedSearchConfig { regex_meta: vec![], custom_regex_meta: vec![], deny_list_data: None, + false_positive_filters: None, gazetteer_data: None, country_data: None, hotword_data: None, @@ -89,6 +90,7 @@ fn detects_cue_gated_br_cep_address_seed() { address_seed_data: Some(AddressSeedData { boundary_words: Vec::new(), br_cep_cue_words: vec![String::from("CEP")], + unit_abbreviations: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) }) @@ -166,3 +168,63 @@ fn keeps_date_like_street_name_in_address_seed_span() { ); assert!(!result.redaction.redacted_text.contains("May 15 Street")); } + +#[test] +fn preserves_unit_abbreviation_inside_address_seed_span() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("Springfield"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Street"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + street_types: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Springfield")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData { + boundary_words: Vec::new(), + br_cep_cue_words: Vec::new(), + unit_abbreviations: vec![String::from("apt.")], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .expect("address seed data should prepare"); + + let result = prepared + .redact_static_entities( + "Notices go to 10 
Main Street, Springfield 12345 Apt. 5. Thank you.", + &OperatorConfig::default(), + ) + .expect("static redaction should succeed"); + + assert!( + address_texts(&result) + .contains(&"10 Main Street, Springfield 12345 Apt. 5"), + "resolved address entities: {:?}; address seed entities: {:?}", + result.resolved_entities, + result.detections.address_seed_entities, + ); + assert!(!result.redaction.redacted_text.contains("Apt. 5")); +} diff --git a/crates/anonymize-core/tests/false_positive_parity.rs b/crates/anonymize-core/tests/false_positive_parity.rs index 77de3cce..3f54ff5c 100644 --- a/crates/anonymize-core/tests/false_positive_parity.rs +++ b/crates/anonymize-core/tests/false_positive_parity.rs @@ -23,6 +23,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { regex_meta: vec![], custom_regex_meta: vec![], deny_list_data: None, + false_positive_filters: None, gazetteer_data: None, country_data: None, hotword_data: None, @@ -150,6 +151,32 @@ fn rejects_document_structure_heading_organizations() { ); } +#[test] +fn rejects_document_headings_without_deny_list_matching() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"Schedule No\. 4|Acme No\. 4", + ))], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("organization", 0.9)], + false_positive_filters: Some(DenyListFilterData { + document_heading_words: set(["schedule"]), + document_heading_ordinal_markers: set(["no."]), + ..DenyListFilterData::default() + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + assert_eq!( + resolved_texts(&prepared, "Schedule No. 4\nAcme No. 4 signed."), + [String::from("Acme No. 
4")] + ); +} + #[test] fn rejects_only_ambiguous_street_type_trigger_addresses() { let prepared = PreparedSearch::new(PreparedSearchConfig { diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 5ec674c5..53eb6790 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -29,6 +29,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { regex_meta: vec![], custom_regex_meta: vec![], deny_list_data: None, + false_positive_filters: None, gazetteer_data: None, country_data: None, hotword_data: None, @@ -151,6 +152,7 @@ fn prepared_search_runs_normalized_literal_pass() { regex_meta: vec![], custom_regex_meta: vec![], deny_list_data: None, + false_positive_filters: None, gazetteer_data: Some(GazetteerMatchData { labels: vec![String::from("organization")], is_fuzzy: vec![false], @@ -241,6 +243,109 @@ fn prepared_search_adds_orphan_header_street_line_context() { })); } +#[test] +fn prepared_search_keeps_address_context_above_threshold() { + let full_text = format!( + "ACME s.r.o.\nEvropská 710\n160 00 Praha\n{}", + "body ".repeat(200) + ); + let prepared = PreparedSearch::new(PreparedSearchConfig { + custom_regex_patterns: vec![SearchPattern::Regex(String::from( + r"ACME s\.r\.o\.", + ))], + custom_regex_meta: vec![RegexMatchMeta::new("organization", 1.0)], + slices: PreparedSearchSlices { + custom_regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.9, + allowed_labels: vec![String::from("organization"), String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + entity.label == "address" + && entity.text == "Evropská 710" + && entity.source_detail.is_none() + })); +} 
+ +#[test] +fn prepared_search_ignores_caller_owned_addresses_for_bare_house_context() { + let mut meta = RegexMatchMeta::new("address", 1.0); + meta.source_detail = Some(SourceDetail::CustomRegex); + let prepared = PreparedSearch::new(PreparedSearchConfig { + custom_regex_patterns: vec![SearchPattern::Regex(String::from( + r"\bPraha 2\b", + ))], + custom_regex_meta: vec![meta], + slices: PreparedSearchSlices { + custom_regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Delivery area Praha 2, Evropská 710.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + !result + .resolved_entities + .iter() + .any(|entity| entity.text == "Evropská 710") + ); +} + +#[test] +fn prepared_search_measures_header_zone_in_text_offsets() { + let full_text = format!( + "{}\nACME s.r.o.\nEvropská 710\n{}", + "body ".repeat(80), + "é".repeat(2_000) + ); + let prepared = PreparedSearch::new(PreparedSearchConfig { + custom_regex_patterns: vec![SearchPattern::Regex(String::from( + r"ACME s\.r\.o\.", + ))], + custom_regex_meta: vec![RegexMatchMeta::new("organization", 1.0)], + slices: PreparedSearchSlices { + custom_regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("organization"), String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!( + !result + .resolved_entities + .iter() + .any(|entity| entity.text == "Evropská 710") + ); +} + #[test] fn prepared_search_artifacts_match_direct_prepare() { let config = 
PreparedSearchConfig { @@ -265,6 +370,7 @@ fn prepared_search_artifacts_match_direct_prepare() { regex_meta: vec![RegexMatchMeta::new("identifier", 1.0)], custom_regex_meta: vec![], deny_list_data: None, + false_positive_filters: None, gazetteer_data: Some(GazetteerMatchData { labels: vec![String::from("organization")], is_fuzzy: vec![false], @@ -411,6 +517,7 @@ fn prepared_search_emits_static_detector_entities() { min_byte_length: None, }], deny_list_data: None, + false_positive_filters: None, gazetteer_data: Some(GazetteerMatchData { labels: vec![String::from("organization")], is_fuzzy: vec![false], @@ -1045,6 +1152,7 @@ fn prepared_search_redacts_static_entities_end_to_end() { regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], custom_regex_meta: vec![], deny_list_data: None, + false_positive_filters: None, gazetteer_data: Some(GazetteerMatchData { labels: vec![String::from("organization")], is_fuzzy: vec![false], @@ -1361,6 +1469,7 @@ fn prepared_search_reports_static_redaction_diagnostics() { regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], custom_regex_meta: vec![], deny_list_data: None, + false_positive_filters: None, gazetteer_data: Some(GazetteerMatchData { labels: vec![String::from("organization")], is_fuzzy: vec![false], @@ -1447,6 +1556,7 @@ fn prepared_search_redacts_custom_deny_list_entities() { sources: vec![vec![String::from("custom-deny-list")]].into(), filters: None, }), + false_positive_filters: None, gazetteer_data: None, country_data: None, hotword_data: None, @@ -1844,6 +1954,7 @@ fn prepared_search_stops_address_before_notice_copy_instruction() { address_seed_data: Some(AddressSeedData { boundary_words: vec![String::from("with a copy")], br_cep_cue_words: Vec::new(), + unit_abbreviations: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) }) @@ -1947,6 +2058,7 @@ fn prepared_search_stops_address_seed_expansion_at_legal_prose() { address_seed_data: Some(AddressSeedData { boundary_words: 
vec![String::from("pokud")], br_cep_cue_words: Vec::new(), + unit_abbreviations: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) }) @@ -2011,6 +2123,7 @@ fn prepared_search_does_not_cluster_address_seed_inside_register_span() { address_seed_data: Some(AddressSeedData { boundary_words: vec![String::from("eingetragen")], br_cep_cue_words: Vec::new(), + unit_abbreviations: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) }) diff --git a/crates/anonymize-core/tests/trigger_parity.rs b/crates/anonymize-core/tests/trigger_parity.rs index 8c4517b3..b475681a 100644 --- a/crates/anonymize-core/tests/trigger_parity.rs +++ b/crates/anonymize-core/tests/trigger_parity.rs @@ -21,6 +21,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { regex_meta: vec![], custom_regex_meta: vec![], deny_list_data: None, + false_positive_filters: None, gazetteer_data: None, country_data: None, hotword_data: None, diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index 5ddc71ab..fd87113a 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -179,6 +179,24 @@ describe("pipeline config semantics", () => { expect(search.nativeStaticConfig.confidence_boost).toBe(true); }); + test("native config carries false-positive filters without deny-list matching", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableDenyList: false, + enableRegex: true, + labels: ["organization"], + }, + [], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.deny_list_data).toBeUndefined(); + expect( + search.nativeStaticConfig.false_positive_filters?.document_heading_words, + ).toContain("schedule"); + }); + test("native config carries hotword rule metadata", async () => { const search = await buildUnifiedSearch( { diff --git 
a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index ac96023c..42989f51 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -31,7 +31,7 @@ import { import { applyPipelineLanguageScope } from "./language-scope"; import type { RegexMeta } from "./detectors/regex"; import type { TriggerRule } from "./types"; -import type { DenyListData } from "./detectors/deny-list"; +import type { DenyListData, DenyListFilterData } from "./detectors/deny-list"; import type { PipelineContext } from "./context"; import { defaultContext } from "./context"; @@ -57,7 +57,11 @@ import { buildTriggerPatterns, getAddressStopKeywordsSync, } from "./detectors/triggers"; -import { buildDenyList } from "./detectors/deny-list"; +import { + buildDenyList, + buildDenyListFilterData, + ensureDenyListData, +} from "./detectors/deny-list"; import { buildStreetTypePatterns, getAddressSeedData, @@ -274,6 +278,7 @@ export type NativePreparedSearchConfig = { regex_meta: NativeRegexMatchMeta[]; custom_regex_meta: NativeRegexMatchMeta[]; deny_list_data?: NativeDenyListMatchData; + false_positive_filters?: NativeDenyListFilterData; gazetteer_data?: NativeGazetteerData; country_data?: CountryData; hotword_data?: NativeHotwordRuleData; @@ -354,6 +359,7 @@ type UnifiedSearchSources = { rules: TriggerRule[]; }; denyListData: DenyListData | null; + falsePositiveFilters: DenyListFilterData; streetTypes: string[]; gazResult: GazetteerPatternResult | null; countryResult: CountryPatternResult | null; @@ -384,6 +390,7 @@ export type NativeStaticSearchBundle = { regexMeta: readonly RegexMeta[]; customRegexMeta: readonly RegexMeta[]; denyListData: DenyListData | null; + falsePositiveFilters: DenyListFilterData; }; const buildUnifiedSearchSources = async ( @@ -414,6 +421,7 @@ const buildUnifiedSearchSources = async ( _legalFormWarmup, triggers, denyListData, + falsePositiveFilters, streetTypes, 
currencyPatterns, datePatterns, @@ -435,6 +443,14 @@ const buildUnifiedSearchSources = async ( rules: [] as TriggerRule[], }), config.enableDenyList ? buildDenyList(config, ctx) : Promise.resolve(null), + (async () => { + await ensureDenyListData( + ctx, + config.dictionaries, + config.nameCorpusLanguages, + ); + return buildDenyListFilterData(ctx); + })(), buildStreetTypePatterns(), config.enableRegex && labelIsAllowed("monetary amount", allowedLabels) ? getCurrencyPatternEntries() @@ -710,6 +726,7 @@ const buildUnifiedSearchSources = async ( legalForms, triggers, denyListData, + falsePositiveFilters, streetTypes, gazResult, countryResult, @@ -761,6 +778,7 @@ export const buildNativeStaticSearchBundle = async ( customRegexes: sources.customRegexes, customRegexMeta: sources.customRegexMeta, denyListData: sources.denyListData, + falsePositiveFilters: sources.falsePositiveFilters, triggerPatterns: sources.triggers.patterns, triggerRules: sources.triggers.rules, legalFormPatterns: sources.nativeLegalFormPatterns, @@ -792,6 +810,7 @@ export const buildNativeStaticSearchBundle = async ( regexMeta: sources.regexMeta, customRegexMeta: sources.customRegexMeta, denyListData: sources.denyListData, + falsePositiveFilters: sources.falsePositiveFilters, }; }; @@ -845,6 +864,7 @@ export const buildUnifiedSearch = async ( customRegexes: sources.customRegexes, customRegexMeta: sources.customRegexMeta, denyListData: sources.denyListData, + falsePositiveFilters: sources.falsePositiveFilters, triggerPatterns: sources.triggers.patterns, triggerRules: sources.triggers.rules, legalFormPatterns: sources.nativeLegalFormPatterns, @@ -894,6 +914,7 @@ type BuildNativeStaticConfigArgs = { customRegexes: readonly { pattern: string }[]; customRegexMeta: readonly RegexMeta[]; denyListData: DenyListData | null; + falsePositiveFilters: DenyListFilterData; triggerPatterns: readonly string[]; triggerRules: readonly TriggerRule[]; legalFormPatterns: readonly string[]; @@ -924,6 +945,7 @@ const 
buildNativeStaticConfig = ({ customRegexes, customRegexMeta, denyListData, + falsePositiveFilters, triggerPatterns, triggerRules, legalFormPatterns, @@ -1105,6 +1127,8 @@ const buildNativeStaticConfig = ({ if (denyListData) { nativeConfig.deny_list_data = toNativeDenyListData(denyListData); } + nativeConfig.false_positive_filters = + toNativeDenyListFilters(falsePositiveFilters); if (gazetteerData) { nativeConfig.gazetteer_data = toNativeGazetteerData(gazetteerData); } diff --git a/packages/anonymize/src/data/address-unit-abbreviations.json b/packages/anonymize/src/data/address-unit-abbreviations.json new file mode 100644 index 00000000..dfdf7cd7 --- /dev/null +++ b/packages/anonymize/src/data/address-unit-abbreviations.json @@ -0,0 +1,4 @@ +{ + "_comment": "Dotted address unit abbreviations that should not terminate address seed expansion. Organised per language because abbreviations are locale-specific.", + "en": ["apt.", "bldg.", "fl.", "ste.", "unit."] +} diff --git a/packages/anonymize/src/detectors/address-seeds.ts b/packages/anonymize/src/detectors/address-seeds.ts index 5dbeca77..3e2c0bea 100644 --- a/packages/anonymize/src/detectors/address-seeds.ts +++ b/packages/anonymize/src/detectors/address-seeds.ts @@ -83,6 +83,7 @@ type DictionaryConfig = Record; export type AddressSeedData = { boundary_words: string[]; br_cep_cue_words: string[]; + unit_abbreviations: string[]; }; let cachedBoundaryRe: RegExp | null = null; @@ -106,6 +107,15 @@ const loadFieldStopWords = async (): Promise => { } }; +const loadUnitAbbreviations = async (): Promise => { + try { + const mod = await import("../data/address-unit-abbreviations.json"); + return mod.default as DictionaryConfig; + } catch { + return {}; + } +}; + // ── pt-BR CEP context gating ──────────────────────── // // The bare `\d{5}-\d{3}` CEP shape collides with non- @@ -356,14 +366,17 @@ export const buildStreetTypePatterns = async (): Promise => { export const getAddressSeedData = async (): Promise => { 
addressSeedDataPromise ??= (async () => { - const [boundaryWords, fieldStopWords, brCueWords] = await Promise.all([ - loadBoundaryWords(), - loadFieldStopWords(), - loadBrCueWords(), - ]); + const [boundaryWords, fieldStopWords, unitAbbreviations, brCueWords] = + await Promise.all([ + loadBoundaryWords(), + loadFieldStopWords(), + loadUnitAbbreviations(), + loadBrCueWords(), + ]); return { boundary_words: flattenDictionaries([boundaryWords, fieldStopWords]), br_cep_cue_words: [...brCueWords], + unit_abbreviations: flattenDictionaries([unitAbbreviations]), }; })(); return addressSeedDataPromise; diff --git a/packages/anonymize/src/detectors/deny-list.ts b/packages/anonymize/src/detectors/deny-list.ts index 895d52be..3ea43ae1 100644 --- a/packages/anonymize/src/detectors/deny-list.ts +++ b/packages/anonymize/src/detectors/deny-list.ts @@ -1252,7 +1252,7 @@ const loadAddressJurisdictionPrefixes = (): Promise => { return addressJurisdictionPrefixesPromise; }; -const buildDenyListFilterData = async ( +export const buildDenyListFilterData = async ( ctx: PipelineContext, ): Promise => { const [ From f7e29f28076ce3743bed3bff66d1fad1bba0ce96 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 05:58:33 +0200 Subject: [PATCH 064/130] fix: build native node artifact --- .github/tools/check-packlist.mjs | 1 + packages/anonymize/.gitignore | 1 + packages/anonymize/package.json | 2 +- .../anonymize/scripts/build-native-node.mjs | 34 +++++++++++++++++++ 4 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 packages/anonymize/scripts/build-native-node.mjs diff --git a/.github/tools/check-packlist.mjs b/.github/tools/check-packlist.mjs index 0694c5a7..a34f2c6c 100644 --- a/.github/tools/check-packlist.mjs +++ b/.github/tools/check-packlist.mjs @@ -12,6 +12,7 @@ const PACKAGES = [ "dist/native-node.d.mts", "dist/native-node.mjs", "index.cjs", + "stella_anonymize_napi.node", // Dynamically imported corpus chunk; missing means the // bundler stopped resolving 
the non-Western name imports. "dist/names-nw-in.mjs", diff --git a/packages/anonymize/.gitignore b/packages/anonymize/.gitignore index 896ad8ab..8f27af15 100644 --- a/packages/anonymize/.gitignore +++ b/packages/anonymize/.gitignore @@ -1 +1,2 @@ wasm/dist/ +*.node diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index bd0a92cd..1ae4e57b 100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -40,7 +40,7 @@ }, "license": "MIT", "scripts": { - "build": "tsdown", + "build": "tsdown && bun scripts/build-native-node.mjs", "prepublishOnly": "bun run build", "typecheck": "tsc --noEmit -p tsconfig.json && tsc --noEmit -p tsconfig.test.json && tsc --noEmit -p tsconfig.wasm.json", "test": "bun test --preload ./src/__test__/setup.ts --timeout 15000", diff --git a/packages/anonymize/scripts/build-native-node.mjs b/packages/anonymize/scripts/build-native-node.mjs new file mode 100644 index 00000000..ae79c1e6 --- /dev/null +++ b/packages/anonymize/scripts/build-native-node.mjs @@ -0,0 +1,34 @@ +import { execFileSync } from "node:child_process"; +import { copyFileSync, existsSync } from "node:fs"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +const packageRoot = dirname(dirname(fileURLToPath(import.meta.url))); +const repoRoot = dirname(dirname(packageRoot)); + +const sourceByPlatform = { + darwin: "libstella_anonymize_napi.dylib", + linux: "libstella_anonymize_napi.so", + win32: "stella_anonymize_napi.dll", +}; + +const sourceName = sourceByPlatform[process.platform]; +if (!sourceName) { + throw new Error(`Unsupported native build platform: ${process.platform}`); +} + +execFileSync( + "cargo", + ["build", "-p", "stella-anonymize-napi", "--release", "--locked"], + { + cwd: repoRoot, + stdio: "inherit", + }, +); + +const source = join(repoRoot, "target", "release", sourceName); +if (!existsSync(source)) { + throw new Error(`Native build output is missing: ${source}`); +} + 
+copyFileSync(source, join(packageRoot, "stella_anonymize_napi.node")); From dc02e421e3810942a44f6ea98248324e4600afde Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 06:12:13 +0200 Subject: [PATCH 065/130] fix: align native text-offset edges --- crates/anonymize-core/src/address_context.rs | 19 ++- crates/anonymize-core/src/address_seeds.rs | 15 +- crates/anonymize-core/src/byte_offsets.rs | 45 ++++++ crates/anonymize-core/src/dates.rs | 14 +- crates/anonymize-core/src/hotwords.rs | 30 +--- crates/anonymize-core/src/processors.rs | 8 +- .../tests/address_seed_parity.rs | 14 +- crates/anonymize-core/tests/prepared.rs | 133 ++++++++++++++++++ .../src/__test__/pipeline-config.test.ts | 3 + .../anonymize/src/build-unified-search.ts | 4 +- 10 files changed, 241 insertions(+), 44 deletions(-) diff --git a/crates/anonymize-core/src/address_context.rs b/crates/anonymize-core/src/address_context.rs index 45e826ef..cc2c57c6 100644 --- a/crates/anonymize-core/src/address_context.rs +++ b/crates/anonymize-core/src/address_context.rs @@ -2,6 +2,7 @@ use std::collections::BTreeSet; use regex::Regex; +use crate::byte_offsets::ByteOffsets; use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; use crate::types::{Error, Result}; @@ -436,11 +437,12 @@ fn near_confirmed_address_same_line( start: u32, end: u32, ) -> Result { + let offsets = ByteOffsets::new(full_text); for entity in existing_entities.iter().chain(results.iter()) { if entity.label != "address" || is_caller_owned_entity(entity) { continue; } - let dist = entity.start.abs_diff(end).min(entity.end.abs_diff(start)); + let dist = span_gap_utf16_units(&offsets, entity, start, end)?; if dist > BARE_HOUSE_CONTEXT_WINDOW { continue; } @@ -453,6 +455,21 @@ fn near_confirmed_address_same_line( Ok(false) } +fn span_gap_utf16_units( + offsets: &ByteOffsets<'_>, + entity: &PipelineEntity, + start: u32, + end: u32, +) -> Result { + if entity.end <= start { + return 
offsets.utf16_units_between(entity.end, start); + } + if end <= entity.start { + return offsets.utf16_units_between(end, entity.start); + } + Ok(0) +} + fn is_short_ascii_digit_token(value: &str) -> bool { let mut count = 0usize; for ch in value.chars() { diff --git a/crates/anonymize-core/src/address_seeds.rs b/crates/anonymize-core/src/address_seeds.rs index 26c3e591..44b4d7a3 100644 --- a/crates/anonymize-core/src/address_seeds.rs +++ b/crates/anonymize-core/src/address_seeds.rs @@ -401,7 +401,7 @@ impl PreparedAddressSeedData { let right_pos = cluster.end; let remaining = full_text.get(right_pos..).unwrap_or_default(); let mut nearest_boundary = - byte_cap_at_char_boundary(remaining, ADDRESS_RIGHT_EXPAND_LIMIT); + utf16_cap_at_char_boundary(remaining, ADDRESS_RIGHT_EXPAND_LIMIT); if let Some(boundary) = self.nearest_boundary_word(full_text, right_pos) { nearest_boundary = nearest_boundary.min(boundary); @@ -1172,11 +1172,16 @@ fn resolve_newline_boundary( NewlineBoundaryResolution::Drop } -fn byte_cap_at_char_boundary(text: &str, cap: usize) -> usize { - if cap >= text.len() { - return text.len(); +fn utf16_cap_at_char_boundary(text: &str, cap: usize) -> usize { + let mut units = 0usize; + for (index, ch) in text.char_indices() { + let width = ch.len_utf16(); + if units.saturating_add(width) > cap { + return index; + } + units = units.saturating_add(width); } - floor_char_boundary(text, cap) + text.len() } fn floor_char_boundary(text: &str, mut byte: usize) -> usize { diff --git a/crates/anonymize-core/src/byte_offsets.rs b/crates/anonymize-core/src/byte_offsets.rs index b4119355..1e40b586 100644 --- a/crates/anonymize-core/src/byte_offsets.rs +++ b/crates/anonymize-core/src/byte_offsets.rs @@ -59,4 +59,49 @@ impl<'a> ByteOffsets<'a> { .to_owned(), ) } + + pub(crate) fn utf16_units_between( + &self, + start: u32, + end: u32, + ) -> Result { + if start > end { + return Err(Error::InvalidSpan { start, end }); + } + + let start_byte = self.validate_offset(start)?; 
+ let end_byte = self.validate_offset(end)?; + let units = self + .text + .get(start_byte..end_byte) + .ok_or(Error::InvalidSpan { start, end })? + .chars() + .map(char::len_utf16) + .sum::(); + u32::try_from(units) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }) + } + + pub(crate) fn offset_after_utf16_units( + &self, + start: u32, + max_units: u32, + ) -> Result { + let start_byte = self.validate_offset(start)?; + let mut units = 0_u32; + let tail = self.text.get(start_byte..).ok_or(Error::InvalidSpan { + start, + end: self.len()?, + })?; + for (relative, ch) in tail.char_indices() { + let width = u32::try_from(ch.len_utf16()).unwrap_or(u32::MAX); + if units.saturating_add(width) > max_units { + let offset = start_byte.saturating_add(relative); + return u32::try_from(offset) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }); + } + units = units.saturating_add(width); + } + self.len() + } } diff --git a/crates/anonymize-core/src/dates.rs b/crates/anonymize-core/src/dates.rs index 7f02d9ee..d961a3d7 100644 --- a/crates/anonymize-core/src/dates.rs +++ b/crates/anonymize-core/src/dates.rs @@ -292,7 +292,7 @@ fn ordinal_day_before_month( return None; } for suffix in ["st", "nd", "rd", "th"] { - if !ends_with_before(text, end, suffix) { + if !ends_with_before_ascii_case_insensitive(text, end, suffix) { continue; } let day_end = end.saturating_sub(suffix.len()); @@ -448,6 +448,18 @@ fn ends_with_before(text: &str, index: usize, needle: &str) -> bool { str_head(text, index).is_some_and(|value| value.ends_with(needle)) } +fn ends_with_before_ascii_case_insensitive( + text: &str, + index: usize, + needle: &str, +) -> bool { + let Some(start) = index.checked_sub(needle.len()) else { + return false; + }; + str_slice(text, start, index) + .is_some_and(|value| value.eq_ignore_ascii_case(needle)) +} + fn str_head(text: &str, index: usize) -> Option<&str> { text.get(..index) } diff --git a/crates/anonymize-core/src/hotwords.rs 
b/crates/anonymize-core/src/hotwords.rs index 69e2972d..f2f489d8 100644 --- a/crates/anonymize-core/src/hotwords.rs +++ b/crates/anonymize-core/src/hotwords.rs @@ -36,8 +36,7 @@ pub(crate) fn apply_hotword_rules( continue; } - let adjusted = - apply_entity_rules(entity, full_text, &offsets, data, &hits_by_rule)?; + let adjusted = apply_entity_rules(entity, &offsets, data, &hits_by_rule)?; if label_allowed(&adjusted.label, allowed_labels) { result.push(adjusted); } @@ -80,7 +79,6 @@ fn collect_hits_by_rule( fn apply_entity_rules( mut entity: PipelineEntity, - full_text: &str, offsets: &ByteOffsets<'_>, data: &HotwordRuleData, hits_by_rule: &[Vec], @@ -100,7 +98,7 @@ fn apply_entity_rules( }; for hit in rule_hits { let Some((distance, max_distance)) = - hotword_distance(full_text, offsets, &entity, hit, rule)? + hotword_distance(offsets, &entity, hit, rule)? else { continue; }; @@ -143,7 +141,6 @@ fn apply_entity_rules( } fn hotword_distance( - full_text: &str, offsets: &ByteOffsets<'_>, entity: &PipelineEntity, hit: &SearchMatch, @@ -151,12 +148,12 @@ fn hotword_distance( ) -> Result> { let (distance, max_distance) = if hit.end() <= entity.start { ( - char_distance(full_text, offsets, hit.end(), entity.start)?, + text_distance(offsets, hit.end(), entity.start)?, rule.proximity_before, ) } else if hit.start() >= entity.end { ( - char_distance(full_text, offsets, entity.end, hit.start())?, + text_distance(offsets, entity.end, hit.start())?, rule.proximity_after, ) } else { @@ -169,27 +166,12 @@ fn hotword_distance( Ok(Some((distance, max_distance))) } -fn char_distance( - full_text: &str, +fn text_distance( offsets: &ByteOffsets<'_>, start: u32, end: u32, ) -> Result { - if start > end { - return Err(Error::InvalidSpan { start, end }); - } - let start = offsets.validate_offset(start)?; - let end = offsets.validate_offset(end)?; - let distance = full_text - .get(start..end) - .ok_or_else(|| Error::InvalidSpan { - start: u32::try_from(start).unwrap_or(u32::MAX), - end: 
u32::try_from(end).unwrap_or(u32::MAX), - })? - .chars() - .count(); - u32::try_from(distance) - .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }) + offsets.utf16_units_between(start, end) } const fn caller_owned(entity: &PipelineEntity) -> bool { diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index 1c4bd255..c2b901b7 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -1832,12 +1832,8 @@ fn try_gazetteer_prefix_extension( offsets: &ByteOffsets<'_>, found: &SearchMatch, ) -> Result)>> { - let full_len = offsets.len()?; - let max_end = found - .end() - .saturating_add(MAX_GAZETTEER_PREFIX_OVERSHOOT) - .min(full_len); - let max_end = offsets.floor_offset(max_end)?; + let max_end = offsets + .offset_after_utf16_units(found.end(), MAX_GAZETTEER_PREFIX_OVERSHOOT)?; if max_end <= found.end().saturating_add(1) { return Ok(None); } diff --git a/crates/anonymize-core/tests/address_seed_parity.rs b/crates/anonymize-core/tests/address_seed_parity.rs index f884b4f6..36692424 100644 --- a/crates/anonymize-core/tests/address_seed_parity.rs +++ b/crates/anonymize-core/tests/address_seed_parity.rs @@ -212,19 +212,21 @@ fn preserves_unit_abbreviation_inside_address_seed_span() { }) .expect("address seed data should prepare"); + let suffix = "á".repeat(97); + let full_text = format!( + "Notices go to 10 Main Street, Springfield 12345 Apt. 5 {suffix}. Thank you." + ); let result = prepared - .redact_static_entities( - "Notices go to 10 Main Street, Springfield 12345 Apt. 5. Thank you.", - &OperatorConfig::default(), - ) + .redact_static_entities(&full_text, &OperatorConfig::default()) .expect("static redaction should succeed"); + let expected = format!("10 Main Street, Springfield 12345 Apt. 5 {suffix}"); assert!( - address_texts(&result) - .contains(&"10 Main Street, Springfield 12345 Apt. 
5"), + address_texts(&result).contains(&expected.as_str()), "resolved address entities: {:?}; address seed entities: {:?}", result.resolved_entities, result.detections.address_seed_entities, ); assert!(!result.redaction.redacted_text.contains("Apt. 5")); + assert!(!result.redaction.redacted_text.contains(&suffix)); } diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 53eb6790..032f76e1 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -276,6 +276,35 @@ fn prepared_search_keeps_address_context_above_threshold() { })); } +#[test] +fn prepared_search_measures_bare_house_context_in_text_offsets() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bPraha 10\b"))], + regex_meta: vec![RegexMatchMeta::new("address", 1.0)], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + let full_text = format!("Praha 10 {} Evropská 710.", "á".repeat(40)); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.text == "Evropská 710") + ); +} + #[test] fn prepared_search_ignores_caller_owned_addresses_for_bare_house_context() { let mut meta = RegexMatchMeta::new("address", 1.0); @@ -549,6 +578,43 @@ fn prepared_search_emits_static_detector_entities() { assert_eq!(result.country_entities[0].source, DetectionSource::Country); } +#[test] +fn prepared_search_extends_gazetteer_suffix_in_text_offsets() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: 
String::from("Acme"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![false], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities("Acme spółka signed.", &OperatorConfig::default()) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + entity.label == "organization" && entity.text == "Acme spółka" + })); + assert_eq!(result.redaction.redacted_text, "[ORGANIZATION_1] signed."); +} + #[test] fn prepared_search_preserves_overlapping_custom_regex_matches() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -697,6 +763,32 @@ fn prepared_search_extracts_dates_from_anchored_data() { ); } +#[test] +fn prepared_search_extracts_uppercase_ordinal_dates() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + date_data: Some(DateData { + month_names_by_language: BTreeMap::from([( + String::from("en"), + vec![String::from("January")], + )]), + year_words_by_language: BTreeMap::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities("Filed on 1ST January 2025.") + .unwrap(); + + assert!( + result + .anchored_entities + .iter() + .any(|entity| entity.text == "1ST January 2025") + ); +} + #[test] fn prepared_search_extracts_written_date_of_birth_trigger() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -1294,6 +1386,47 @@ fn prepared_search_boost_counts_text_offsets_not_bytes() { assert!((result.resolved_entities[1].score - 0.5).abs() < f64::EPSILON); } +#[test] +fn 
prepared_search_hotword_distance_uses_utf16_offsets() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\b\d{2}\.\d{2}\.\d{4}\b", + ))], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("born"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + allowed_labels: vec![String::from("date of birth")], + threshold: 0.8, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + hotwords: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("date", 0.7)], + hotword_data: Some(HotwordRuleData { + rules: vec![HotwordRule { + target_labels: vec![String::from("date")], + score_adjustment: 1.0, + reclassify_to: Some(String::from("date of birth")), + proximity_before: 40, + proximity_after: 40, + }], + pattern_rule_indices: vec![0], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + let full_text = format!("born {} 12.03.1990", "😀".repeat(30)); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!(result.resolved_entities.is_empty()); +} + #[test] fn prepared_search_applies_hotword_reclassification_before_threshold() { let prepared = PreparedSearch::new(PreparedSearchConfig { diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index fd87113a..55fc4f3a 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -384,6 +384,9 @@ describe("pipeline config semantics", () => { labels: ["organization", "organization"], is_fuzzy: [false, true], }); + expect(search.nativeStaticConfig.literal_options.fuzzy_whole_words).toBe( + false, + ); expect( Object.hasOwn(search.nativeStaticConfig.gazetteer_data ?? 
{}, "isFuzzy"), ).toBe(false); diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 42989f51..1ca88e6b 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -1066,6 +1066,8 @@ const buildNativeStaticConfig = ({ start: literalOffset, end: literalOffset + nativeHotwordPatterns.length, }; + const hasGazetteerFuzzyPatterns = + gazetteerData?.isFuzzy.some((isFuzzy) => isFuzzy) ?? false; const nativeConfig: NativePreparedSearchConfig = { regex_patterns: nativeRegexPatterns, @@ -1090,7 +1092,7 @@ const buildNativeStaticConfig = ({ literal_case_insensitive: true, literal_whole_words: canUseGlobalWholeWordLiterals, fuzzy_case_insensitive: true, - fuzzy_whole_words: true, + fuzzy_whole_words: !hasGazetteerFuzzyPatterns, fuzzy_normalize_diacritics: true, }, literal_patterns_from_deny_list_data: denyListPatternsFromData, From 430e4957d6dad7719bf515b5b429af9460a0056c Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 06:19:10 +0200 Subject: [PATCH 066/130] fix: mirror address unit data --- packages/data/config/address-unit-abbreviations.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 packages/data/config/address-unit-abbreviations.json diff --git a/packages/data/config/address-unit-abbreviations.json b/packages/data/config/address-unit-abbreviations.json new file mode 100644 index 00000000..dfdf7cd7 --- /dev/null +++ b/packages/data/config/address-unit-abbreviations.json @@ -0,0 +1,4 @@ +{ + "_comment": "Dotted address unit abbreviations that should not terminate address seed expansion. 
Organised per language because abbreviations are locale-specific.", + "en": ["apt.", "bldg.", "fl.", "ste.", "unit."] +} From 7f254b2b29381744382d8a714fb3b9b57bb8229d Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 06:37:21 +0200 Subject: [PATCH 067/130] feat: port native coreference pass --- crates/anonymize-adapter-contract/src/lib.rs | 70 +++- crates/anonymize-core/src/coreference.rs | 394 ++++++++++++++++++ crates/anonymize-core/src/diagnostics.rs | 1 + crates/anonymize-core/src/lib.rs | 2 + crates/anonymize-core/src/prepared.rs | 54 +++ .../tests/address_seed_parity.rs | 1 + .../tests/false_positive_parity.rs | 1 + crates/anonymize-core/tests/prepared.rs | 143 ++++++- crates/anonymize-core/tests/trigger_parity.rs | 1 + .../scripts/migration-fixture-perf.mjs | 2 +- .../__test__/native-adapter-parity.test.ts | 68 ++- .../src/__test__/pipeline-config.test.ts | 23 + .../anonymize/src/build-unified-search.ts | 71 +++- packages/anonymize/src/native-pipeline.ts | 2 - 14 files changed, 805 insertions(+), 28 deletions(-) create mode 100644 crates/anonymize-core/src/coreference.rs diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 5a10358f..2d724fcf 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -3,15 +3,16 @@ use std::collections::{BTreeMap, BTreeSet}; use serde::{Deserialize, Serialize}; use stella_anonymize_core::{ - AddressContextData, AddressSeedData, AmountWordsData, CountryMatchData, - CurrencyData, DateData, DenyListFilterData, DenyListMatchData, - DetectionSource, DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, - FuzzySearchOptions, GazetteerMatchData, HotwordRule, HotwordRuleData, - LegalFormData, LiteralSearchOptions, MagnitudeSuffixData, MonetaryData, - OperatorConfig, OperatorType, PatternSlice, PreparedSearchConfig, - PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchEngine, - SearchOptions, 
SearchPattern, ShareQuantityTermData, SigningPlaceGuardData, - SourceDetail, StaticRedactionDiagnosticResult, StaticRedactionDiagnostics, + AddressContextData, AddressSeedData, AmountWordsData, CoreferenceData, + CoreferencePatternData, CountryMatchData, CurrencyData, DateData, + DenyListFilterData, DenyListMatchData, DetectionSource, DiagnosticEvent, + DiagnosticEventKind, DiagnosticStage, FuzzySearchOptions, GazetteerMatchData, + HotwordRule, HotwordRuleData, LegalFormData, LiteralSearchOptions, + MagnitudeSuffixData, MonetaryData, OperatorConfig, OperatorType, + PatternSlice, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, + RegexSearchOptions, SearchEngine, SearchOptions, SearchPattern, + ShareQuantityTermData, SigningPlaceGuardData, SourceDetail, + StaticRedactionDiagnosticResult, StaticRedactionDiagnostics, StaticRedactionResult, StringGroups, TriggerData, TriggerRule, TriggerStrategy, TriggerValidation, WrittenAmountPatternData, }; @@ -19,13 +20,13 @@ use stella_anonymize_core::{ pub type Result = std::result::Result; const PREPARED_SEARCH_PACKAGE_HEADER: [u8; 8] = *b"ANONPKG1"; -const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 8; +const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 9; const PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONPKZ1"; -const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 6; +const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 7; const PREPARED_SEARCH_CORE_PACKAGE_HEADER: [u8; 8] = *b"ANONCPK1"; -const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 7; +const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 8; const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONCPZ1"; -const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 7; +const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 8; const PREPARED_SEARCH_PACKAGE_DIGEST_BYTES: usize = 32; const PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL: i32 = 3; const MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES: usize = 256 * 1024 * 1024; @@ -356,6 +357,23 @@ pub 
struct BindingAddressContextData { pub bare_house_stopwords: Vec, } +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingCoreferenceData { + #[serde(default)] + pub definition_patterns: Vec, + #[serde(default)] + pub role_stop_terms: Vec, + #[serde(default)] + pub legal_form_aliases: Vec, +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingCoreferencePatternData { + pub pattern: String, + #[serde(default)] + pub flags: String, +} + #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] pub struct BindingDenyListMatchData { #[serde(default)] @@ -463,6 +481,8 @@ pub struct BindingPreparedSearchConfig { #[serde(default)] pub address_context_data: Option, #[serde(default)] + pub coreference_data: Option, + #[serde(default)] pub date_data: Option, #[serde(default)] pub monetary_data: Option, @@ -522,6 +542,7 @@ struct BinaryPreparedSearchConfig { legal_form_data: Option, address_seed_data: Option, address_context_data: Option, + coreference_data: Option, date_data: Option, monetary_data: Option, } @@ -724,6 +745,7 @@ impl From for BinaryPreparedSearchConfig { legal_form_data: config.legal_form_data, address_seed_data: config.address_seed_data, address_context_data: config.address_context_data, + coreference_data: config.coreference_data, date_data: config.date_data, monetary_data: config.monetary_data, } @@ -756,6 +778,7 @@ impl From for BindingPreparedSearchConfig { legal_form_data: config.legal_form_data, address_seed_data: config.address_seed_data, address_context_data: config.address_context_data, + coreference_data: config.coreference_data, date_data: config.date_data, monetary_data: config.monetary_data, } @@ -1218,6 +1241,9 @@ pub fn prepared_search_config_from_binding( bare_house_stopwords: data.bare_house_stopwords, } }), + coreference_data: config + .coreference_data + .map(coreference_data_from_binding), date_data: config.date_data.map(|data| DateData { 
month_names_by_language: data.month_names_by_language, year_words_by_language: data.year_words_by_language, @@ -1553,6 +1579,23 @@ fn hotword_data_from_binding(data: BindingHotwordRuleData) -> HotwordRuleData { } } +fn coreference_data_from_binding( + data: BindingCoreferenceData, +) -> CoreferenceData { + CoreferenceData { + definition_patterns: data + .definition_patterns + .into_iter() + .map(|pattern| CoreferencePatternData { + pattern: pattern.pattern, + flags: pattern.flags, + }) + .collect(), + role_stop_terms: data.role_stop_terms, + legal_form_aliases: data.legal_form_aliases, + } +} + pub fn operator_config_from_binding( config: Option, ) -> Result { @@ -2136,6 +2179,7 @@ fn diagnostic_stage_name(stage: DiagnosticStage) -> String { DiagnosticStage::EntityLegalForm => "entity.legal-form", DiagnosticStage::EntityAddressSeed => "entity.address-seed", DiagnosticStage::EntityAddressContext => "entity.address-context", + DiagnosticStage::EntityCoreference => "entity.coreference", DiagnosticStage::Merge => "resolution.merge", DiagnosticStage::Boundary => "resolution.boundary", DiagnosticStage::Sanitize => "resolution.sanitize", diff --git a/crates/anonymize-core/src/coreference.rs b/crates/anonymize-core/src/coreference.rs new file mode 100644 index 00000000..b5331525 --- /dev/null +++ b/crates/anonymize-core/src/coreference.rs @@ -0,0 +1,394 @@ +use regex::{Regex, RegexBuilder}; +use std::collections::BTreeSet; + +use crate::byte_offsets::ByteOffsets; +use crate::resolution::{PipelineEntity, SourceDetail}; +use crate::types::{Error, Result}; + +const SEARCH_WINDOW: u32 = 200; +const COREFERENCE_SCORE: f64 = 0.95; + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct CoreferenceData { + #[serde(default)] + pub definition_patterns: Vec, + #[serde(default)] + pub role_stop_terms: Vec, + #[serde(default)] + pub legal_form_aliases: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, 
serde::Serialize)] +pub struct CoreferencePatternData { + pub pattern: String, + #[serde(default)] + pub flags: String, +} + +pub(crate) struct PreparedCoreferenceData { + definition_patterns: Vec, + role_stop_terms: BTreeSet, + legal_form_aliases: BTreeSet, +} + +struct DefinedTerm { + alias: String, + label: String, + source_text: String, +} + +impl PreparedCoreferenceData { + pub(crate) fn new(data: CoreferenceData) -> Result { + let mut definition_patterns = + Vec::with_capacity(data.definition_patterns.len()); + for pattern in &data.definition_patterns { + definition_patterns.push(compile_definition_pattern(pattern)?); + } + + Ok(Self { + definition_patterns, + role_stop_terms: lower_set(data.role_stop_terms), + legal_form_aliases: data + .legal_form_aliases + .into_iter() + .filter_map(|alias| normalized_legal_form_alias(&alias)) + .collect(), + }) + } + + pub(crate) fn process( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + if self.definition_patterns.is_empty() { + return Ok(Vec::new()); + } + + let terms = self.extract_defined_terms(full_text, existing_entities)?; + Self::find_alias_spans(full_text, &terms) + } + + fn extract_defined_terms( + &self, + full_text: &str, + entities: &[PipelineEntity], + ) -> Result> { + let offsets = ByteOffsets::new(full_text); + let mut sorted = entities + .iter() + .filter(|entity| !caller_owned(entity)) + .collect::>(); + sorted.sort_by_key(|entity| entity.start); + + let mut terms = Vec::new(); + let mut seen = BTreeSet::new(); + + for pattern in &self.definition_patterns { + for captures in pattern.captures_iter(full_text) { + let Some(alias_match) = captures.get(1) else { + continue; + }; + let alias = alias_match.as_str().trim(); + if alias.chars().count() < 2 { + continue; + } + if self.role_stop_terms.contains(&alias.to_lowercase()) { + continue; + } + if normalized_legal_form_alias(alias).is_some_and(|normalized| { + self.legal_form_aliases.contains(&normalized) + }) { + 
continue; + } + + let Some(full_match) = captures.get(0) else { + continue; + }; + let definition_start = + usize_to_u32("coreference.definition_start", full_match.start())?; + let Some(source) = + nearest_preceding_source(&sorted, &offsets, definition_start)? + else { + continue; + }; + let gap = offsets.slice(full_text, source.end, definition_start)?; + if has_clause_boundary(&gap) { + continue; + } + if !has_entity_similarity(alias, &source.text) { + continue; + } + + let key = format!("{}::{}", alias.to_lowercase(), source.label); + if !seen.insert(key) { + continue; + } + + terms.push(DefinedTerm { + alias: alias.to_owned(), + label: source.label.clone(), + source_text: source.text.clone(), + }); + } + } + + Ok(terms) + } + + fn find_alias_spans( + full_text: &str, + terms: &[DefinedTerm], + ) -> Result> { + let mut results = Vec::new(); + + for term in terms { + let mut search_from = 0; + while search_from < full_text.len() { + let Some(relative) = full_text + .get(search_from..) + .and_then(|tail| tail.find(&term.alias)) + else { + break; + }; + let start = search_from.saturating_add(relative); + let end = start.saturating_add(term.alias.len()); + if !is_word_boundary(full_text, start, end) { + search_from = next_char_boundary(full_text, start); + continue; + } + + let start_u32 = usize_to_u32("coreference.alias_start", start)?; + let end_u32 = usize_to_u32("coreference.alias_end", end)?; + results.push(PipelineEntity::coreference( + start_u32, + end_u32, + term.label.clone(), + term.alias.clone(), + COREFERENCE_SCORE, + term.source_text.clone(), + )); + search_from = end; + } + } + + Ok(results) + } +} + +fn compile_definition_pattern(data: &CoreferencePatternData) -> Result { + let mut builder = RegexBuilder::new(&data.pattern); + for flag in data.flags.chars() { + match flag { + 'g' | 'u' => {} + 'i' => { + builder.case_insensitive(true); + } + 'm' => { + builder.multi_line(true); + } + 's' => { + builder.dot_matches_new_line(true); + } + _ => { + return 
Err(Error::InvalidStaticData { + field: "coreference_data.definition_patterns", + reason: format!("unsupported regex flag '{flag}'"), + }); + } + } + } + builder.build().map_err(|error| Error::InvalidStaticData { + field: "coreference_data.definition_patterns", + reason: error.to_string(), + }) +} + +fn nearest_preceding_source<'a>( + sorted: &[&'a PipelineEntity], + offsets: &ByteOffsets<'_>, + definition_start: u32, +) -> Result> { + for entity in sorted.iter().rev() { + if entity.end > definition_start { + continue; + } + if offsets.utf16_units_between(entity.end, definition_start)? + > SEARCH_WINDOW + { + break; + } + if matches!(entity.label.as_str(), "person" | "organization") { + return Ok(Some(*entity)); + } + } + Ok(None) +} + +fn has_clause_boundary(gap: &str) -> bool { + if gap.contains(';') { + return true; + } + + for (index, ch) in gap.char_indices() { + if ch != '.' { + continue; + } + let Some(after_dot) = gap.get(index.saturating_add(ch.len_utf8())..) else { + return true; + }; + let mut tail = after_dot.chars(); + let next = loop { + let Some(candidate) = tail.next() else { + return true; + }; + if candidate.is_whitespace() + || matches!(candidate, '"' | '\'' | '„' | '‚' | '(') + { + continue; + } + break candidate; + }; + if next.is_uppercase() { + return true; + } + } + + false +} + +fn has_entity_similarity(alias: &str, entity_text: &str) -> bool { + let alias_lower = alias.to_lowercase(); + let entity_lower = entity_text.to_lowercase(); + + if alias_lower.chars().count() >= 3 && entity_lower.contains(&alias_lower) { + return true; + } + if entity_lower.chars().count() >= 3 && alias_lower.contains(&entity_lower) { + return true; + } + + let alias_words = split_similarity_words(&alias_lower); + let entity_words = split_similarity_words(&entity_lower); + let entity_word_set = entity_words.iter().collect::>(); + if alias_words + .iter() + .any(|word| entity_word_set.contains(word)) + { + return true; + } + + if !is_all_uppercase(alias) || 
alias.chars().count() < 2 { + return false; + } + let alias_len = alias.chars().count(); + if alias_len > entity_words.len() { + return false; + } + for start in 0..=entity_words.len().saturating_sub(alias_len) { + let initials = entity_words + .iter() + .skip(start) + .take(alias_len) + .filter_map(|word| word.chars().next()) + .collect::(); + if initials == alias_lower { + return true; + } + } + + false +} + +fn split_similarity_words(text: &str) -> Vec { + text + .split(|ch: char| { + matches!( + ch, + ' ' + | '\t' + | '\n' + | '\r' + | '.' + | ',' + | ';' + | ':' + | '\'' + | '"' + | '(' + | ')' + | '/' + | '-' + ) + }) + .filter(|word| word.chars().count() >= 2) + .map(ToOwned::to_owned) + .collect() +} + +fn is_all_uppercase(text: &str) -> bool { + text.chars().all(char::is_uppercase) +} + +fn normalized_legal_form_alias(alias: &str) -> Option { + let normalized = alias.split_whitespace().collect::().to_lowercase(); + (!normalized.is_empty()).then_some(normalized) +} + +fn is_word_boundary(full_text: &str, start: usize, end: usize) -> bool { + previous_char(full_text, start).is_none_or(|ch| !is_word_char(ch)) + && next_char(full_text, end).is_none_or(|ch| !is_word_char(ch)) +} + +fn previous_char(full_text: &str, index: usize) -> Option { + full_text.get(..index)?.chars().next_back() +} + +fn next_char(full_text: &str, index: usize) -> Option { + full_text.get(index..)?.chars().next() +} + +fn next_char_boundary(full_text: &str, index: usize) -> usize { + let Some(ch) = next_char(full_text, index) else { + return full_text.len(); + }; + index.saturating_add(ch.len_utf8()) +} + +fn is_word_char(ch: char) -> bool { + ch.is_alphanumeric() || is_combining_mark(ch) +} + +const fn is_combining_mark(ch: char) -> bool { + matches!( + ch, + '\u{0300}'..='\u{036f}' + | '\u{1ab0}'..='\u{1aff}' + | '\u{1dc0}'..='\u{1dff}' + | '\u{20d0}'..='\u{20ff}' + | '\u{fe20}'..='\u{fe2f}' + ) +} + +const fn caller_owned(entity: &PipelineEntity) -> bool { + matches!( + 
entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) +} + +fn lower_set(values: Vec) -> BTreeSet { + values + .into_iter() + .map(|value| value.to_lowercase()) + .collect() +} + +fn usize_to_u32(field: &'static str, value: usize) -> Result { + u32::try_from(value).map_err(|_| Error::InvalidStaticData { + field, + reason: String::from("offset exceeds u32 range"), + }) +} diff --git a/crates/anonymize-core/src/diagnostics.rs b/crates/anonymize-core/src/diagnostics.rs index 1df82ed3..1f102fe5 100644 --- a/crates/anonymize-core/src/diagnostics.rs +++ b/crates/anonymize-core/src/diagnostics.rs @@ -38,6 +38,7 @@ pub enum DiagnosticStage { EntityLegalForm, EntityAddressSeed, EntityAddressContext, + EntityCoreference, Merge, Boundary, Sanitize, diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index 6bac2206..2b05b713 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -7,6 +7,7 @@ mod address_seeds; mod anchored; mod artifact_bytes; pub(crate) mod byte_offsets; +mod coreference; mod dates; mod diagnostics; mod false_positives; @@ -27,6 +28,7 @@ mod validators; pub use address_context::AddressContextData; pub use address_seeds::AddressSeedData; +pub use coreference::{CoreferenceData, CoreferencePatternData}; pub use dates::DateData; pub use diagnostics::{ DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 25d8bf5f..dcdcb442 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -4,6 +4,7 @@ use crate::address_context::{AddressContextData, PreparedAddressContextData}; use crate::address_seeds::{AddressSeedData, PreparedAddressSeedData}; use crate::artifact_bytes::{ArtifactReader, ArtifactWriter}; use crate::byte_offsets::ByteOffsets; +use crate::coreference::{CoreferenceData, PreparedCoreferenceData}; use crate::dates::{DateData, 
PreparedDateData}; use crate::diagnostics::{DiagnosticStage, StaticRedactionDiagnostics}; use crate::false_positives::filter_entity_false_positives; @@ -67,6 +68,7 @@ pub struct PreparedSearch { legal_form_data: Option, address_seed_data: Option, address_context_data: Option, + coreference_data: Option, date_data: Option, monetary_data: Option, } @@ -115,6 +117,8 @@ pub struct PreparedSearchConfig { pub address_seed_data: Option, #[serde(default)] pub address_context_data: Option, + #[serde(default)] + pub coreference_data: Option, pub date_data: Option, pub monetary_data: Option, } @@ -447,6 +451,7 @@ impl PreparedSearch { address_context_data: prepare_address_context_data( config.address_context_data, )?, + coreference_data: prepare_coreference_data(config.coreference_data)?, date_data, monetary_data, }) @@ -912,6 +917,12 @@ impl PreparedSearch { self.threshold, &self.allowed_labels, ); + resolved_entities = self.process_coreference_entities( + full_text, + resolved_entities, + false_positive_filters, + diagnostics.as_deref_mut(), + )?; clear_internal_source_details(&mut resolved_entities); if let Some(diagnostics) = &mut diagnostics { diagnostics.record_entities( @@ -974,6 +985,43 @@ impl PreparedSearch { }; data.process(full_text, existing_entities) } + + fn process_coreference_entities( + &self, + full_text: &str, + existing_entities: Vec, + false_positive_filters: Option<&DenyListFilterData>, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result> { + let Some(data) = &self.coreference_data else { + return Ok(existing_entities); + }; + + let start = Instant::now(); + let coreference_entities = data.process(full_text, &existing_entities)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_entities( + DiagnosticStage::EntityCoreference, + &coreference_entities, + full_text, + Some(elapsed_us(start)), + ); + } + if coreference_entities.is_empty() { + return Ok(existing_entities); + } + + let merged = + 
merge_and_dedup(&[existing_entities, coreference_entities].concat()); + let consistent = enforce_boundary_consistency(&merged, full_text)?; + let sanitized = sanitize_entities_with_source(&consistent, full_text)?; + let filtered = filter_entity_false_positives( + sanitized, + full_text, + false_positive_filters, + )?; + Ok(filter_entities_for_labels(filtered, &self.allowed_labels)) + } } fn process_signature_entities(full_text: &str) -> TimedEntities { @@ -1429,6 +1477,12 @@ fn prepare_address_context_data( data.map(PreparedAddressContextData::new).transpose() } +fn prepare_coreference_data( + data: Option, +) -> Result> { + data.map(PreparedCoreferenceData::new).transpose() +} + fn split_regex_patterns( patterns: Vec, slices: &PreparedSearchSlices, diff --git a/crates/anonymize-core/tests/address_seed_parity.rs b/crates/anonymize-core/tests/address_seed_parity.rs index 36692424..5fe4e266 100644 --- a/crates/anonymize-core/tests/address_seed_parity.rs +++ b/crates/anonymize-core/tests/address_seed_parity.rs @@ -29,6 +29,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { legal_form_data: None, address_seed_data: None, address_context_data: None, + coreference_data: None, date_data: None, monetary_data: None, } diff --git a/crates/anonymize-core/tests/false_positive_parity.rs b/crates/anonymize-core/tests/false_positive_parity.rs index 3f54ff5c..f88d6ae3 100644 --- a/crates/anonymize-core/tests/false_positive_parity.rs +++ b/crates/anonymize-core/tests/false_positive_parity.rs @@ -31,6 +31,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { legal_form_data: None, address_seed_data: None, address_context_data: None, + coreference_data: None, date_data: None, monetary_data: None, } diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 032f76e1..c56332cc 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -3,15 +3,16 @@ use 
std::collections::{BTreeMap, BTreeSet}; use stella_anonymize_core::{ - AddressContextData, AddressSeedData, AmountWordsData, CountryMatchData, - CurrencyData, DateData, DenyListFilterData, DenyListMatchData, - DetectionSource, DiagnosticEventKind, DiagnosticStage, Error, - FuzzySearchOptions, GazetteerMatchData, HotwordRule, HotwordRuleData, - LegalFormData, LiteralSearchOptions, MagnitudeSuffixData, MonetaryData, - OperatorConfig, PatternSlice, PreparedSearch, PreparedSearchArtifacts, - PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, - RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, TriggerData, - TriggerRule, TriggerStrategy, TriggerValidation, WrittenAmountPatternData, + AddressContextData, AddressSeedData, AmountWordsData, CoreferenceData, + CoreferencePatternData, CountryMatchData, CurrencyData, DateData, + DenyListFilterData, DenyListMatchData, DetectionSource, DiagnosticEventKind, + DiagnosticStage, Error, FuzzySearchOptions, GazetteerMatchData, HotwordRule, + HotwordRuleData, LegalFormData, LiteralSearchOptions, MagnitudeSuffixData, + MonetaryData, OperatorConfig, PatternSlice, PreparedSearch, + PreparedSearchArtifacts, PreparedSearchConfig, PreparedSearchSlices, + RegexMatchMeta, RegexSearchOptions, SearchOptions, SearchPattern, + SourceDetail, TriggerData, TriggerRule, TriggerStrategy, TriggerValidation, + WrittenAmountPatternData, }; fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { @@ -37,6 +38,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { legal_form_data: None, address_seed_data: None, address_context_data: None, + coreference_data: None, date_data: None, monetary_data: None, } @@ -118,6 +120,17 @@ fn address_context_data() -> AddressContextData { } } +fn coreference_data() -> CoreferenceData { + CoreferenceData { + definition_patterns: vec![CoreferencePatternData { + pattern: String::from(r#"\((?:hereinafter|the)\s+["']([^"']+)["']\)"#), + flags: String::from("gi"), + }], 
+ role_stop_terms: vec![String::from("seller")], + legal_form_aliases: vec![String::from("LLC")], + } +} + #[test] fn prepared_search_runs_legal_form_pass_on_normalized_text() { let prepared = legal_form_prepared_search(vec!["Pty Ltd"]); @@ -163,6 +176,7 @@ fn prepared_search_runs_normalized_literal_pass() { legal_form_data: None, address_seed_data: None, address_context_data: None, + coreference_data: None, date_data: None, monetary_data: None, }) @@ -375,6 +389,112 @@ fn prepared_search_measures_header_zone_in_text_offsets() { ); } +#[test] +fn prepared_search_adds_coreference_aliases_with_source_placeholder() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"Acme Corporation", + ))], + regex_meta: vec![RegexMatchMeta::new("organization", 1.0)], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("organization")], + coreference_data: Some(coreference_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + r#"Acme Corporation (the "Acme") signed. Acme later paid."#, + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + entity.source == DetectionSource::Coreference && entity.text == "Acme" + })); + assert_eq!( + result.redaction.redacted_text, + r#"[ORGANIZATION_1] (the "[ORGANIZATION_1]") signed. 
[ORGANIZATION_1] later paid."#, + ); +} + +#[test] +fn prepared_search_does_not_seed_coreference_from_caller_owned_entities() { + let mut meta = RegexMatchMeta::new("organization", 1.0); + meta.source_detail = Some(SourceDetail::CustomRegex); + let prepared = PreparedSearch::new(PreparedSearchConfig { + custom_regex_patterns: vec![SearchPattern::Regex(String::from( + r"Acme Corporation", + ))], + custom_regex_meta: vec![meta], + slices: PreparedSearchSlices { + custom_regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("organization")], + coreference_data: Some(coreference_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + r#"Acme Corporation (the "Acme") signed. Acme later paid."#, + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + !result + .resolved_entities + .iter() + .any(|entity| { entity.source == DetectionSource::Coreference }) + ); +} + +#[test] +fn prepared_search_rejects_role_and_legal_form_coreference_aliases() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![ + SearchPattern::Regex(String::from(r"Acme Corporation")), + SearchPattern::Regex(String::from(r"Beta LLC")), + ], + regex_meta: vec![ + RegexMatchMeta::new("organization", 1.0), + RegexMatchMeta::new("organization", 1.0), + ], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 2 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("organization")], + coreference_data: Some(coreference_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + r#"Acme Corporation (the "Seller") signed. Seller paid. Beta LLC (the "LLC") joined. 
LLC remained."#, + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + !result + .resolved_entities + .iter() + .any(|entity| { entity.source == DetectionSource::Coreference }) + ); +} + #[test] fn prepared_search_artifacts_match_direct_prepare() { let config = PreparedSearchConfig { @@ -410,6 +530,7 @@ fn prepared_search_artifacts_match_direct_prepare() { legal_form_data: None, address_seed_data: None, address_context_data: None, + coreference_data: None, date_data: None, monetary_data: None, }; @@ -559,6 +680,7 @@ fn prepared_search_emits_static_detector_entities() { legal_form_data: None, address_seed_data: None, address_context_data: None, + coreference_data: None, date_data: None, monetary_data: None, }) @@ -1257,6 +1379,7 @@ fn prepared_search_redacts_static_entities_end_to_end() { legal_form_data: None, address_seed_data: None, address_context_data: None, + coreference_data: None, date_data: None, monetary_data: None, }) @@ -1613,6 +1736,7 @@ fn prepared_search_reports_static_redaction_diagnostics() { legal_form_data: None, address_seed_data: None, address_context_data: None, + coreference_data: None, date_data: None, monetary_data: None, }) @@ -1697,6 +1821,7 @@ fn prepared_search_redacts_custom_deny_list_entities() { legal_form_data: None, address_seed_data: None, address_context_data: None, + coreference_data: None, date_data: None, monetary_data: None, }) diff --git a/crates/anonymize-core/tests/trigger_parity.rs b/crates/anonymize-core/tests/trigger_parity.rs index b475681a..3538ab11 100644 --- a/crates/anonymize-core/tests/trigger_parity.rs +++ b/crates/anonymize-core/tests/trigger_parity.rs @@ -29,6 +29,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { legal_form_data: None, address_seed_data: None, address_context_data: None, + coreference_data: None, date_data: None, monetary_data: None, } diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs 
index f945e0e8..07010872 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -1350,7 +1350,7 @@ function describeUnsupportedPipelineStages( if (config.enableZoneClassification) { stages.push("zone-classification"); } - if (config.enableCoreference) { + if (config.enableCoreference && !nativeRuntime) { stages.push("coreference"); } if (!nativeRuntime && sliceLength(search.slices.streetTypes) > 0) { diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index bf4deab9..e509aa94 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -1170,6 +1170,73 @@ describe("native adapter parity", () => { }); }); + test("native pipeline package matches TS coreference aliases", async () => { + const adapters = getAdapters(); + const fullText = 'Acme LLC (the "Acme") signed. 
Acme paid.'; + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: true, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["organization"], + workspaceId: "native-pipeline-coreference-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + label: "organization", + text: "Acme", + source: "coreference", + corefSourceText: "Acme LLC", + }), + ]), + ); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + test("native pipeline compatibility rejects TS-only contextual passes", () => { const config: PipelineConfig = { threshold: 0.3, @@ -1193,7 +1260,6 @@ describe("native adapter parity", () => { unsupportedFeatures: [ "enableNer", "enableNameCorpus", - "enableCoreference", "enableZoneClassification", ], }); diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts 
b/packages/anonymize/src/__test__/pipeline-config.test.ts index 55fc4f3a..92bfadcf 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -224,6 +224,29 @@ describe("pipeline config semantics", () => { ); }); + test("native config carries coreference definition data", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableCoreference: true, + enableRegex: true, + labels: ["organization"], + }, + [], + createPipelineContext(), + ); + + expect( + search.nativeStaticConfig.coreference_data?.definition_patterns.length, + ).toBeGreaterThan(0); + expect( + search.nativeStaticConfig.coreference_data?.role_stop_terms, + ).toContain("seller"); + expect( + search.nativeStaticConfig.coreference_data?.legal_form_aliases, + ).toContain("LLC"); + }); + test("native trigger config carries legal suffix data without legal-form search", async () => { const search = await buildUnifiedSearch( { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 1ca88e6b..98c48217 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -34,6 +34,7 @@ import type { TriggerRule } from "./types"; import type { DenyListData, DenyListFilterData } from "./detectors/deny-list"; import type { PipelineContext } from "./context"; import { defaultContext } from "./context"; +import { loadLanguageConfigs } from "./util/lang-loader"; import { REGEX_PATTERNS, @@ -235,6 +236,18 @@ export type NativeDateData = { export type NativeMonetaryData = MonetaryData; export type NativeAddressSeedData = AddressSeedData; export type NativeAddressContextData = AddressContextData; +export type NativeCoreferencePatternData = { + pattern: string; + flags: string; +}; +export type NativeCoreferenceData = { + definition_patterns: NativeCoreferencePatternData[]; + role_stop_terms: string[]; + 
legal_form_aliases: string[]; +}; +type GenericRolesData = { + roles: string[]; +}; export type NativeGazetteerData = { labels: string[]; is_fuzzy: boolean[]; @@ -286,6 +299,7 @@ export type NativePreparedSearchConfig = { legal_form_data?: NativeLegalFormData; address_seed_data?: NativeAddressSeedData; address_context_data?: NativeAddressContextData; + coreference_data?: NativeCoreferenceData; date_data?: NativeDateData; monetary_data?: NativeMonetaryData; }; @@ -348,6 +362,11 @@ type CountryPatternResult = { data: CountryData; }; +type CoreferenceConfigRow = { + pattern: string; + flags: string; +}; + type UnifiedSearchSources = { allRegex: PatternEntry[]; regexMeta: RegexMeta[]; @@ -369,6 +388,7 @@ type UnifiedSearchSources = { nativeMonetaryData: NativeMonetaryData | null; nativeAddressSeedData: NativeAddressSeedData | null; nativeAddressContextData: NativeAddressContextData | null; + nativeCoreferenceData: NativeCoreferenceData | null; nativeSigningPatterns: readonly string[]; partyPositionTerms: string[]; hotwordRules: readonly HotwordRule[]; @@ -432,8 +452,9 @@ const buildUnifiedSearchSources = async ( monetaryData, addressSeedData, addressContextData, + coreferenceData, ] = await Promise.all([ - legalFormsEnabled || config.enableTriggerPhrases + legalFormsEnabled || config.enableTriggerPhrases || config.enableCoreference ? warmLegalRoleHeads() : Promise.resolve(), config.enableTriggerPhrases @@ -480,6 +501,9 @@ const buildUnifiedSearchSources = async ( labelIsAllowed("address", allowedLabels) ? Promise.resolve(getAddressContextData()) : Promise.resolve(null), + config.enableCoreference + ? buildNativeCoreferenceData() + : Promise.resolve(null), ]); // Read but never populated: the legal-form slice in the unified // search is permanently empty after the v2 rewrite. Tracking it @@ -492,7 +516,7 @@ const buildUnifiedSearchSources = async ( ? 
[...getKnownLegalSuffixes()] : []; const nativeLegalFormSuffixes = - legalFormsEnabled || config.enableTriggerPhrases + legalFormsEnabled || config.enableTriggerPhrases || config.enableCoreference ? [...getKnownLegalSuffixes()] : []; const nativeLegalFormData = @@ -736,6 +760,13 @@ const buildUnifiedSearchSources = async ( nativeMonetaryData, nativeAddressSeedData: addressSeedData, nativeAddressContextData: addressContextData, + nativeCoreferenceData: + coreferenceData === null + ? null + : { + ...coreferenceData, + legal_form_aliases: nativeLegalFormSuffixes, + }, nativeSigningPatterns, partyPositionTerms, hotwordRules, @@ -787,6 +818,7 @@ export const buildNativeStaticSearchBundle = async ( monetaryData: sources.nativeMonetaryData, addressSeedData: sources.nativeAddressSeedData, addressContextData: sources.nativeAddressContextData, + coreferenceData: sources.nativeCoreferenceData, nativeSigningPatterns: sources.nativeSigningPatterns, partyPositionTerms: sources.partyPositionTerms, hotwordRules: sources.hotwordRules, @@ -873,6 +905,7 @@ export const buildUnifiedSearch = async ( monetaryData: sources.nativeMonetaryData, addressSeedData: sources.nativeAddressSeedData, addressContextData: sources.nativeAddressContextData, + coreferenceData: sources.nativeCoreferenceData, nativeSigningPatterns: sources.nativeSigningPatterns, partyPositionTerms: sources.partyPositionTerms, hotwordRules: sources.hotwordRules, @@ -923,6 +956,7 @@ type BuildNativeStaticConfigArgs = { monetaryData: NativeMonetaryData | null; addressSeedData: NativeAddressSeedData | null; addressContextData: NativeAddressContextData | null; + coreferenceData: NativeCoreferenceData | null; nativeSigningPatterns: readonly string[]; partyPositionTerms: readonly string[]; hotwordRules: readonly HotwordRule[]; @@ -954,6 +988,7 @@ const buildNativeStaticConfig = ({ monetaryData, addressSeedData, addressContextData, + coreferenceData, nativeSigningPatterns, partyPositionTerms, hotwordRules, @@ -1161,6 +1196,9 @@ 
const buildNativeStaticConfig = ({ if (addressContextData) { nativeConfig.address_context_data = addressContextData; } + if (coreferenceData) { + nativeConfig.coreference_data = coreferenceData; + } if (dateData) { nativeConfig.date_data = dateData; } @@ -1472,6 +1510,35 @@ const sentenceTerminalCurrencyTerms = ( ].toSorted(); }; +const buildNativeCoreferenceData = async (): Promise => { + const roleModule = await import("./data/generic-roles.json"); + const roleData = (roleModule.default ?? roleModule) as GenericRolesData; + const configs = await loadLanguageConfigs( + "coreference", + (mod) => { + const moduleValue = mod as { + default?: readonly CoreferenceConfigRow[]; + }; + return moduleValue.default ?? (mod as readonly CoreferenceConfigRow[]); + }, + ); + const definitionPatterns: NativeCoreferencePatternData[] = []; + for (const rows of configs) { + for (const row of rows) { + definitionPatterns.push({ + pattern: row.pattern, + flags: row.flags, + }); + } + } + + return { + definition_patterns: definitionPatterns, + role_stop_terms: roleData.roles, + legal_form_aliases: [], + }; +}; + const createStringGroupEncoder = (): { table: string[]; encode: (values: string | readonly string[] | undefined) => number[]; diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts index e256b390..655bc9c4 100644 --- a/packages/anonymize/src/native-pipeline.ts +++ b/packages/anonymize/src/native-pipeline.ts @@ -19,7 +19,6 @@ import { export type NativePipelineUnsupportedFeature = | "enableNer" | "enableNameCorpus" - | "enableCoreference" | "enableZoneClassification"; export type NativePipelineCompatibility = @@ -107,7 +106,6 @@ export const getNativePipelineCompatibility = ( if (config.enableNer) unsupportedFeatures.push("enableNer"); if (config.enableNameCorpus) unsupportedFeatures.push("enableNameCorpus"); - if (config.enableCoreference) unsupportedFeatures.push("enableCoreference"); if (config.enableZoneClassification === true) { 
unsupportedFeatures.push("enableZoneClassification"); } From 03e48551ac7d8d725cf88473abbd805e7ef6e95f Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 07:02:18 +0200 Subject: [PATCH 068/130] feat: port native zone adjustments --- crates/anonymize-adapter-contract/src/lib.rs | 65 ++- crates/anonymize-core/src/diagnostics.rs | 1 + crates/anonymize-core/src/lib.rs | 2 + crates/anonymize-core/src/prepared.rs | 58 ++- crates/anonymize-core/src/zones.rs | 450 ++++++++++++++++++ .../tests/address_seed_parity.rs | 1 + .../tests/false_positive_parity.rs | 1 + crates/anonymize-core/tests/prepared.rs | 110 ++++- crates/anonymize-core/tests/trigger_parity.rs | 1 + .../scripts/migration-fixture-perf.mjs | 2 +- .../__test__/native-adapter-parity.test.ts | 67 ++- .../src/__test__/pipeline-config.test.ts | 24 + .../anonymize/src/build-unified-search.ts | 62 +++ packages/anonymize/src/native-pipeline.ts | 8 +- 14 files changed, 830 insertions(+), 22 deletions(-) create mode 100644 crates/anonymize-core/src/zones.rs diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 2d724fcf..297400e6 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -14,19 +14,20 @@ use stella_anonymize_core::{ ShareQuantityTermData, SigningPlaceGuardData, SourceDetail, StaticRedactionDiagnosticResult, StaticRedactionDiagnostics, StaticRedactionResult, StringGroups, TriggerData, TriggerRule, - TriggerStrategy, TriggerValidation, WrittenAmountPatternData, + TriggerStrategy, TriggerValidation, WrittenAmountPatternData, ZoneData, + ZonePatternData, ZoneSigningClauseData, }; pub type Result = std::result::Result; const PREPARED_SEARCH_PACKAGE_HEADER: [u8; 8] = *b"ANONPKG1"; -const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 9; +const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 10; const PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONPKZ1"; -const 
PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 7; +const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 8; const PREPARED_SEARCH_CORE_PACKAGE_HEADER: [u8; 8] = *b"ANONCPK1"; -const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 8; +const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 9; const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONCPZ1"; -const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 8; +const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 9; const PREPARED_SEARCH_PACKAGE_DIGEST_BYTES: usize = 32; const PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL: i32 = 3; const MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES: usize = 256 * 1024 * 1024; @@ -357,6 +358,31 @@ pub struct BindingAddressContextData { pub bare_house_stopwords: Vec, } +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingZoneData { + #[serde(default)] + pub section_heading_patterns: Vec, + #[serde(default)] + pub signing_clauses: Vec, +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingZonePatternData { + pub pattern: String, + #[serde(default)] + pub flags: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingZoneSigningClauseData { + #[serde(default)] + pub prefix: String, + #[serde(default)] + pub suffix: String, + #[serde(default)] + pub prepositions: Vec, +} + #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] pub struct BindingCoreferenceData { #[serde(default)] @@ -479,6 +505,8 @@ pub struct BindingPreparedSearchConfig { #[serde(default)] pub address_seed_data: Option, #[serde(default)] + pub zone_data: Option, + #[serde(default)] pub address_context_data: Option, #[serde(default)] pub coreference_data: Option, @@ -541,6 +569,7 @@ struct BinaryPreparedSearchConfig { trigger_data: Option, legal_form_data: Option, address_seed_data: Option, + zone_data: Option, address_context_data: Option, coreference_data: Option, 
date_data: Option, @@ -744,6 +773,7 @@ impl From for BinaryPreparedSearchConfig { trigger_data: config.trigger_data.map(BinaryTriggerData::from), legal_form_data: config.legal_form_data, address_seed_data: config.address_seed_data, + zone_data: config.zone_data, address_context_data: config.address_context_data, coreference_data: config.coreference_data, date_data: config.date_data, @@ -777,6 +807,7 @@ impl From for BindingPreparedSearchConfig { trigger_data: config.trigger_data.map(BindingTriggerData::from), legal_form_data: config.legal_form_data, address_seed_data: config.address_seed_data, + zone_data: config.zone_data, address_context_data: config.address_context_data, coreference_data: config.coreference_data, date_data: config.date_data, @@ -1233,6 +1264,7 @@ pub fn prepared_search_config_from_binding( br_cep_cue_words: data.br_cep_cue_words, unit_abbreviations: data.unit_abbreviations, }), + zone_data: config.zone_data.map(zone_data_from_binding), address_context_data: config.address_context_data.map(|data| { AddressContextData { address_prepositions: data.address_prepositions, @@ -1596,6 +1628,28 @@ fn coreference_data_from_binding( } } +fn zone_data_from_binding(data: BindingZoneData) -> ZoneData { + ZoneData { + section_heading_patterns: data + .section_heading_patterns + .into_iter() + .map(|pattern| ZonePatternData { + pattern: pattern.pattern, + flags: pattern.flags, + }) + .collect(), + signing_clauses: data + .signing_clauses + .into_iter() + .map(|clause| ZoneSigningClauseData { + prefix: clause.prefix, + suffix: clause.suffix, + prepositions: clause.prepositions, + }) + .collect(), + } +} + pub fn operator_config_from_binding( config: Option, ) -> Result { @@ -2178,6 +2232,7 @@ fn diagnostic_stage_name(stage: DiagnosticStage) -> String { DiagnosticStage::EntitySignature => "entity.signature", DiagnosticStage::EntityLegalForm => "entity.legal-form", DiagnosticStage::EntityAddressSeed => "entity.address-seed", + DiagnosticStage::EntityZoneAdjustment 
=> "entity.zone-adjustment", DiagnosticStage::EntityAddressContext => "entity.address-context", DiagnosticStage::EntityCoreference => "entity.coreference", DiagnosticStage::Merge => "resolution.merge", diff --git a/crates/anonymize-core/src/diagnostics.rs b/crates/anonymize-core/src/diagnostics.rs index 1f102fe5..9fae4eb8 100644 --- a/crates/anonymize-core/src/diagnostics.rs +++ b/crates/anonymize-core/src/diagnostics.rs @@ -37,6 +37,7 @@ pub enum DiagnosticStage { EntitySignature, EntityLegalForm, EntityAddressSeed, + EntityZoneAdjustment, EntityAddressContext, EntityCoreference, Merge, diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index 2b05b713..a6adeaeb 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -25,6 +25,7 @@ mod signatures; mod triggers; mod types; mod validators; +mod zones; pub use address_context::AddressContextData; pub use address_seeds::AddressSeedData; @@ -71,3 +72,4 @@ pub use types::{ PlaceholderEntry, PlaceholderMap, RedactionEntry, RedactionResult, Result, SearchEngine, SearchMatch, }; +pub use zones::{ZoneData, ZonePatternData, ZoneSigningClauseData}; diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index dcdcb442..39b36ff7 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -39,6 +39,7 @@ use crate::types::{ Entity, EntityKind, Error, OperatorConfig, RedactionResult, Result, SearchMatch, }; +use crate::zones::{PreparedZoneData, ZoneData}; const PREPARED_SEARCH_ARTIFACTS_HEADER: [u8; 8] = *b"ANONPSR1"; const PREPARED_SEARCH_ARTIFACTS_VERSION: u32 = 1; @@ -67,6 +68,7 @@ pub struct PreparedSearch { trigger_data: Option, legal_form_data: Option, address_seed_data: Option, + zone_data: Option, address_context_data: Option, coreference_data: Option, date_data: Option, @@ -116,6 +118,8 @@ pub struct PreparedSearchConfig { pub legal_form_data: Option, pub address_seed_data: Option, 
#[serde(default)] + pub zone_data: Option, + #[serde(default)] pub address_context_data: Option, #[serde(default)] pub coreference_data: Option, @@ -448,6 +452,7 @@ impl PreparedSearch { .transpose()?, legal_form_data: config.legal_form_data.map(PreparedLegalFormData::new), address_seed_data: prepare_address_seed_data(config.address_seed_data)?, + zone_data: prepare_zone_data(config.zone_data.as_ref())?, address_context_data: prepare_address_context_data( config.address_context_data, )?, @@ -854,10 +859,10 @@ impl PreparedSearch { ) -> Result { let detections = self .detect_static_entities_inner(full_text, diagnostics.as_deref_mut())?; - let pre_threshold_entities = self.apply_hotword_entities( - detections.all_entities(), + let pre_threshold_entities = self.prepare_pre_threshold_entities( + &detections, full_text, - &detections.matches.literal, + diagnostics.as_deref_mut(), )?; let mut raw_entities = filter_entities_for_redaction( pre_threshold_entities, @@ -953,6 +958,24 @@ impl PreparedSearch { }) } + fn prepare_pre_threshold_entities( + &self, + detections: &StaticDetectionResult, + full_text: &str, + diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result> { + let zone_adjusted_entities = self.apply_zone_adjustments( + detections.all_entities(), + full_text, + diagnostics, + )?; + self.apply_hotword_entities( + zone_adjusted_entities, + full_text, + &detections.matches.literal, + ) + } + fn apply_hotword_entities( &self, entities: Vec, @@ -972,6 +995,29 @@ impl PreparedSearch { ) } + fn apply_zone_adjustments( + &self, + entities: Vec, + full_text: &str, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result> { + let Some(data) = &self.zone_data else { + return Ok(entities); + }; + + let start = Instant::now(); + let adjusted = data.adjust_entities(full_text, entities)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_stage( + DiagnosticStage::EntityZoneAdjustment, + Some(adjusted.boosted), + 
Some(elapsed_us(start)), + Some(full_text.len()), + ); + } + Ok(adjusted.entities) + } + fn process_address_context_entities( &self, full_text: &str, @@ -1477,6 +1523,12 @@ fn prepare_address_context_data( data.map(PreparedAddressContextData::new).transpose() } +fn prepare_zone_data( + data: Option<&ZoneData>, +) -> Result> { + data.map(PreparedZoneData::new).transpose() +} + fn prepare_coreference_data( data: Option, ) -> Result> { diff --git a/crates/anonymize-core/src/zones.rs b/crates/anonymize-core/src/zones.rs new file mode 100644 index 00000000..d6c54d33 --- /dev/null +++ b/crates/anonymize-core/src/zones.rs @@ -0,0 +1,450 @@ +use regex::{Regex, RegexBuilder}; + +use crate::resolution::PipelineEntity; +use crate::types::{Error, Result}; + +const MIN_TABS_FOR_TABLE: usize = 2; + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct ZoneData { + #[serde(default)] + pub section_heading_patterns: Vec, + #[serde(default)] + pub signing_clauses: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct ZonePatternData { + pub pattern: String, + #[serde(default)] + pub flags: String, +} + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct ZoneSigningClauseData { + #[serde(default)] + pub prefix: String, + #[serde(default)] + pub suffix: String, + #[serde(default)] + pub prepositions: Vec, +} + +pub(crate) struct PreparedZoneData { + section_heading_patterns: Vec, + signing_clause_patterns: Vec, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum DocumentZone { + Header, + Signature, + Body, + Table, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct ZoneSpan { + zone: DocumentZone, + start: u32, + end: u32, +} + +struct Line<'a> { + text: &'a str, + start: usize, +} + +pub(crate) struct ZoneAdjustmentResult { + pub(crate) entities: Vec, + pub(crate) boosted: usize, +} + +impl PreparedZoneData { + pub(crate) 
fn new(data: &ZoneData) -> Result { + Ok(Self { + section_heading_patterns: data + .section_heading_patterns + .iter() + .map(|pattern| { + compile_pattern("zone_data.section_heading_patterns", pattern) + }) + .collect::>>()?, + signing_clause_patterns: data + .signing_clauses + .iter() + .map(compile_signing_clause_pattern) + .collect::>>()?, + }) + } + + pub(crate) fn adjust_entities( + &self, + full_text: &str, + entities: Vec, + ) -> Result { + if entities.is_empty() { + return Ok(ZoneAdjustmentResult { + entities, + boosted: 0, + }); + } + + let zones = self.classify(full_text)?; + let mut boosted: usize = 0; + let adjusted = entities + .into_iter() + .map(|mut entity| { + let zone = zone_for_entity(&zones, &entity); + let adjustment = score_adjustment(zone); + if adjustment > 0.0 { + let score = f64::min(1.0, entity.score + adjustment); + if score > entity.score { + boosted = boosted.saturating_add(1); + entity.score = score; + } + } + entity + }) + .collect(); + + Ok(ZoneAdjustmentResult { + entities: adjusted, + boosted, + }) + } + + fn classify(&self, full_text: &str) -> Result> { + if full_text.is_empty() { + return Ok(Vec::new()); + } + + let lines = split_lines(full_text); + let header_end_line = + first_matching_line(&lines, &self.section_heading_patterns); + let signature_start_line = + last_matching_line(&lines, &self.signing_clause_patterns); + + let mut header_end_offset = header_end_line + .and_then(|line| lines.get(line)) + .map_or(0, |line| line.start); + let signature_start_offset = signature_start_line + .and_then(|line| lines.get(line)) + .map_or(full_text.len(), |line| line.start); + + let mut header_line = header_end_line; + if header_end_line.is_some_and(|line| line > 0) + && signature_start_line.is_some() + && header_end_offset > signature_start_offset + { + header_line = None; + header_end_offset = 0; + } + + let mut zones = Vec::new(); + if header_line.is_some_and(|line| line > 0) { + zones.push(ZoneSpan { + zone: DocumentZone::Header, 
+ start: usize_to_u32("zone.header.start", 0)?, + end: usize_to_u32("zone.header.end", header_end_offset)?, + }); + } + + let body_start = if header_line.is_some_and(|line| line > 0) { + header_end_offset + } else { + 0 + }; + let body_end = signature_start_offset; + add_table_zones( + &mut zones, + &lines, + header_line.unwrap_or(0), + signature_start_line.unwrap_or(lines.len()), + body_end, + )?; + add_body_zones(&mut zones, body_start, body_end)?; + + if signature_start_line.is_some() { + zones.push(ZoneSpan { + zone: DocumentZone::Signature, + start: usize_to_u32("zone.signature.start", signature_start_offset)?, + end: usize_to_u32("zone.signature.end", full_text.len())?, + }); + } + + zones.sort_by_key(|zone| zone.start); + Ok(zones) + } +} + +fn first_matching_line( + lines: &[Line<'_>], + patterns: &[Regex], +) -> Option { + for (index, line) in lines.iter().enumerate() { + if patterns.iter().any(|pattern| pattern.is_match(line.text)) { + return Some(index); + } + } + None +} + +fn last_matching_line(lines: &[Line<'_>], patterns: &[Regex]) -> Option { + for (index, line) in lines.iter().enumerate().rev() { + if patterns.iter().any(|pattern| pattern.is_match(line.text)) { + return Some(index); + } + } + None +} + +fn add_table_zones( + zones: &mut Vec, + lines: &[Line<'_>], + start_line: usize, + end_line: usize, + body_end: usize, +) -> Result<()> { + let mut table_start = None; + for line in lines + .iter() + .enumerate() + .skip(start_line) + .take(end_line.saturating_sub(start_line)) + .map(|(_, line)| line) + { + if is_table_line(line.text) { + table_start.get_or_insert(line.start); + continue; + } + + if let Some(start) = table_start.take() { + zones.push(ZoneSpan { + zone: DocumentZone::Table, + start: usize_to_u32("zone.table.start", start)?, + end: usize_to_u32("zone.table.end", line.start)?, + }); + } + } + + if let Some(start) = table_start { + zones.push(ZoneSpan { + zone: DocumentZone::Table, + start: usize_to_u32("zone.table.start", start)?, + 
end: usize_to_u32("zone.table.end", body_end)?, + }); + } + + Ok(()) +} + +fn add_body_zones( + zones: &mut Vec, + body_start: usize, + body_end: usize, +) -> Result<()> { + let mut special = zones.clone(); + special.sort_by_key(|zone| zone.start); + + let mut cursor = usize_to_u32("zone.body.start", body_start)?; + let body_end = usize_to_u32("zone.body.end", body_end)?; + for span in special { + if span.zone == DocumentZone::Header { + continue; + } + if span.start > cursor { + zones.push(ZoneSpan { + zone: DocumentZone::Body, + start: cursor, + end: span.start, + }); + } + cursor = u32::max(cursor, span.end); + } + + if cursor < body_end { + zones.push(ZoneSpan { + zone: DocumentZone::Body, + start: cursor, + end: body_end, + }); + } + + Ok(()) +} + +fn zone_for_entity( + zones: &[ZoneSpan], + entity: &PipelineEntity, +) -> DocumentZone { + let midpoint = f64::midpoint(f64::from(entity.start), f64::from(entity.end)); + for zone in zones { + if midpoint >= f64::from(zone.start) && midpoint < f64::from(zone.end) { + return zone.zone; + } + } + DocumentZone::Body +} + +const fn score_adjustment(zone: DocumentZone) -> f64 { + match zone { + DocumentZone::Header => 0.1, + DocumentZone::Signature => 0.15, + DocumentZone::Body => 0.0, + DocumentZone::Table => 0.05, + } +} + +fn split_lines(full_text: &str) -> Vec> { + let mut offset: usize = 0; + let mut lines = Vec::new(); + for line in full_text.split('\n') { + let start = offset; + let end = start.saturating_add(line.len()); + lines.push(Line { text: line, start }); + offset = end.saturating_add(1); + } + lines +} + +fn is_table_line(line: &str) -> bool { + line + .chars() + .filter(|ch| *ch == '\t') + .take(MIN_TABS_FOR_TABLE) + .count() + >= MIN_TABS_FOR_TABLE +} + +fn compile_pattern( + field: &'static str, + data: &ZonePatternData, +) -> Result { + let mut builder = RegexBuilder::new(&data.pattern); + for flag in data.flags.chars() { + match flag { + 'u' => {} + 'i' => { + builder.case_insensitive(true); + } + 
'm' => { + builder.multi_line(true); + } + 's' => { + builder.dot_matches_new_line(true); + } + _ => { + return Err(Error::InvalidStaticData { + field, + reason: format!("unsupported regex flag '{flag}'"), + }); + } + } + } + builder.build().map_err(|error| Error::InvalidStaticData { + field, + reason: error.to_string(), + }) +} + +fn compile_signing_clause_pattern( + data: &ZoneSigningClauseData, +) -> Result { + let place = if data.prepositions.is_empty() { + String::from(r"\p{Lu}\p{Ll}+(?:[- ]\p{Lu}\p{Ll}+)*") + } else { + format!( + r"\p{{Lu}}\p{{Ll}}+(?:\s+(?:{})\s+\p{{Lu}}\p{{Ll}}+)*(?:\s+\p{{Lu}}\p{{Ll}}+)*", + data.prepositions.join("|") + ) + }; + let pattern = format!(r"^\s*(?:{}{}{})", data.prefix, place, data.suffix); + compile_pattern( + "zone_data.signing_clauses", + &ZonePatternData { + pattern, + flags: String::new(), + }, + ) +} + +fn usize_to_u32(field: &'static str, value: usize) -> Result { + u32::try_from(value).map_err(|_| Error::InvalidStaticData { + field, + reason: String::from("offset exceeds u32 range"), + }) +} + +#[cfg(test)] +mod tests { + #![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] + + use super::*; + use crate::resolution::{DetectionSource, PipelineEntity}; + + fn test_data() -> PreparedZoneData { + PreparedZoneData::new(&ZoneData { + section_heading_patterns: vec![ZonePatternData { + pattern: String::from(r"^\s*(?:Article|Článek)\s*1"), + flags: String::from("iu"), + }], + signing_clauses: vec![ZoneSigningClauseData { + prefix: String::from(r"(?:V|Ve)\s+"), + suffix: String::from(r"\s*,?\s*dne"), + prepositions: vec![String::from("nad")], + }], + }) + .unwrap() + } + + #[test] + fn classifies_header_table_and_signature_zones() { + let data = test_data(); + let text = [ + "Parties", + "Alice", + "Article 1", + "Name\tAddress\tId", + "Alice\tPrague\t123", + "Body", + "V Praze dne 1.1.2024", + "Alice", + ] + .join("\n"); + + let zones = data.classify(&text).unwrap(); + + 
assert_eq!(zones.first().unwrap().zone, DocumentZone::Header); + assert!(zones.iter().any(|zone| zone.zone == DocumentZone::Table)); + assert_eq!(zones.last().unwrap().zone, DocumentZone::Signature); + assert_eq!(zones.first().unwrap().start, 0); + assert_eq!( + zones.last().unwrap().end, + u32::try_from(text.len()).unwrap() + ); + } + + #[test] + fn boosts_scores_for_pii_dense_zones() { + let data = test_data(); + let text = ["Alice", "Article 1"].join("\n"); + let entities = vec![PipelineEntity::detected( + 0, + 5, + "person", + "Alice", + 0.45, + DetectionSource::Regex, + )]; + + let adjusted = data.adjust_entities(&text, entities).unwrap(); + + assert_eq!(adjusted.boosted, 1); + assert!((adjusted.entities[0].score - 0.55).abs() < 1e-12); + } +} diff --git a/crates/anonymize-core/tests/address_seed_parity.rs b/crates/anonymize-core/tests/address_seed_parity.rs index 5fe4e266..f5bcd38f 100644 --- a/crates/anonymize-core/tests/address_seed_parity.rs +++ b/crates/anonymize-core/tests/address_seed_parity.rs @@ -28,6 +28,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { trigger_data: None, legal_form_data: None, address_seed_data: None, + zone_data: None, address_context_data: None, coreference_data: None, date_data: None, diff --git a/crates/anonymize-core/tests/false_positive_parity.rs b/crates/anonymize-core/tests/false_positive_parity.rs index f88d6ae3..ffd55b66 100644 --- a/crates/anonymize-core/tests/false_positive_parity.rs +++ b/crates/anonymize-core/tests/false_positive_parity.rs @@ -30,6 +30,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { trigger_data: None, legal_form_data: None, address_seed_data: None, + zone_data: None, address_context_data: None, coreference_data: None, date_data: None, diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index c56332cc..a085b2e4 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ 
b/crates/anonymize-core/tests/prepared.rs @@ -12,7 +12,7 @@ use stella_anonymize_core::{ PreparedSearchArtifacts, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchOptions, SearchPattern, SourceDetail, TriggerData, TriggerRule, TriggerStrategy, TriggerValidation, - WrittenAmountPatternData, + WrittenAmountPatternData, ZoneData, ZonePatternData, ZoneSigningClauseData, }; fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { @@ -37,6 +37,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { trigger_data: None, legal_form_data: None, address_seed_data: None, + zone_data: None, address_context_data: None, coreference_data: None, date_data: None, @@ -120,6 +121,20 @@ fn address_context_data() -> AddressContextData { } } +fn zone_data() -> ZoneData { + ZoneData { + section_heading_patterns: vec![ZonePatternData { + pattern: String::from(r"^\s*(?:Article|Článek)\s*1"), + flags: String::from("iu"), + }], + signing_clauses: vec![ZoneSigningClauseData { + prefix: String::from(r"(?:V|Ve)\s+"), + suffix: String::from(r"\s*,?\s*dne"), + prepositions: vec![String::from("nad")], + }], + } +} + fn coreference_data() -> CoreferenceData { CoreferenceData { definition_patterns: vec![CoreferencePatternData { @@ -175,6 +190,7 @@ fn prepared_search_runs_normalized_literal_pass() { trigger_data: None, legal_form_data: None, address_seed_data: None, + zone_data: None, address_context_data: None, coreference_data: None, date_data: None, @@ -529,6 +545,7 @@ fn prepared_search_artifacts_match_direct_prepare() { trigger_data: None, legal_form_data: None, address_seed_data: None, + zone_data: None, address_context_data: None, coreference_data: None, date_data: None, @@ -679,6 +696,7 @@ fn prepared_search_emits_static_detector_entities() { trigger_data: None, legal_form_data: None, address_seed_data: None, + zone_data: None, address_context_data: None, coreference_data: None, date_data: None, @@ -1378,6 +1396,7 @@ fn 
prepared_search_redacts_static_entities_end_to_end() { trigger_data: None, legal_form_data: None, address_seed_data: None, + zone_data: None, address_context_data: None, coreference_data: None, date_data: None, @@ -1439,6 +1458,93 @@ fn prepared_search_applies_threshold_before_merge() { assert_eq!(result.resolved_entities[0].text, "Acme"); } +#[test] +fn prepared_search_applies_header_zone_boost_before_threshold() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from("Alice"))], + regex_meta: vec![RegexMatchMeta::new("person", 0.45)], + threshold: 0.5, + allowed_labels: vec![String::from("person")], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + zone_data: Some(zone_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities_with_diagnostics( + "Parties\nAlice\nArticle 1\nBody", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!(result.result.resolved_entities.len(), 1); + assert_eq!(result.result.resolved_entities[0].text, "Alice"); + assert!((result.result.resolved_entities[0].score - 0.55).abs() < 1e-12); + assert!(result.diagnostics.events.iter().any(|event| { + event.stage == DiagnosticStage::EntityZoneAdjustment + && event.kind == DiagnosticEventKind::StageSummary + && event.count == Some(1) + })); +} + +#[test] +fn prepared_search_applies_table_zone_boost_before_threshold() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from("Alice"))], + regex_meta: vec![RegexMatchMeta::new("person", 0.46)], + threshold: 0.5, + allowed_labels: vec![String::from("person")], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + zone_data: Some(zone_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let 
result = prepared + .redact_static_entities( + "Article 1\nName\tAddress\tId\nAlice\tPrague\t123", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!(result.resolved_entities.len(), 1); + assert!((result.resolved_entities[0].score - 0.51).abs() < 1e-12); +} + +#[test] +fn prepared_search_applies_signature_zone_boost_before_threshold() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from("Alice"))], + regex_meta: vec![RegexMatchMeta::new("person", 0.36)], + threshold: 0.5, + allowed_labels: vec![String::from("person")], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + zone_data: Some(zone_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Article 1\nBody\nV Praze dne 1.1.2024\nAlice", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!(result.resolved_entities.len(), 1); + assert!((result.resolved_entities[0].score - 0.51).abs() < 1e-12); +} + #[test] fn prepared_search_boosts_near_miss_entities_when_enabled() { let prepared = PreparedSearch::new(PreparedSearchConfig { @@ -1735,6 +1841,7 @@ fn prepared_search_reports_static_redaction_diagnostics() { trigger_data: None, legal_form_data: None, address_seed_data: None, + zone_data: None, address_context_data: None, coreference_data: None, date_data: None, @@ -1820,6 +1927,7 @@ fn prepared_search_redacts_custom_deny_list_entities() { trigger_data: None, legal_form_data: None, address_seed_data: None, + zone_data: None, address_context_data: None, coreference_data: None, date_data: None, diff --git a/crates/anonymize-core/tests/trigger_parity.rs b/crates/anonymize-core/tests/trigger_parity.rs index 3538ab11..0bf8f8ff 100644 --- a/crates/anonymize-core/tests/trigger_parity.rs +++ b/crates/anonymize-core/tests/trigger_parity.rs @@ -28,6 +28,7 @@ fn empty_config(slices: 
PreparedSearchSlices) -> PreparedSearchConfig { trigger_data: None, legal_form_data: None, address_seed_data: None, + zone_data: None, address_context_data: None, coreference_data: None, date_data: None, diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 07010872..dfe03e25 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -1347,7 +1347,7 @@ function describeUnsupportedPipelineStages( if (config.enableNer) { stages.push("ner"); } - if (config.enableZoneClassification) { + if (config.enableZoneClassification && !nativeRuntime) { stages.push("zone-classification"); } if (config.enableCoreference && !nativeRuntime) { diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index e509aa94..55945963 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -1237,6 +1237,67 @@ describe("native adapter parity", () => { }); }); + test("native pipeline package matches TS zone score adjustments", async () => { + const adapters = getAdapters(); + const fullText = ["Parties", "Alice", "Article 1", "Body"].join("\n"); + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: true, + customRegexes: [{ pattern: "Alice", label: "person", score: 0.45 }], + labels: ["person"], + workspaceId: "native-pipeline-zone-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = 
createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual([ + expect.objectContaining({ label: "person", text: "Alice", score: 0.55 }), + ]); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + test("native pipeline compatibility rejects TS-only contextual passes", () => { const config: PipelineConfig = { threshold: 0.3, @@ -1257,11 +1318,7 @@ describe("native adapter parity", () => { expect(getNativePipelineCompatibility(config)).toEqual({ status: "unsupported", - unsupportedFeatures: [ - "enableNer", - "enableNameCorpus", - "enableZoneClassification", - ], + unsupportedFeatures: ["enableNer", "enableNameCorpus"], }); }); diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index 92bfadcf..4cf62f71 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -247,6 +247,30 @@ describe("pipeline config semantics", () => { ).toContain("LLC"); }); + test("native config carries zone classifier data", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableZoneClassification: true, + labels: ["person"], + }, + [], + createPipelineContext(), + ); + + 
expect( + search.nativeStaticConfig.zone_data?.section_heading_patterns.length, + ).toBeGreaterThan(0); + expect( + search.nativeStaticConfig.zone_data?.section_heading_patterns.some( + ({ pattern }) => pattern.includes("Article"), + ), + ).toBe(true); + expect( + search.nativeStaticConfig.zone_data?.signing_clauses.length, + ).toBeGreaterThan(0); + }); + test("native trigger config carries legal suffix data without legal-form search", async () => { const search = await buildUnifiedSearch( { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 98c48217..533a6d1d 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -245,6 +245,19 @@ export type NativeCoreferenceData = { role_stop_terms: string[]; legal_form_aliases: string[]; }; +export type NativeZonePatternData = { + pattern: string; + flags: string; +}; +export type NativeZoneSigningClauseData = { + prefix: string; + suffix: string; + prepositions: string[]; +}; +export type NativeZoneData = { + section_heading_patterns: NativeZonePatternData[]; + signing_clauses: NativeZoneSigningClauseData[]; +}; type GenericRolesData = { roles: string[]; }; @@ -298,6 +311,7 @@ export type NativePreparedSearchConfig = { trigger_data?: NativeTriggerData; legal_form_data?: NativeLegalFormData; address_seed_data?: NativeAddressSeedData; + zone_data?: NativeZoneData; address_context_data?: NativeAddressContextData; coreference_data?: NativeCoreferenceData; date_data?: NativeDateData; @@ -367,6 +381,18 @@ type CoreferenceConfigRow = { flags: string; }; +type SectionHeadingsConfig = { + patterns: Array<{ re: string; flags: string }>; +}; + +type SigningClauseConfig = { + patterns: Array<{ + prefix?: string; + suffix?: string; + prepositions?: string[]; + }>; +}; + type UnifiedSearchSources = { allRegex: PatternEntry[]; regexMeta: RegexMeta[]; @@ -387,6 +413,7 @@ type UnifiedSearchSources = { nativeDateData: 
NativeDateData | null; nativeMonetaryData: NativeMonetaryData | null; nativeAddressSeedData: NativeAddressSeedData | null; + nativeZoneData: NativeZoneData | null; nativeAddressContextData: NativeAddressContextData | null; nativeCoreferenceData: NativeCoreferenceData | null; nativeSigningPatterns: readonly string[]; @@ -451,6 +478,7 @@ const buildUnifiedSearchSources = async ( yearWordData, monetaryData, addressSeedData, + zoneData, addressContextData, coreferenceData, ] = await Promise.all([ @@ -498,6 +526,9 @@ const buildUnifiedSearchSources = async ( labelIsAllowed("address", allowedLabels) ? getAddressSeedData() : Promise.resolve(null), + config.enableZoneClassification + ? buildNativeZoneData() + : Promise.resolve(null), labelIsAllowed("address", allowedLabels) ? Promise.resolve(getAddressContextData()) : Promise.resolve(null), @@ -759,6 +790,7 @@ const buildUnifiedSearchSources = async ( nativeDateData, nativeMonetaryData, nativeAddressSeedData: addressSeedData, + nativeZoneData: zoneData, nativeAddressContextData: addressContextData, nativeCoreferenceData: coreferenceData === null @@ -817,6 +849,7 @@ export const buildNativeStaticSearchBundle = async ( dateData: sources.nativeDateData, monetaryData: sources.nativeMonetaryData, addressSeedData: sources.nativeAddressSeedData, + zoneData: sources.nativeZoneData, addressContextData: sources.nativeAddressContextData, coreferenceData: sources.nativeCoreferenceData, nativeSigningPatterns: sources.nativeSigningPatterns, @@ -904,6 +937,7 @@ export const buildUnifiedSearch = async ( dateData: sources.nativeDateData, monetaryData: sources.nativeMonetaryData, addressSeedData: sources.nativeAddressSeedData, + zoneData: sources.nativeZoneData, addressContextData: sources.nativeAddressContextData, coreferenceData: sources.nativeCoreferenceData, nativeSigningPatterns: sources.nativeSigningPatterns, @@ -955,6 +989,7 @@ type BuildNativeStaticConfigArgs = { dateData: NativeDateData | null; monetaryData: NativeMonetaryData | 
null; addressSeedData: NativeAddressSeedData | null; + zoneData: NativeZoneData | null; addressContextData: NativeAddressContextData | null; coreferenceData: NativeCoreferenceData | null; nativeSigningPatterns: readonly string[]; @@ -987,6 +1022,7 @@ const buildNativeStaticConfig = ({ dateData, monetaryData, addressSeedData, + zoneData, addressContextData, coreferenceData, nativeSigningPatterns, @@ -1193,6 +1229,9 @@ const buildNativeStaticConfig = ({ if (addressSeedData) { nativeConfig.address_seed_data = addressSeedData; } + if (zoneData) { + nativeConfig.zone_data = zoneData; + } if (addressContextData) { nativeConfig.address_context_data = addressContextData; } @@ -1539,6 +1578,29 @@ const buildNativeCoreferenceData = async (): Promise => { }; }; +const buildNativeZoneData = async (): Promise => { + const [headingModule, signingModule] = await Promise.all([ + import("./data/section-headings.json"), + import("./data/signing-clauses.json"), + ]); + const headingData = (headingModule.default ?? + headingModule) as SectionHeadingsConfig; + const signingData = (signingModule.default ?? + signingModule) as SigningClauseConfig; + + return { + section_heading_patterns: headingData.patterns.map((pattern) => ({ + pattern: pattern.re, + flags: pattern.flags, + })), + signing_clauses: signingData.patterns.map((pattern) => ({ + prefix: pattern.prefix ?? "", + suffix: pattern.suffix ?? "", + prepositions: pattern.prepositions ?? 
[], + })), + }; +}; + const createStringGroupEncoder = (): { table: string[]; encode: (values: string | readonly string[] | undefined) => number[]; diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts index 655bc9c4..97505ccf 100644 --- a/packages/anonymize/src/native-pipeline.ts +++ b/packages/anonymize/src/native-pipeline.ts @@ -16,10 +16,7 @@ import { type NativeStaticRedactionResult, } from "./native"; -export type NativePipelineUnsupportedFeature = - | "enableNer" - | "enableNameCorpus" - | "enableZoneClassification"; +export type NativePipelineUnsupportedFeature = "enableNer" | "enableNameCorpus"; export type NativePipelineCompatibility = | { status: "supported" } @@ -106,9 +103,6 @@ export const getNativePipelineCompatibility = ( if (config.enableNer) unsupportedFeatures.push("enableNer"); if (config.enableNameCorpus) unsupportedFeatures.push("enableNameCorpus"); - if (config.enableZoneClassification === true) { - unsupportedFeatures.push("enableZoneClassification"); - } if (unsupportedFeatures.length === 0) { return { status: "supported" }; } From 8e59b0bf33215ea1a30806b93c9e4890af1c6439 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 07:24:14 +0200 Subject: [PATCH 069/130] fix: align native coreference context --- crates/anonymize-adapter-contract/src/lib.rs | 3 + crates/anonymize-core/src/address_context.rs | 22 +- crates/anonymize-core/src/address_seeds.rs | 131 +++++++++--- crates/anonymize-core/src/coreference.rs | 199 +++++++++++++++++- crates/anonymize-core/src/prepared.rs | 3 +- crates/anonymize-core/src/signatures.rs | 20 +- .../tests/address_seed_parity.rs | 58 +++++ crates/anonymize-core/tests/prepared.rs | 122 +++++++++++ .../scripts/migration-fixture-perf.mjs | 6 + .../__test__/native-adapter-parity.test.ts | 4 +- .../src/__test__/pipeline-config.test.ts | 37 ++++ .../anonymize/src/build-unified-search.ts | 17 +- .../src/data/coreference-org-determiners.json | 8 + 
packages/anonymize/src/pipeline-cache-key.ts | 2 + 14 files changed, 581 insertions(+), 51 deletions(-) create mode 100644 packages/anonymize/src/data/coreference-org-determiners.json diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 297400e6..4d226ab6 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -391,6 +391,8 @@ pub struct BindingCoreferenceData { pub role_stop_terms: Vec, #[serde(default)] pub legal_form_aliases: Vec, + #[serde(default)] + pub organization_determiners: Vec, } #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] @@ -1625,6 +1627,7 @@ fn coreference_data_from_binding( .collect(), role_stop_terms: data.role_stop_terms, legal_form_aliases: data.legal_form_aliases, + organization_determiners: data.organization_determiners, } } diff --git a/crates/anonymize-core/src/address_context.rs b/crates/anonymize-core/src/address_context.rs index cc2c57c6..b6e1e2ed 100644 --- a/crates/anonymize-core/src/address_context.rs +++ b/crates/anonymize-core/src/address_context.rs @@ -92,6 +92,7 @@ impl PreparedAddressContextData { .filter(|entity| !is_caller_owned_entity(entity)) .collect::>(); let header_end = header_end(full_text); + let offsets = ByteOffsets::new(full_text); for found in self.slash_house_number.find_iter(full_text) { let num_start = usize_to_u32("address_context.num_start", found.start())?; @@ -102,8 +103,7 @@ impl PreparedAddressContextData { let in_header = num_start < header_end; let near_address = address_entities.iter().any(|entity| { - entity.start.abs_diff(num_end) < STREET_CONTEXT_WINDOW - || entity.end.abs_diff(num_start) < STREET_CONTEXT_WINDOW + within_context_window(&offsets, entity, num_start, num_end) }); if !in_header && !near_address { continue; @@ -256,6 +256,7 @@ impl PreparedAddressContextData { existing_entities: &[PipelineEntity], ) -> Result> { let header_end = header_end(full_text); + let 
offsets = ByteOffsets::new(full_text); let context_entities = existing_entities .iter() .filter(|entity| { @@ -274,10 +275,9 @@ impl PreparedAddressContextData { if start >= header_end || covered_by(existing_entities, start, end) { continue; } - let has_context = context_entities.iter().any(|entity| { - entity.start.abs_diff(end) < STREET_CONTEXT_WINDOW - || entity.end.abs_diff(start) < STREET_CONTEXT_WINDOW - }); + let has_context = context_entities + .iter() + .any(|entity| within_context_window(&offsets, entity, start, end)); if !has_context { continue; } @@ -470,6 +470,16 @@ fn span_gap_utf16_units( Ok(0) } +fn within_context_window( + offsets: &ByteOffsets<'_>, + entity: &PipelineEntity, + start: u32, + end: u32, +) -> bool { + span_gap_utf16_units(offsets, entity, start, end) + .is_ok_and(|distance| distance < STREET_CONTEXT_WINDOW) +} + fn is_short_ascii_digit_token(value: &str) -> bool { let mut count = 0usize; for ch in value.chars() { diff --git a/crates/anonymize-core/src/address_seeds.rs b/crates/anonymize-core/src/address_seeds.rs index 44b4d7a3..0da84a37 100644 --- a/crates/anonymize-core/src/address_seeds.rs +++ b/crates/anonymize-core/src/address_seeds.rs @@ -110,7 +110,8 @@ impl PreparedAddressSeedData { span.start.saturating_add(effective_raw.len()), ); let effective_text = full_text.get(start..end).unwrap_or_default(); - if effective_text.len() < 5 || effective_text.len() > 300 { + let effective_len = text_units(effective_text); + if !(5..=300).contains(&effective_len) { continue; } results.push(PipelineEntity::detected( @@ -222,19 +223,23 @@ impl PreparedAddressSeedData { seeds: &[Seed], ) -> bool { seeds.iter().any(|seed| { - seed.start.abs_diff(start) <= PLAIN_POSTAL_CONTEXT_WINDOW - && match seed.kind { - SeedType::AddressTrigger => true, - SeedType::City | SeedType::State => { - seed.end >= start && seed.start <= end.saturating_add(4) - || seed.end <= start - && full_text.get(seed.end..start).is_some_and(is_city_zip_gap) - } - 
SeedType::StreetWord => { - has_house_number_near_street_word(full_text, seed, self) - } - SeedType::PostalCode => false, + within_text_window( + full_text, + seed.start, + start, + PLAIN_POSTAL_CONTEXT_WINDOW, + ) && match seed.kind { + SeedType::AddressTrigger => true, + SeedType::City | SeedType::State => { + seed.end >= start && seed.start <= end.saturating_add(4) + || seed.end <= start + && full_text.get(seed.end..start).is_some_and(is_city_zip_gap) + } + SeedType::StreetWord => { + has_house_number_near_street_word(full_text, seed, self) } + SeedType::PostalCode => false, + } }) } @@ -248,7 +253,7 @@ impl PreparedAddressSeedData { if seed_covered(seeds, start, end) { continue; } - if !has_nearby_italian_cap_evidence(seeds, start) { + if !has_nearby_italian_cap_evidence(full_text, seeds, start) { continue; } seeds.push(Seed { @@ -299,16 +304,10 @@ impl PreparedAddressSeedData { let Some(search) = &self.br_cep_cue_search else { return false; }; - let window_start = floor_char_boundary( - full_text, - start.saturating_sub(BR_CEP_CONTEXT_WINDOW), - ); - let window_end = ceil_char_boundary( - full_text, - end - .saturating_add(BR_CEP_CONTEXT_WINDOW) - .min(full_text.len()), - ); + let window_start = + offset_before_text_units(full_text, start, BR_CEP_CONTEXT_WINDOW); + let window_end = + offset_after_text_units(full_text, end, BR_CEP_CONTEXT_WINDOW); full_text .get(window_start..window_end) .is_some_and(|window| search.is_match(window).unwrap_or(false)) @@ -328,7 +327,7 @@ impl PreparedAddressSeedData { } let has_context = seeds.iter().any(|seed| { - seed.start.abs_diff(start) <= US_ZIP_CONTEXT_WINDOW + within_text_window(full_text, seed.start, start, US_ZIP_CONTEXT_WINDOW) && match seed.kind { SeedType::AddressTrigger => true, SeedType::City => { @@ -713,9 +712,13 @@ fn seed_covered(seeds: &[Seed], start: usize, end: usize) -> bool { .any(|seed| seed.start <= start && seed.end >= end) } -fn has_nearby_italian_cap_evidence(seeds: &[Seed], start: usize) -> bool { 
+fn has_nearby_italian_cap_evidence( + full_text: &str, + seeds: &[Seed], + start: usize, +) -> bool { seeds.iter().any(|seed| { - seed.start.abs_diff(start) <= 80 + within_text_window(full_text, seed.start, start, 80) && match seed.kind { SeedType::AddressTrigger | SeedType::City | SeedType::PostalCode => { true @@ -747,14 +750,17 @@ fn cluster_seeds( }; for seed in seeds.iter().skip(1) { - let gap_ok = seed.start.saturating_sub(current.end) - <= ADDRESS_CLUSTER_MAX_GAP - && !has_cluster_barrier( - full_text, - current.end, - seed.start, - existing_entities, - ); + let gap_ok = within_text_window( + full_text, + current.end, + seed.start, + ADDRESS_CLUSTER_MAX_GAP, + ) && !has_cluster_barrier( + full_text, + current.end, + seed.start, + existing_entities, + ); if gap_ok { current.seeds.push(seed.clone()); current.end = current.end.max(seed.end); @@ -771,6 +777,61 @@ fn cluster_seeds( clusters } +fn within_text_window( + full_text: &str, + left: usize, + right: usize, + max_units: usize, +) -> bool { + let start = left.min(right); + let end = left.max(right); + full_text + .get(start..end) + .is_some_and(|gap| text_units(gap) <= max_units) +} + +fn text_units(text: &str) -> usize { + text.chars().map(char::len_utf16).sum() +} + +fn offset_before_text_units( + full_text: &str, + end: usize, + max_units: usize, +) -> usize { + let Some(prefix) = full_text.get(..end) else { + return 0; + }; + let mut units = 0usize; + for (index, ch) in prefix.char_indices().rev() { + let width = ch.len_utf16(); + if units.saturating_add(width) > max_units { + return index.saturating_add(ch.len_utf8()); + } + units = units.saturating_add(width); + } + 0 +} + +fn offset_after_text_units( + full_text: &str, + start: usize, + max_units: usize, +) -> usize { + let Some(tail) = full_text.get(start..) 
else { + return full_text.len(); + }; + let mut units = 0usize; + for (relative, ch) in tail.char_indices() { + let width = ch.len_utf16(); + if units.saturating_add(width) > max_units { + return start.saturating_add(relative); + } + units = units.saturating_add(width); + } + full_text.len() +} + fn has_cluster_barrier( full_text: &str, gap_start: usize, diff --git a/crates/anonymize-core/src/coreference.rs b/crates/anonymize-core/src/coreference.rs index b5331525..cc7a2a7c 100644 --- a/crates/anonymize-core/src/coreference.rs +++ b/crates/anonymize-core/src/coreference.rs @@ -1,5 +1,5 @@ use regex::{Regex, RegexBuilder}; -use std::collections::BTreeSet; +use std::collections::{BTreeMap, BTreeSet}; use crate::byte_offsets::ByteOffsets; use crate::resolution::{PipelineEntity, SourceDetail}; @@ -7,6 +7,8 @@ use crate::types::{Error, Result}; const SEARCH_WINDOW: u32 = 200; const COREFERENCE_SCORE: f64 = 0.95; +const ORG_PROPAGATION_SCORE: f64 = 0.9; +const ORG_DETERMINER_LOOKBACK: usize = 40; #[derive( Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, @@ -18,6 +20,8 @@ pub struct CoreferenceData { pub role_stop_terms: Vec, #[serde(default)] pub legal_form_aliases: Vec, + #[serde(default)] + pub organization_determiners: Vec, } #[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] @@ -31,6 +35,8 @@ pub(crate) struct PreparedCoreferenceData { definition_patterns: Vec, role_stop_terms: BTreeSet, legal_form_aliases: BTreeSet, + legal_form_suffixes: Vec, + org_determiner: Option, } struct DefinedTerm { @@ -39,6 +45,13 @@ struct DefinedTerm { source_text: String, } +#[derive(Clone, Debug, Eq, PartialEq)] +struct OrgSeed { + base_name: String, + label: String, + source_text: String, +} + impl PreparedCoreferenceData { pub(crate) fn new(data: CoreferenceData) -> Result { let mut definition_patterns = @@ -47,6 +60,9 @@ impl PreparedCoreferenceData { definition_patterns.push(compile_definition_pattern(pattern)?); } + let mut 
legal_form_suffixes = data.legal_form_aliases.clone(); + legal_form_suffixes.sort_by_key(|suffix| std::cmp::Reverse(suffix.len())); + Ok(Self { definition_patterns, role_stop_terms: lower_set(data.role_stop_terms), @@ -55,6 +71,8 @@ impl PreparedCoreferenceData { .into_iter() .filter_map(|alias| normalized_legal_form_alias(&alias)) .collect(), + legal_form_suffixes, + org_determiner: compile_org_determiner(&data.organization_determiners)?, }) } @@ -62,13 +80,20 @@ impl PreparedCoreferenceData { &self, full_text: &str, existing_entities: &[PipelineEntity], + threshold: f64, ) -> Result> { - if self.definition_patterns.is_empty() { - return Ok(Vec::new()); + let mut results = self.propagate_organization_names( + full_text, + existing_entities, + threshold, + )?; + + if !self.definition_patterns.is_empty() { + let terms = self.extract_defined_terms(full_text, existing_entities)?; + results.extend(Self::find_alias_spans(full_text, &terms)?); } - let terms = self.extract_defined_terms(full_text, existing_entities)?; - Self::find_alias_spans(full_text, &terms) + Ok(results) } fn extract_defined_terms( @@ -176,6 +201,141 @@ impl PreparedCoreferenceData { Ok(results) } + + fn propagate_organization_names( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + threshold: f64, + ) -> Result> { + if threshold > ORG_PROPAGATION_SCORE || self.legal_form_suffixes.is_empty() + { + return Ok(Vec::new()); + } + + let seeds = self.organization_seeds(existing_entities); + if seeds.is_empty() { + return Ok(Vec::new()); + } + + let mut covered = existing_entities + .iter() + .map(|entity| (entity.start, entity.end)) + .collect::>(); + let mut results = Vec::new(); + + for seed in seeds { + let mut search_from = 0usize; + while search_from < full_text.len() { + let Some(relative) = full_text + .get(search_from..) 
+ .and_then(|tail| tail.find(&seed.base_name)) + else { + break; + }; + let start = search_from.saturating_add(relative); + let end = start.saturating_add(seed.base_name.len()); + if !is_word_boundary(full_text, start, end) { + search_from = next_char_boundary(full_text, start); + continue; + } + + let span_start = + self.determiner_start(full_text, start).unwrap_or(start); + let start_u32 = usize_to_u32("coreference.org_start", span_start)?; + let end_u32 = usize_to_u32("coreference.org_end", end)?; + if !span_overlaps(&covered, start_u32, end_u32) { + results.push(PipelineEntity::coreference( + start_u32, + end_u32, + seed.label.clone(), + full_text.get(span_start..end).unwrap_or_default(), + ORG_PROPAGATION_SCORE, + seed.source_text.clone(), + )); + covered.push((start_u32, end_u32)); + } + + search_from = end; + } + } + + Ok(results) + } + + fn organization_seeds( + &self, + existing_entities: &[PipelineEntity], + ) -> Vec { + let mut seed_by_base = BTreeMap::::new(); + + for entity in existing_entities { + if entity.label != "organization" || caller_owned(entity) { + continue; + } + let Some(base) = self.organization_base_name(&entity.text) else { + continue; + }; + let entry = seed_by_base.entry(base.clone()).or_insert_with(|| OrgSeed { + base_name: base.clone(), + label: entity.label.clone(), + source_text: entity.text.clone(), + }); + if entry.source_text != entity.text { + entry.source_text = base; + } + } + + seed_by_base.into_values().collect() + } + + fn organization_base_name(&self, text: &str) -> Option { + for suffix in &self.legal_form_suffixes { + let Some(base) = text.strip_suffix(suffix) else { + continue; + }; + let base = + base.trim_end_matches(|ch: char| ch == ',' || ch.is_whitespace()); + let base = base.trim(); + if text_units(base) >= 3 { + return Some(base.to_owned()); + } + } + None + } + + fn determiner_start( + &self, + full_text: &str, + match_start: usize, + ) -> Option { + let lookback_start = + offset_before_text_units(full_text, 
match_start, ORG_DETERMINER_LOOKBACK); + let lookback = full_text.get(lookback_start..match_start)?; + let captures = self.org_determiner.as_ref()?.captures(lookback)?; + let determiner = captures.get(1)?; + let start = lookback_start.saturating_add(determiner.start()); + previous_char(full_text, start) + .is_none_or(|ch| !is_word_char(ch)) + .then_some(start) + } +} + +fn compile_org_determiner(patterns: &[String]) -> Result> { + if patterns.is_empty() { + return Ok(None); + } + + let pattern = format!("({})\\s+$", patterns.join("|")); + RegexBuilder::new(&pattern) + .case_insensitive(true) + .unicode(true) + .build() + .map(Some) + .map_err(|error| Error::InvalidStaticData { + field: "coreference_data.org_determiner", + reason: error.to_string(), + }) } fn compile_definition_pattern(data: &CoreferencePatternData) -> Result { @@ -361,6 +521,35 @@ fn is_word_char(ch: char) -> bool { ch.is_alphanumeric() || is_combining_mark(ch) } +fn span_overlaps(covered: &[(u32, u32)], start: u32, end: u32) -> bool { + covered.iter().any(|(covered_start, covered_end)| { + start < *covered_end && end > *covered_start + }) +} + +fn text_units(text: &str) -> usize { + text.chars().map(char::len_utf16).sum() +} + +fn offset_before_text_units( + full_text: &str, + end: usize, + max_units: usize, +) -> usize { + let Some(prefix) = full_text.get(..end) else { + return 0; + }; + let mut units = 0usize; + for (index, ch) in prefix.char_indices().rev() { + let width = ch.len_utf16(); + if units.saturating_add(width) > max_units { + return index.saturating_add(ch.len_utf8()); + } + units = units.saturating_add(width); + } + 0 +} + const fn is_combining_mark(ch: char) -> bool { matches!( ch, diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 39b36ff7..acd61840 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -1044,7 +1044,8 @@ impl PreparedSearch { }; let start = Instant::now(); - let 
coreference_entities = data.process(full_text, &existing_entities)?; + let coreference_entities = + data.process(full_text, &existing_entities, self.threshold)?; if let Some(diagnostics) = &mut diagnostics { diagnostics.record_entities( DiagnosticStage::EntityCoreference, diff --git a/crates/anonymize-core/src/signatures.rs b/crates/anonymize-core/src/signatures.rs index 661a9550..d2120ff4 100644 --- a/crates/anonymize-core/src/signatures.rs +++ b/crates/anonymize-core/src/signatures.rs @@ -303,7 +303,8 @@ fn strip_post_nominal_suffix(text: &str) -> &str { } fn is_name_shape(text: &str) -> bool { - if text.len() < 3 || text.len() > MAX_NAME_LEN { + let text_len = text.chars().map(char::len_utf16).sum::(); + if !(3..=MAX_NAME_LEN).contains(&text_len) { return false; } let tokens = text.split([' ', '\t']).filter(|token| !token.is_empty()); @@ -614,6 +615,23 @@ mod tests { ); } + #[test] + fn counts_signature_name_length_in_text_units() { + let name = "Élodie ŽluťoučkýKůňÚpělĎábelskéÓdyÁÉÍÓÚÝČĎĚŇŘŠŤŽ"; + assert!(name.len() > super::MAX_NAME_LEN); + assert!( + name.chars().map(char::len_utf16).sum::() <= super::MAX_NAME_LEN + ); + + let entities = detect_signatures(&format!("/s/ {name}")); + + assert_eq!(entities.len(), 1); + assert_eq!( + entities.first().map(|entity| entity.text.as_str()), + Some(name) + ); + } + #[test] fn detects_multiple_labelled_name_columns() { let entities = diff --git a/crates/anonymize-core/tests/address_seed_parity.rs b/crates/anonymize-core/tests/address_seed_parity.rs index f5bcd38f..bf4fab62 100644 --- a/crates/anonymize-core/tests/address_seed_parity.rs +++ b/crates/anonymize-core/tests/address_seed_parity.rs @@ -171,6 +171,64 @@ fn keeps_date_like_street_name_in_address_seed_span() { assert!(!result.redaction.redacted_text.contains("May 15 Street")); } +#[test] +fn clusters_address_seeds_across_multibyte_text_gap() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![ + SearchPattern::LiteralWithOptions 
{ + pattern: String::from("Springfield"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Street"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + street_types: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Springfield")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .expect("address seed data should prepare"); + let gap = "á".repeat(140); + let full_text = + format!("Send notices to Main Street, {gap} Springfield 12345."); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .expect("static redaction should succeed"); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.text.contains("Main Street") + && entity.text.contains("Springfield 12345")), + "resolved address entities: {:?}; address seed entities: {:?}", + result.resolved_entities, + result.detections.address_seed_entities, + ); +} + #[test] fn preserves_unit_abbreviation_inside_address_seed_span() { let prepared = PreparedSearch::new(PreparedSearchConfig { diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index a085b2e4..3e8611d7 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -143,9 +143,61 @@ fn coreference_data() -> CoreferenceData { }], 
role_stop_terms: vec![String::from("seller")], legal_form_aliases: vec![String::from("LLC")], + organization_determiners: vec![String::from( + r"the\s+(?:company|corporation|firm)", + )], } } +fn legal_form_coreference_prepared_search( + suffixes: Vec<&str>, +) -> PreparedSearch { + let suffix_strings = suffixes + .iter() + .map(|suffix| (*suffix).to_owned()) + .collect::>(); + let regex_patterns = suffixes + .into_iter() + .map(|suffix| SearchPattern::Literal(suffix.to_owned())) + .collect::>(); + + PreparedSearch::new(PreparedSearchConfig { + regex_patterns, + regex_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + legal_forms: PatternSlice { + start: 0, + end: u32::try_from(suffix_strings.len()).unwrap(), + }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("organization")], + legal_form_data: Some(LegalFormData { + suffixes: suffix_strings.clone(), + normalized_boundary_suffixes: vec![String::from("llc")], + normalized_suffix_words: vec![String::from("llc")], + company_suffix_words: vec![String::from("Company")], + ..LegalFormData::default() + }), + coreference_data: Some(CoreferenceData { + legal_form_aliases: suffix_strings, + organization_determiners: vec![String::from( + r"the\s+(?:company|corporation|firm)", + )], + ..CoreferenceData::default() + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap() +} + #[test] fn prepared_search_runs_legal_form_pass_on_normalized_text() { let prepared = legal_form_prepared_search(vec!["Pty Ltd"]); @@ -335,6 +387,35 @@ fn prepared_search_measures_bare_house_context_in_text_offsets() { ); } +#[test] +fn prepared_search_measures_slash_address_context_in_text_offsets() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bPraha 10\b"))], + regex_meta: 
vec![RegexMatchMeta::new("address", 1.0)], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + let full_text = format!("Praha 10 {} Vinohradská 2512/2a.", "á".repeat(145)); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.text == "Vinohradská 2512/2a") + ); +} + #[test] fn prepared_search_ignores_caller_owned_addresses_for_bare_house_context() { let mut meta = RegexMatchMeta::new("address", 1.0); @@ -439,6 +520,47 @@ fn prepared_search_adds_coreference_aliases_with_source_placeholder() { ); } +#[test] +fn prepared_search_propagates_bare_organization_names() { + let prepared = legal_form_coreference_prepared_search(vec!["LLC"]); + + let result = prepared + .redact_static_entities( + "Acme LLC signed. Acme paid.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + entity.source == DetectionSource::Coreference && entity.text == "Acme" + })); + assert_eq!( + result.redaction.redacted_text, + "[ORGANIZATION_1] signed. [ORGANIZATION_1] paid.", + ); +} + +#[test] +fn prepared_search_extends_propagated_organization_determiners() { + let prepared = legal_form_coreference_prepared_search(vec!["LLC"]); + + let result = prepared + .redact_static_entities( + "Acme LLC signed. The Company Acme paid.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + entity.source == DetectionSource::Coreference + && entity.text == "The Company Acme" + })); + assert_eq!( + result.redaction.redacted_text, + "[ORGANIZATION_1] signed. 
[ORGANIZATION_1] paid.", + ); +} + #[test] fn prepared_search_does_not_seed_coreference_from_caller_owned_entities() { let mut meta = RegexMatchMeta::new("organization", 1.0); diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index dfe03e25..7207ea94 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -96,6 +96,12 @@ const ACCEPTED_NATIVE_STATIC_DELTAS = new Map( reason: "party-organization-retained", candidateExtra: [ { start: 542, end: 585, label: "organization", source: "deny-list" }, + { + start: 3226, + end: 3247, + label: "organization", + source: "coreference", + }, ], candidateMissing: [], }, diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 55945963..52ef4da8 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -1170,9 +1170,9 @@ describe("native adapter parity", () => { }); }); - test("native pipeline package matches TS coreference aliases", async () => { + test("native pipeline package matches TS organization propagation", async () => { const adapters = getAdapters(); - const fullText = 'Acme LLC (the "Acme") signed. Acme paid.'; + const fullText = "Acme LLC signed. 
Acme paid."; const config: PipelineConfig = { threshold: 0.5, enableTriggerPhrases: false, diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index 4cf62f71..e1ae94c0 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -245,6 +245,9 @@ describe("pipeline config semantics", () => { expect( search.nativeStaticConfig.coreference_data?.legal_form_aliases, ).toContain("LLC"); + expect( + search.nativeStaticConfig.coreference_data?.organization_determiners, + ).toContain("the\\s+(?:company|corporation|firm)"); }); test("native config carries zone classifier data", async () => { @@ -640,6 +643,40 @@ describe("pipeline config semantics", () => { expect(counts().compressedPrepare).toBe(2); }); + test("native pipeline package cache keys contextual native passes", async () => { + const { binding, counts } = createCountingNativeBinding( + "native-cache-contextual-passes", + ); + const context = createPipelineContext(); + const config = { + ...BASE_CONFIG, + enableCountries: false, + enableCoreference: false, + enableZoneClassification: false, + labels: ["organization"], + }; + + await prepareNativePipelinePackage({ binding, config, context }); + await prepareNativePipelinePackage({ + binding, + config: { + ...config, + enableCoreference: true, + }, + context, + }); + await prepareNativePipelinePackage({ + binding, + config: { + ...config, + enableZoneClassification: true, + }, + context, + }); + + expect(counts().compressedPrepare).toBe(3); + }); + test("enableLegalForms flag gates legal-form detection", async () => { const withFlag = await detect("Acme s.r.o.", { enableLegalForms: true, diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 533a6d1d..64767784 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ 
b/packages/anonymize/src/build-unified-search.ts @@ -244,6 +244,7 @@ export type NativeCoreferenceData = { definition_patterns: NativeCoreferencePatternData[]; role_stop_terms: string[]; legal_form_aliases: string[]; + organization_determiners: string[]; }; export type NativeZonePatternData = { pattern: string; @@ -261,6 +262,7 @@ export type NativeZoneData = { type GenericRolesData = { roles: string[]; }; +type CoreferenceDeterminersData = Record; export type NativeGazetteerData = { labels: string[]; is_fuzzy: boolean[]; @@ -1550,8 +1552,13 @@ const sentenceTerminalCurrencyTerms = ( }; const buildNativeCoreferenceData = async (): Promise => { - const roleModule = await import("./data/generic-roles.json"); + const [roleModule, determinerModule] = await Promise.all([ + import("./data/generic-roles.json"), + import("./data/coreference-org-determiners.json"), + ]); const roleData = (roleModule.default ?? roleModule) as GenericRolesData; + const determinerData = (determinerModule.default ?? + determinerModule) as CoreferenceDeterminersData; const configs = await loadLanguageConfigs( "coreference", (mod) => { @@ -1575,6 +1582,14 @@ const buildNativeCoreferenceData = async (): Promise => { definition_patterns: definitionPatterns, role_stop_terms: roleData.roles, legal_form_aliases: [], + organization_determiners: Object.entries(determinerData) + .flatMap(([language, values]) => { + if (language === "_comment" || !Array.isArray(values)) { + return []; + } + return values; + }) + .toSorted((left, right) => left.localeCompare(right)), }; }; diff --git a/packages/anonymize/src/data/coreference-org-determiners.json b/packages/anonymize/src/data/coreference-org-determiners.json new file mode 100644 index 00000000..c5579c22 --- /dev/null +++ b/packages/anonymize/src/data/coreference-org-determiners.json @@ -0,0 +1,8 @@ +{ + "_comment": "Organization reference determiners used before propagated bare organization names. 
Values are regex fragments grouped by language.", + "cs": ["společnost(?:i|í|em|u)?", "spolecnost(?:i|em|u)?"], + "de": ["die\\s+(?:gesellschaft|firma)"], + "en": ["the\\s+(?:company|corporation|firm)"], + "es": ["la\\s+(?:empresa|sociedad)", "el\\s+(?:empresa|sociedad)"], + "fr": ["la\\s+société"] +} diff --git a/packages/anonymize/src/pipeline-cache-key.ts b/packages/anonymize/src/pipeline-cache-key.ts index c25d2219..99415a86 100644 --- a/packages/anonymize/src/pipeline-cache-key.ts +++ b/packages/anonymize/src/pipeline-cache-key.ts @@ -62,6 +62,8 @@ export const pipelineConfigKey = ( `${config.threshold}:` + `${config.enableConfidenceBoost}:` + `${config.enableHotwordRules === true}:` + + `${config.enableCoreference === true}:` + + `${config.enableZoneClassification === true}:` + `${config.labels.toSorted().join(",")}:` + `${config.denyListCountries?.toSorted().join(",") ?? ""}:` + `${config.denyListRegions?.toSorted().join(",") ?? ""}:` + From fcdebca825aae049ff99758efbecb42f280bec48 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 07:28:47 +0200 Subject: [PATCH 070/130] fix: mirror coreference data --- packages/data/config/coreference-org-determiners.json | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 packages/data/config/coreference-org-determiners.json diff --git a/packages/data/config/coreference-org-determiners.json b/packages/data/config/coreference-org-determiners.json new file mode 100644 index 00000000..c5579c22 --- /dev/null +++ b/packages/data/config/coreference-org-determiners.json @@ -0,0 +1,8 @@ +{ + "_comment": "Organization reference determiners used before propagated bare organization names. 
Values are regex fragments grouped by language.", + "cs": ["společnost(?:i|í|em|u)?", "spolecnost(?:i|em|u)?"], + "de": ["die\\s+(?:gesellschaft|firma)"], + "en": ["the\\s+(?:company|corporation|firm)"], + "es": ["la\\s+(?:empresa|sociedad)", "el\\s+(?:empresa|sociedad)"], + "fr": ["la\\s+société"] +} From 84f6139d37c57601a9f6ac5f33f2cadcd59ddc62 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 08:00:50 +0200 Subject: [PATCH 071/130] feat: port supplemental name corpus --- crates/anonymize-adapter-contract/src/lib.rs | 81 +- crates/anonymize-core/src/diagnostics.rs | 1 + crates/anonymize-core/src/lib.rs | 2 + crates/anonymize-core/src/name_corpus.rs | 995 ++++++++++++++++++ crates/anonymize-core/src/prepared.rs | 42 +- .../tests/address_seed_parity.rs | 1 + .../tests/false_positive_parity.rs | 1 + crates/anonymize-core/tests/prepared.rs | 10 + crates/anonymize-core/tests/trigger_parity.rs | 1 + .../scripts/migration-fixture-perf.mjs | 2 +- .../__test__/native-adapter-parity.test.ts | 66 +- .../anonymize/src/build-unified-search.ts | 167 +++ .../anonymize/src/data/name-corpus-cjk.json | 86 ++ .../src/data/name-corpus-particles.json | 13 + .../src/data/organization-indicators.json | 36 + packages/anonymize/src/native-pipeline.ts | 4 +- packages/data/config/name-corpus-cjk.json | 86 ++ .../data/config/name-corpus-particles.json | 13 + .../data/config/organization-indicators.json | 36 + 19 files changed, 1631 insertions(+), 12 deletions(-) create mode 100644 crates/anonymize-core/src/name_corpus.rs create mode 100644 packages/anonymize/src/data/name-corpus-cjk.json create mode 100644 packages/anonymize/src/data/name-corpus-particles.json create mode 100644 packages/anonymize/src/data/organization-indicators.json create mode 100644 packages/data/config/name-corpus-cjk.json create mode 100644 packages/data/config/name-corpus-particles.json create mode 100644 packages/data/config/organization-indicators.json diff --git 
a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 4d226ab6..981f0c09 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -8,10 +8,10 @@ use stella_anonymize_core::{ DenyListFilterData, DenyListMatchData, DetectionSource, DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, FuzzySearchOptions, GazetteerMatchData, HotwordRule, HotwordRuleData, LegalFormData, LiteralSearchOptions, - MagnitudeSuffixData, MonetaryData, OperatorConfig, OperatorType, - PatternSlice, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, - RegexSearchOptions, SearchEngine, SearchOptions, SearchPattern, - ShareQuantityTermData, SigningPlaceGuardData, SourceDetail, + MagnitudeSuffixData, MonetaryData, NameCorpusData, OperatorConfig, + OperatorType, PatternSlice, PreparedSearchConfig, PreparedSearchSlices, + RegexMatchMeta, RegexSearchOptions, SearchEngine, SearchOptions, + SearchPattern, ShareQuantityTermData, SigningPlaceGuardData, SourceDetail, StaticRedactionDiagnosticResult, StaticRedactionDiagnostics, StaticRedactionResult, StringGroups, TriggerData, TriggerRule, TriggerStrategy, TriggerValidation, WrittenAmountPatternData, ZoneData, @@ -21,13 +21,13 @@ use stella_anonymize_core::{ pub type Result = std::result::Result; const PREPARED_SEARCH_PACKAGE_HEADER: [u8; 8] = *b"ANONPKG1"; -const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 10; +const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 11; const PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONPKZ1"; -const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 8; +const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 9; const PREPARED_SEARCH_CORE_PACKAGE_HEADER: [u8; 8] = *b"ANONCPK1"; -const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 9; +const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 10; const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONCPZ1"; -const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: 
u32 = 9; +const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 10; const PREPARED_SEARCH_PACKAGE_DIGEST_BYTES: usize = 32; const PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL: i32 = 3; const MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES: usize = 256 * 1024 * 1024; @@ -402,6 +402,40 @@ pub struct BindingCoreferencePatternData { pub flags: String, } +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingNameCorpusData { + #[serde(default)] + pub first_names: Vec, + #[serde(default)] + pub surnames: Vec, + #[serde(default)] + pub title_tokens: Vec, + #[serde(default)] + pub title_abbreviations: Vec, + #[serde(default)] + pub excluded_words: Vec, + #[serde(default)] + pub common_words: Vec, + #[serde(default)] + pub non_western_names: Vec, + #[serde(default)] + pub excluded_all_caps: Vec, + #[serde(default)] + pub ja_suffixes: Vec, + #[serde(default)] + pub arabic_connectors: Vec, + #[serde(default)] + pub relation_connectors: Vec, + #[serde(default)] + pub hyphenated_prefixes: Vec, + #[serde(default)] + pub cjk_non_person_terms: Vec, + #[serde(default)] + pub cjk_surname_starters: Vec, + #[serde(default)] + pub organization_terms: Vec, +} + #[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] pub struct BindingDenyListMatchData { #[serde(default)] @@ -513,6 +547,8 @@ pub struct BindingPreparedSearchConfig { #[serde(default)] pub coreference_data: Option, #[serde(default)] + pub name_corpus_data: Option, + #[serde(default)] pub date_data: Option, #[serde(default)] pub monetary_data: Option, @@ -574,6 +610,7 @@ struct BinaryPreparedSearchConfig { zone_data: Option, address_context_data: Option, coreference_data: Option, + name_corpus_data: Option, date_data: Option, monetary_data: Option, } @@ -778,6 +815,7 @@ impl From for BinaryPreparedSearchConfig { zone_data: config.zone_data, address_context_data: config.address_context_data, coreference_data: config.coreference_data, + name_corpus_data: config.name_corpus_data, 
date_data: config.date_data, monetary_data: config.monetary_data, } @@ -812,6 +850,7 @@ impl From for BindingPreparedSearchConfig { zone_data: config.zone_data, address_context_data: config.address_context_data, coreference_data: config.coreference_data, + name_corpus_data: config.name_corpus_data, date_data: config.date_data, monetary_data: config.monetary_data, } @@ -1278,6 +1317,9 @@ pub fn prepared_search_config_from_binding( coreference_data: config .coreference_data .map(coreference_data_from_binding), + name_corpus_data: config + .name_corpus_data + .map(name_corpus_data_from_binding), date_data: config.date_data.map(|data| DateData { month_names_by_language: data.month_names_by_language, year_words_by_language: data.year_words_by_language, @@ -1631,6 +1673,28 @@ fn coreference_data_from_binding( } } +fn name_corpus_data_from_binding( + data: BindingNameCorpusData, +) -> NameCorpusData { + NameCorpusData { + first_names: data.first_names, + surnames: data.surnames, + title_tokens: data.title_tokens, + title_abbreviations: data.title_abbreviations, + excluded_words: data.excluded_words, + common_words: data.common_words, + non_western_names: data.non_western_names, + excluded_all_caps: data.excluded_all_caps, + ja_suffixes: data.ja_suffixes, + arabic_connectors: data.arabic_connectors, + relation_connectors: data.relation_connectors, + hyphenated_prefixes: data.hyphenated_prefixes, + cjk_non_person_terms: data.cjk_non_person_terms, + cjk_surname_starters: data.cjk_surname_starters, + organization_terms: data.organization_terms, + } +} + fn zone_data_from_binding(data: BindingZoneData) -> ZoneData { ZoneData { section_heading_patterns: data @@ -2235,6 +2299,7 @@ fn diagnostic_stage_name(stage: DiagnosticStage) -> String { DiagnosticStage::EntitySignature => "entity.signature", DiagnosticStage::EntityLegalForm => "entity.legal-form", DiagnosticStage::EntityAddressSeed => "entity.address-seed", + DiagnosticStage::EntityNameCorpus => "entity.name-corpus", 
DiagnosticStage::EntityZoneAdjustment => "entity.zone-adjustment", DiagnosticStage::EntityAddressContext => "entity.address-context", DiagnosticStage::EntityCoreference => "entity.coreference", diff --git a/crates/anonymize-core/src/diagnostics.rs b/crates/anonymize-core/src/diagnostics.rs index 9fae4eb8..36c91b79 100644 --- a/crates/anonymize-core/src/diagnostics.rs +++ b/crates/anonymize-core/src/diagnostics.rs @@ -37,6 +37,7 @@ pub enum DiagnosticStage { EntitySignature, EntityLegalForm, EntityAddressSeed, + EntityNameCorpus, EntityZoneAdjustment, EntityAddressContext, EntityCoreference, diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs index a6adeaeb..6bd01df2 100644 --- a/crates/anonymize-core/src/lib.rs +++ b/crates/anonymize-core/src/lib.rs @@ -14,6 +14,7 @@ mod false_positives; mod hotwords; mod legal_forms; mod money; +mod name_corpus; pub(crate) mod normalize; mod placeholders; mod prepared; @@ -41,6 +42,7 @@ pub use money::{ AmountWordsData, CurrencyData, MagnitudeSuffixData, MonetaryData, ShareQuantityTermData, WrittenAmountPatternData, }; +pub use name_corpus::{NameCorpusData, PreparedNameCorpusData}; pub use normalize::normalize_for_search; pub use placeholders::build_placeholder_map; pub use prepared::{ diff --git a/crates/anonymize-core/src/name_corpus.rs b/crates/anonymize-core/src/name_corpus.rs new file mode 100644 index 00000000..27ef9ea3 --- /dev/null +++ b/crates/anonymize-core/src/name_corpus.rs @@ -0,0 +1,995 @@ +use std::collections::HashSet; + +use crate::resolution::{DetectionSource, PipelineEntity}; +use crate::types::{Error, Result}; + +const PERSON_LABEL: &str = "person"; +const CJK_HAN_RATIO_NUMERATOR: usize = 15; +const CJK_HAN_RATIO_DENOMINATOR: usize = 100; +const CJK_SCORE: f64 = 0.95; +const HIGH_CONFIDENCE_NAME_SCORE: f64 = 0.9; +const TITLE_NAME_SCORE: f64 = 0.95; +const LOW_CONFIDENCE_NAME_SCORE: f64 = 0.5; +const MAX_CHAIN: usize = 5; +const ALL_CAPS_NAME_LINE_RATIO: f64 = 0.9; +const 
ALL_CAPS_NAME_LINE_MIN_LETTERS: usize = 3; +const ALL_CAPS_NAME_LINE_MAX_TOKENS: usize = 6; +const MAX_HORIZONTAL_CHAIN_GAP: usize = 4; + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct NameCorpusData { + #[serde(default)] + pub first_names: Vec, + #[serde(default)] + pub surnames: Vec, + #[serde(default)] + pub title_tokens: Vec, + #[serde(default)] + pub title_abbreviations: Vec, + #[serde(default)] + pub excluded_words: Vec, + #[serde(default)] + pub common_words: Vec, + #[serde(default)] + pub non_western_names: Vec, + #[serde(default)] + pub excluded_all_caps: Vec, + #[serde(default)] + pub ja_suffixes: Vec, + #[serde(default)] + pub arabic_connectors: Vec, + #[serde(default)] + pub relation_connectors: Vec, + #[serde(default)] + pub hyphenated_prefixes: Vec, + #[serde(default)] + pub cjk_non_person_terms: Vec, + #[serde(default)] + pub cjk_surname_starters: Vec, + #[serde(default)] + pub organization_terms: Vec, +} + +#[derive(Clone, Debug)] +pub struct PreparedNameCorpusData { + first_names: HashSet, + surnames: HashSet, + title_tokens: HashSet, + title_abbreviations: HashSet, + excluded_words: HashSet, + common_words: HashSet, + non_western_names: HashSet, + excluded_all_caps: HashSet, + ja_suffixes: HashSet, + arabic_connectors: HashSet, + relation_connectors: HashSet, + hyphenated_prefixes: HashSet, + cjk_non_person_terms: HashSet, + cjk_surname_starters: HashSet, + organization_terms: HashSet, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum TokenKind { + Name, + Surname, + Title, + Abbreviation, + JaSuffix, + ArabicConnector, + Capitalized, + Other, +} + +#[derive(Clone, Debug)] +struct WordSegment<'a> { + text: &'a str, + start: usize, + end: usize, +} + +#[derive(Clone, Debug)] +struct ClassifiedToken<'a> { + text: &'a str, + kind: TokenKind, + start: usize, + end: usize, + non_western: bool, + title_abbreviation: bool, +} + +impl PreparedNameCorpusData { + #[must_use] + pub fn new(data: 
NameCorpusData) -> Self { + Self { + first_names: string_set(data.first_names), + surnames: string_set(data.surnames), + title_tokens: lower_string_set(data.title_tokens), + title_abbreviations: lower_string_set(data.title_abbreviations), + excluded_words: lower_string_set(data.excluded_words), + common_words: lower_string_set(data.common_words), + non_western_names: string_set(data.non_western_names), + excluded_all_caps: string_set(data.excluded_all_caps), + ja_suffixes: lower_string_set(data.ja_suffixes), + arabic_connectors: lower_string_set(data.arabic_connectors), + relation_connectors: lower_string_set(data.relation_connectors), + hyphenated_prefixes: lower_string_set(data.hyphenated_prefixes), + cjk_non_person_terms: string_set(data.cjk_non_person_terms), + cjk_surname_starters: data + .cjk_surname_starters + .into_iter() + .filter_map(|value| value.chars().next()) + .collect(), + organization_terms: lower_string_set(data.organization_terms), + } + } + + pub fn detect_supplemental( + &self, + full_text: &str, + deny_list_entities: &[PipelineEntity], + ) -> Result> { + let mut entities = self.detect_cjk_names(full_text)?; + entities.extend(self.detect_token_names(full_text)?); + let mut entities = deduplicate_spans(entities); + entities.retain(|entity| { + !deny_list_entities + .iter() + .any(|deny| covers_same_label(entity, deny)) + }); + Ok(entities) + } + + fn detect_cjk_names(&self, full_text: &str) -> Result> { + if self.cjk_surname_starters.is_empty() { + return Ok(Vec::new()); + } + + let text_len = full_text.chars().count(); + let threshold = + ceil_ratio(text_len, CJK_HAN_RATIO_NUMERATOR, CJK_HAN_RATIO_DENOMINATOR); + let threshold = threshold.max(1); + let mut han_count = 0usize; + for ch in full_text.chars() { + if is_han(ch) { + han_count = han_count.saturating_add(1); + if han_count >= threshold { + break; + } + } + } + if text_len >= 100 && han_count >= threshold { + return Ok(Vec::new()); + } + + let mut entities = Vec::new(); + let mut 
run_start = None; + let mut run_chars = 0usize; + let mut previous_end = 0usize; + for (index, ch) in full_text.char_indices() { + if is_han(ch) { + if run_start.is_none() { + run_start = Some(index); + } + run_chars = run_chars.saturating_add(1); + previous_end = index.saturating_add(ch.len_utf8()); + continue; + } + self.push_cjk_run( + full_text, + run_start, + previous_end, + run_chars, + &mut entities, + )?; + run_start = None; + run_chars = 0; + } + self.push_cjk_run( + full_text, + run_start, + full_text.len(), + run_chars, + &mut entities, + )?; + Ok(entities) + } + + fn push_cjk_run( + &self, + full_text: &str, + start: Option, + end: usize, + char_count: usize, + entities: &mut Vec, + ) -> Result<()> { + if !(2..=4).contains(&char_count) { + return Ok(()); + } + let Some(start) = start else { + return Ok(()); + }; + let Some(text) = full_text.get(start..end) else { + return Err(invalid_name_data("cjk span is not a UTF-8 boundary")); + }; + if !self.is_likely_cjk_person_name(text) || self.is_organization(text) { + return Ok(()); + } + entities.push(PipelineEntity::detected( + usize_to_u32(start, "name_corpus.cjk.start")?, + usize_to_u32(end, "name_corpus.cjk.end")?, + PERSON_LABEL, + text, + CJK_SCORE, + DetectionSource::Regex, + )); + Ok(()) + } + + fn detect_token_names(&self, full_text: &str) -> Result> { + let words = segment_words(full_text); + let mut tokens = Vec::with_capacity(words.len()); + let mut word_index = 0usize; + while let Some(word) = words.get(word_index) { + if let Some((connector, end, consumed)) = + relation_connector(word, &words, word_index, full_text, self) + { + tokens.push(ClassifiedToken { + text: connector, + kind: TokenKind::ArabicConnector, + start: word.start, + end, + non_western: false, + title_abbreviation: false, + }); + word_index = word_index.saturating_add(consumed); + continue; + } + tokens.push(self.classify_token(word, full_text)); + word_index = word_index.saturating_add(1); + } + + let mut consumed = vec![false; 
tokens.len()]; + let mut entities = Vec::new(); + for index in 0..tokens.len() { + if consumed.get(index).copied().unwrap_or(false) { + continue; + } + let Some(token) = tokens.get(index) else { + continue; + }; + if !is_chain_start(token.kind) { + continue; + } + + let chain = Self::build_chain(full_text, &tokens, index); + let Some(score) = supplemental_chain_score(full_text, &chain, self) + else { + continue; + }; + let Some(first) = chain.first() else { + continue; + }; + let Some(last) = chain.last() else { + continue; + }; + let Some(text) = full_text.get(first.start..last.end) else { + return Err(invalid_name_data("name span is not a UTF-8 boundary")); + }; + if self.is_organization(text) { + continue; + } + for slot in index..index.saturating_add(chain.len()) { + if let Some(value) = consumed.get_mut(slot) { + *value = true; + } + } + entities.push(PipelineEntity::detected( + usize_to_u32(first.start, "name_corpus.start")?, + usize_to_u32(last.end, "name_corpus.end")?, + PERSON_LABEL, + text, + score, + DetectionSource::Regex, + )); + } + + Ok(entities) + } + + fn classify_token<'a>( + &self, + word: &WordSegment<'a>, + full_text: &str, + ) -> ClassifiedToken<'a> { + let text = word.text; + let lower = text.to_lowercase(); + let stripped = lower.strip_suffix('.').unwrap_or(&lower); + if self.title_tokens.contains(stripped) { + return ClassifiedToken { + text, + kind: TokenKind::Title, + start: word.start, + end: word.end, + non_western: false, + title_abbreviation: self.title_abbreviations.contains(stripped), + }; + } + if self.ja_suffixes.contains(&lower) { + return classified(word, TokenKind::JaSuffix, false); + } + if self.arabic_connectors.contains(&lower) { + return classified(word, TokenKind::ArabicConnector, false); + } + if self.is_hyphenated_prefix_name(text) { + return classified(word, TokenKind::Name, true); + } + if is_abbreviation(text) || is_multi_dot_abbreviation(text) { + return classified(word, TokenKind::Abbreviation, false); + } + if 
is_single_letter_initial(text, word.end, full_text) + && self.initial_has_name_context(word, full_text) + { + return classified(word, TokenKind::Abbreviation, false); + } + if self.excluded_words.contains(&lower) || text.chars().count() < 2 { + return classified(word, TokenKind::Other, false); + } + let short_token_allowed = self.is_non_western_name_token(text) + || self.ja_suffixes.contains(&lower) + || self.arabic_connectors.contains(&lower) + || (is_all_upper(text) && !self.excluded_all_caps.contains(text)); + if text.chars().count() < 3 && !short_token_allowed { + return classified(word, TokenKind::Other, false); + } + if text.chars().count() >= 3 && is_all_upper(text) { + if self.excluded_all_caps.contains(text) { + return classified(word, TokenKind::Other, false); + } + let title_cased = title_case_simple(text); + let non_western = self.is_non_western_name_token(&title_cased); + if non_western && !self.is_first_name_token(&title_cased) { + return classified(word, TokenKind::Name, true); + } + if is_all_caps_context_line(full_text, word.start) + && is_all_caps_line_name_shaped(full_text, word.start) + { + if self.is_first_name_token(&title_cased) { + return classified(word, TokenKind::Name, non_western); + } + if self.is_surname_token(&title_cased) { + return classified(word, TokenKind::Surname, non_western); + } + if non_western { + return classified(word, TokenKind::Name, true); + } + } + return classified(word, TokenKind::Other, false); + } + if !starts_uppercase(text) { + return classified(word, TokenKind::Other, false); + } + if self.is_first_name_token(text) { + return classified( + word, + TokenKind::Name, + self.is_non_western_name_token(text), + ); + } + if self.is_surname_token(text) { + return classified( + word, + TokenKind::Surname, + self.is_non_western_name_token(text), + ); + } + if self.is_non_western_name_token(text) { + return classified(word, TokenKind::Name, true); + } + classified(word, TokenKind::Capitalized, false) + } + + fn 
build_chain<'a>( + full_text: &str, + tokens: &'a [ClassifiedToken<'a>], + start: usize, + ) -> Vec<&'a ClassifiedToken<'a>> { + let mut chain = Vec::new(); + let Some(first) = tokens.get(start) else { + return chain; + }; + chain.push(first); + let mut index = start.saturating_add(1); + while index < tokens.len() && chain.len() < MAX_CHAIN { + let Some(next) = tokens.get(index) else { + break; + }; + let Some(previous) = chain.last().copied() else { + break; + }; + let Some(gap) = full_text.get(previous.end..next.start) else { + break; + }; + if horizontal_gap_width(gap) > MAX_HORIZONTAL_CHAIN_GAP { + break; + } + let period_is_part_of_previous = previous.kind == TokenKind::Abbreviation + || (previous.kind == TokenKind::Title && previous.title_abbreviation); + let breaks_on_period = gap.contains('.') + && !is_initial_continuation_gap(previous.text, gap) + && !period_is_part_of_previous; + if gap.contains('\n') + || gap.contains('!') + || gap.contains('?') + || gap.contains(';') + || gap.contains(':') + || breaks_on_period + { + break; + } + if next.kind == TokenKind::JaSuffix + && gap != "-" + && !gap.trim().is_empty() + { + break; + } + if next.kind == TokenKind::Other { + break; + } + chain.push(next); + index = index.saturating_add(1); + } + chain + } + + fn is_likely_cjk_person_name(&self, text: &str) -> bool { + if self.cjk_non_person_terms.contains(text) { + return false; + } + text + .chars() + .next() + .is_some_and(|first| self.cjk_surname_starters.contains(&first)) + } + + fn is_organization(&self, text: &str) -> bool { + let words = segment_words(text); + words + .iter() + .any(|word| self.organization_terms.contains(&word.text.to_lowercase())) + } + + fn is_hyphenated_prefix_name(&self, text: &str) -> bool { + let Some((prefix, tail)) = text.split_once('-') else { + return false; + }; + self.hyphenated_prefixes.contains(&prefix.to_lowercase()) + && tail.chars().next().is_some_and(char::is_uppercase) + } + + fn is_first_name_token(&self, token: &str) -> 
bool { + self.first_names.contains(token) + } + + fn is_surname_token(&self, token: &str) -> bool { + self.surnames.contains(token) + } + + fn is_non_western_name_token(&self, token: &str) -> bool { + self.non_western_names.contains(token) + || self + .non_western_names + .contains(&title_case_with_apostrophe(token)) + } + + fn initial_has_name_context( + &self, + word: &WordSegment<'_>, + full_text: &str, + ) -> bool { + let line = line_before(full_text, word.start); + if let Some(last_word) = trailing_word(line) + && self.lookup_name_token(last_word) + { + return true; + } + let after_dot_start = word.end.saturating_add(1); + let after_dot = full_text + .get(after_dot_start..) + .unwrap_or_default() + .trim_start(); + let Some(next_word) = leading_word(after_dot) else { + return false; + }; + self.lookup_name_token(next_word) + || (next_word.chars().count() == 1 && starts_uppercase(next_word)) + } + + fn lookup_name_token(&self, token: &str) -> bool { + self.is_first_name_token(token) + || self.is_first_name_token(&title_case_simple(token)) + || self.is_non_western_name_token(token) + } +} + +fn supplemental_chain_score( + full_text: &str, + chain: &[&ClassifiedToken<'_>], + data: &PreparedNameCorpusData, +) -> Option { + let has_title = chain.iter().any(|token| token.kind == TokenKind::Title); + let has_abbreviation = chain + .iter() + .any(|token| token.kind == TokenKind::Abbreviation); + let has_non_western = chain.iter().any(|token| token.non_western); + if !has_non_western { + return None; + } + let has_ja_suffix = + chain.iter().any(|token| token.kind == TokenKind::JaSuffix); + let has_arabic_connector = chain + .iter() + .any(|token| token.kind == TokenKind::ArabicConnector); + let capitalized_count = chain + .iter() + .filter(|token| token.kind == TokenKind::Capitalized) + .count(); + let non_western_count = + chain.iter().filter(|token| token.non_western).count(); + let chain_all_common_words = chain + .iter() + .all(|token| 
data.common_words.contains(&token.text.to_lowercase())); + let title_confidence = + has_title && (non_western_count > 0 || capitalized_count > 0); + let high_confidence = (has_ja_suffix + && (capitalized_count > 0 || non_western_count > 0)) + || (has_arabic_connector && non_western_count > 0) + || non_western_count >= 2 + || (non_western_count > 0 + && (capitalized_count > 0 || has_abbreviation) + && !chain_all_common_words); + let score = if title_confidence { + TITLE_NAME_SCORE + } else if high_confidence { + HIGH_CONFIDENCE_NAME_SCORE + } else if non_western_count == 1 + && chain.len() == 1 + && !is_sentence_start(full_text, chain.first()?.start) + { + LOW_CONFIDENCE_NAME_SCORE + } else { + return None; + }; + (score >= HIGH_CONFIDENCE_NAME_SCORE).then_some(score) +} + +fn segment_words(full_text: &str) -> Vec> { + let mut words = Vec::new(); + let mut start = None; + let mut end = 0usize; + for (index, ch) in full_text.char_indices() { + if is_word_char(ch) { + if start.is_none() { + start = Some(index); + } + end = index.saturating_add(ch.len_utf8()); + continue; + } + if let Some(word_start) = start.take() + && let Some(text) = full_text.get(word_start..end) + { + words.push(WordSegment { + text, + start: word_start, + end, + }); + } + } + if let Some(word_start) = start + && let Some(text) = full_text.get(word_start..end) + { + words.push(WordSegment { + text, + start: word_start, + end, + }); + } + words +} + +fn relation_connector<'a>( + word: &WordSegment<'a>, + words: &[WordSegment<'a>], + index: usize, + full_text: &'a str, + data: &PreparedNameCorpusData, +) -> Option<(&'a str, usize, usize)> { + let lower = word.text.to_lowercase(); + if !matches!(lower.as_str(), "s" | "d" | "w" | "r") { + return None; + } + let next = words.get(index.saturating_add(1))?; + if full_text.get(word.end..next.start)? 
!= "/" + || !next.text.eq_ignore_ascii_case("o") + { + return None; + } + let connector = full_text.get(word.start..next.end)?; + data + .relation_connectors + .contains(&connector.to_lowercase()) + .then_some((connector, next.end, 2)) +} + +const fn classified<'a>( + word: &WordSegment<'a>, + kind: TokenKind, + non_western: bool, +) -> ClassifiedToken<'a> { + ClassifiedToken { + text: word.text, + kind, + start: word.start, + end: word.end, + non_western, + title_abbreviation: false, + } +} + +const fn is_chain_start(kind: TokenKind) -> bool { + matches!( + kind, + TokenKind::Title + | TokenKind::Name + | TokenKind::Surname + | TokenKind::Abbreviation + | TokenKind::ArabicConnector + ) +} + +fn covers_same_label(entity: &PipelineEntity, deny: &PipelineEntity) -> bool { + entity.label == deny.label + && deny.start <= entity.start + && deny.end >= entity.end +} + +fn deduplicate_spans(mut entities: Vec) -> Vec { + entities.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| right.end.cmp(&left.end)) + }); + let mut result = Vec::new(); + for entity in entities { + let keep = result + .last() + .is_none_or(|last: &PipelineEntity| entity.start >= last.end); + if keep { + result.push(entity); + } + } + result +} + +fn title_case_with_apostrophe(text: &str) -> String { + let mut result = String::new(); + let mut uppercase_next = true; + for ch in text.chars() { + if uppercase_next { + result.extend(ch.to_uppercase()); + uppercase_next = false; + } else { + result.extend(ch.to_lowercase()); + } + if ch == '\'' { + uppercase_next = true; + } + } + result +} + +fn title_case_simple(text: &str) -> String { + let mut chars = text.chars(); + let Some(first) = chars.next() else { + return String::new(); + }; + let mut result = String::new(); + result.extend(first.to_uppercase()); + result.push_str(&chars.as_str().to_lowercase()); + result +} + +fn starts_uppercase(text: &str) -> bool { + text.chars().next().is_some_and(char::is_uppercase) +} + +fn 
is_all_upper(text: &str) -> bool { + let mut letters = 0usize; + for ch in text.chars() { + if ch.is_alphabetic() { + letters = letters.saturating_add(1); + if !ch.is_uppercase() { + return false; + } + } + } + letters > 0 +} + +fn is_abbreviation(text: &str) -> bool { + let mut chars = text.chars(); + let Some(first) = chars.next() else { + return false; + }; + chars.next() == Some('.') && chars.next().is_none() && first.is_uppercase() +} + +fn is_multi_dot_abbreviation(text: &str) -> bool { + let mut saw_upper = false; + let mut previous_dot = true; + for ch in text.chars() { + if previous_dot { + if !ch.is_uppercase() { + return false; + } + saw_upper = true; + previous_dot = false; + continue; + } + if ch != '.' { + return false; + } + previous_dot = true; + } + saw_upper +} + +fn is_single_letter_initial(text: &str, end: usize, full_text: &str) -> bool { + text.chars().count() == 1 + && starts_uppercase(text) + && full_text + .get(end..) + .is_some_and(|tail| tail.starts_with('.')) +} + +fn is_initial_continuation_gap(text: &str, gap: &str) -> bool { + if text.chars().count() == 1 && starts_uppercase(text) { + let Some(rest) = gap.strip_prefix('.') else { + return false; + }; + let spaces = rest + .chars() + .take_while(|ch| ch.is_whitespace() && *ch != '\n') + .count(); + return (1..=2).contains(&spaces) && rest.chars().count() == spaces; + } + false +} + +fn horizontal_gap_width(gap: &str) -> usize { + if gap.chars().any(|ch| ch == '\n' || !ch.is_whitespace()) { + return 0; + } + gap.chars().count() +} + +fn is_sentence_start(text: &str, pos: usize) -> bool { + if pos == 0 { + return true; + } + let Some(before) = text.get(..pos) else { + return false; + }; + for ch in before.chars().rev() { + if ch.is_whitespace() { + continue; + } + return matches!(ch, '.' | '!' 
| '?'); + } + true +} + +fn is_all_caps_context_line(full_text: &str, start: usize) -> bool { + let line = current_line(full_text, start); + let mut letters = 0usize; + let mut upper = 0usize; + for ch in line.chars() { + if ch.is_alphabetic() { + letters = letters.saturating_add(1); + if ch.is_uppercase() { + upper = upper.saturating_add(1); + } + } + } + if letters < ALL_CAPS_NAME_LINE_MIN_LETTERS { + return false; + } + let upper = + u32::try_from(upper).map_or_else(|_| f64::from(u32::MAX), f64::from); + let letters = + u32::try_from(letters).map_or_else(|_| f64::from(u32::MAX), f64::from); + upper / letters >= ALL_CAPS_NAME_LINE_RATIO +} + +const fn ceil_ratio( + value: usize, + numerator: usize, + denominator: usize, +) -> usize { + value.saturating_mul(numerator).div_ceil(denominator) +} + +fn is_all_caps_line_name_shaped(full_text: &str, start: usize) -> bool { + let line = current_line(full_text, start); + if line.chars().any(|ch| ch.is_ascii_digit()) { + return false; + } + let tokens = segment_words(line).len(); + tokens > 0 && tokens <= ALL_CAPS_NAME_LINE_MAX_TOKENS +} + +fn current_line(full_text: &str, start: usize) -> &str { + let line_start = full_text + .get(..start) + .and_then(|head| head.rfind('\n').map(|index| index.saturating_add(1))) + .unwrap_or(0); + let line_end = full_text + .get(start..) 
+ .and_then(|tail| tail.find('\n').map(|index| start.saturating_add(index))) + .unwrap_or(full_text.len()); + full_text.get(line_start..line_end).unwrap_or_default() +} + +fn line_before(full_text: &str, start: usize) -> &str { + let line_start = full_text + .get(..start) + .and_then(|head| head.rfind('\n').map(|index| index.saturating_add(1))) + .unwrap_or(0); + full_text.get(line_start..start).unwrap_or_default() +} + +fn trailing_word(text: &str) -> Option<&str> { + segment_words(text).last().map(|word| word.text) +} + +fn leading_word(text: &str) -> Option<&str> { + segment_words(text).first().map(|word| word.text) +} + +fn is_word_char(ch: char) -> bool { + ch.is_alphanumeric() || ch == '\'' +} + +const fn is_han(ch: char) -> bool { + matches!( + ch, + '\u{3400}'..='\u{4DBF}' + | '\u{4E00}'..='\u{9FFF}' + | '\u{F900}'..='\u{FAFF}' + | '\u{20000}'..='\u{2A6DF}' + | '\u{2A700}'..='\u{2B73F}' + | '\u{2B740}'..='\u{2B81F}' + | '\u{2B820}'..='\u{2CEAF}' + | '\u{2CEB0}'..='\u{2EBEF}' + | '\u{30000}'..='\u{3134F}' + ) +} + +fn string_set(values: Vec) -> HashSet { + values.into_iter().collect() +} + +fn lower_string_set(values: Vec) -> HashSet { + values + .into_iter() + .map(|value| value.to_lowercase()) + .collect() +} + +fn usize_to_u32(value: usize, field: &'static str) -> Result { + u32::try_from(value).map_err(|_| Error::InvalidStaticData { + field, + reason: String::from("offset exceeds u32 range"), + }) +} + +fn invalid_name_data(reason: &'static str) -> Error { + Error::InvalidStaticData { + field: "name_corpus", + reason: String::from(reason), + } +} + +#[cfg(test)] +#[allow(clippy::expect_used, clippy::indexing_slicing)] +mod tests { + use super::*; + + #[test] + fn supplemental_detects_cjk_name_with_configured_surname() { + let data = PreparedNameCorpusData::new(NameCorpusData { + cjk_surname_starters: vec![String::from("王")], + ..NameCorpusData::default() + }); + + let entities = data + .detect_supplemental("Signed by 王小明 today.", &[]) + .expect("cjk 
detection should succeed"); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "王小明"); + assert!((entities[0].score - CJK_SCORE).abs() < f64::EPSILON); + } + + #[test] + fn supplemental_skips_names_covered_by_deny_list() { + let data = PreparedNameCorpusData::new(NameCorpusData { + cjk_surname_starters: vec![String::from("王")], + ..NameCorpusData::default() + }); + let text = "Signed by 王小明 today."; + let start = + u32::try_from(text.find("王小明").expect("fixture contains name")) + .expect("offset fits"); + let end = start.saturating_add( + u32::try_from("王小明".len()).expect("fixture span length fits"), + ); + let deny = PipelineEntity::detected( + start, + end, + PERSON_LABEL, + "王小明", + 0.9, + DetectionSource::DenyList, + ); + + let entities = data + .detect_supplemental(text, &[deny]) + .expect("cjk detection should succeed"); + + assert!(entities.is_empty()); + } + + #[test] + fn supplemental_detects_non_western_chain() { + let data = PreparedNameCorpusData::new(NameCorpusData { + non_western_names: vec![String::from("Sato"), String::from("Kenji")], + ja_suffixes: vec![String::from("san")], + ..NameCorpusData::default() + }); + + let entities = data + .detect_supplemental("The signer is Sato Kenji.", &[]) + .expect("name detection should succeed"); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Sato Kenji"); + assert!( + (entities[0].score - HIGH_CONFIDENCE_NAME_SCORE).abs() < f64::EPSILON + ); + } + + #[test] + fn supplemental_does_not_cross_signature_column_gap() { + let data = PreparedNameCorpusData::new(NameCorpusData { + non_western_names: vec![ + String::from("Priya"), + String::from("Ramanathan"), + ], + ..NameCorpusData::default() + }); + + let entities = data + .detect_supplemental( + "Name: Priya Ramanathan Name: Jonathan", + &[], + ) + .expect("name detection should succeed"); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Priya Ramanathan"); + } +} diff --git a/crates/anonymize-core/src/prepared.rs 
b/crates/anonymize-core/src/prepared.rs index acd61840..f9fabcb8 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -13,6 +13,9 @@ use crate::legal_forms::{ LegalFormData, PreparedLegalFormData, process_legal_form_matches, }; use crate::money::{MonetaryData, PreparedMonetaryData}; +use crate::name_corpus::{ + NameCorpusData, PreparedNameCorpusData as PreparedNames, +}; use crate::normalize::{ NormalizedSearchText, normalize_for_search_with_byte_map, }; @@ -71,6 +74,7 @@ pub struct PreparedSearch { zone_data: Option, address_context_data: Option, coreference_data: Option, + name_corpus_data: Option, date_data: Option, monetary_data: Option, } @@ -123,6 +127,8 @@ pub struct PreparedSearchConfig { pub address_context_data: Option, #[serde(default)] pub coreference_data: Option, + #[serde(default)] + pub name_corpus_data: Option, pub date_data: Option, pub monetary_data: Option, } @@ -211,6 +217,7 @@ pub struct StaticDetectionResult { pub signature_entities: Vec, pub legal_form_entities: Vec, pub address_seed_entities: Vec, + pub name_corpus_entities: Vec, } #[derive(Clone, Debug, PartialEq)] @@ -242,6 +249,7 @@ struct StaticEntityPasses { signature: TimedEntities, legal_form: TimedEntities, address_seed: TimedEntities, + name_corpus: TimedEntities, } pub struct PreparedSearchBuildResult { @@ -457,6 +465,7 @@ impl PreparedSearch { config.address_context_data, )?, coreference_data: prepare_coreference_data(config.coreference_data)?, + name_corpus_data: config.name_corpus_data.map(PreparedNames::new), date_data, monetary_data, }) @@ -611,6 +620,7 @@ impl PreparedSearch { signature_entities: passes.signature.entities, legal_form_entities: passes.legal_form.entities, address_seed_entities: passes.address_seed.entities, + name_corpus_entities: passes.name_corpus.entities, }) } @@ -698,6 +708,9 @@ impl PreparedSearch { ], )?; + let name_corpus = + self.process_name_corpus_entities(full_text, &deny_list.entities)?; + 
Ok(StaticEntityPasses { regex, custom_regex, @@ -709,6 +722,7 @@ impl PreparedSearch { signature, legal_form, address_seed, + name_corpus, }) } @@ -825,6 +839,24 @@ impl PreparedSearch { }) } + fn process_name_corpus_entities( + &self, + full_text: &str, + deny_list_entities: &[PipelineEntity], + ) -> Result { + let start = Instant::now(); + let entities = if let Some(data) = &self.name_corpus_data { + data.detect_supplemental(full_text, deny_list_entities)? + } else { + Vec::new() + }; + + Ok(TimedEntities { + entities, + elapsed_us: elapsed_us(start), + }) + } + pub fn redact_static_entities( &self, full_text: &str, @@ -1291,6 +1323,12 @@ fn record_static_entity_diagnostics( full_text, Some(passes.address_seed.elapsed_us), ); + diagnostics.record_entities( + DiagnosticStage::EntityNameCorpus, + &passes.name_corpus.entities, + full_text, + Some(passes.name_corpus.elapsed_us), + ); } fn address_seed_context(layers: &[&[PipelineEntity]]) -> Vec { @@ -1983,7 +2021,8 @@ impl StaticDetectionResult { .saturating_add(self.trigger_entities.len()) .saturating_add(self.signature_entities.len()) .saturating_add(self.legal_form_entities.len()) - .saturating_add(self.address_seed_entities.len()); + .saturating_add(self.address_seed_entities.len()) + .saturating_add(self.name_corpus_entities.len()); let mut entities = Vec::with_capacity(capacity); entities.extend(self.regex_entities.iter().cloned()); entities.extend(self.custom_regex_entities.iter().cloned()); @@ -1995,6 +2034,7 @@ impl StaticDetectionResult { entities.extend(self.signature_entities.iter().cloned()); entities.extend(self.legal_form_entities.iter().cloned()); entities.extend(self.address_seed_entities.iter().cloned()); + entities.extend(self.name_corpus_entities.iter().cloned()); entities } } diff --git a/crates/anonymize-core/tests/address_seed_parity.rs b/crates/anonymize-core/tests/address_seed_parity.rs index bf4fab62..2a1d67f4 100644 --- a/crates/anonymize-core/tests/address_seed_parity.rs +++ 
b/crates/anonymize-core/tests/address_seed_parity.rs @@ -31,6 +31,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { zone_data: None, address_context_data: None, coreference_data: None, + name_corpus_data: None, date_data: None, monetary_data: None, } diff --git a/crates/anonymize-core/tests/false_positive_parity.rs b/crates/anonymize-core/tests/false_positive_parity.rs index ffd55b66..9f729b8b 100644 --- a/crates/anonymize-core/tests/false_positive_parity.rs +++ b/crates/anonymize-core/tests/false_positive_parity.rs @@ -33,6 +33,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { zone_data: None, address_context_data: None, coreference_data: None, + name_corpus_data: None, date_data: None, monetary_data: None, } diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 3e8611d7..cb1ea756 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -40,6 +40,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { zone_data: None, address_context_data: None, coreference_data: None, + name_corpus_data: None, date_data: None, monetary_data: None, } @@ -245,6 +246,7 @@ fn prepared_search_runs_normalized_literal_pass() { zone_data: None, address_context_data: None, coreference_data: None, + name_corpus_data: None, date_data: None, monetary_data: None, }) @@ -500,6 +502,7 @@ fn prepared_search_adds_coreference_aliases_with_source_placeholder() { threshold: 0.5, allowed_labels: vec![String::from("organization")], coreference_data: Some(coreference_data()), + name_corpus_data: None, ..empty_config(PreparedSearchSlices::default()) }) .unwrap(); @@ -577,6 +580,7 @@ fn prepared_search_does_not_seed_coreference_from_caller_owned_entities() { threshold: 0.5, allowed_labels: vec![String::from("organization")], coreference_data: Some(coreference_data()), + name_corpus_data: None, ..empty_config(PreparedSearchSlices::default()) 
}) .unwrap(); @@ -614,6 +618,7 @@ fn prepared_search_rejects_role_and_legal_form_coreference_aliases() { threshold: 0.5, allowed_labels: vec![String::from("organization")], coreference_data: Some(coreference_data()), + name_corpus_data: None, ..empty_config(PreparedSearchSlices::default()) }) .unwrap(); @@ -670,6 +675,7 @@ fn prepared_search_artifacts_match_direct_prepare() { zone_data: None, address_context_data: None, coreference_data: None, + name_corpus_data: None, date_data: None, monetary_data: None, }; @@ -821,6 +827,7 @@ fn prepared_search_emits_static_detector_entities() { zone_data: None, address_context_data: None, coreference_data: None, + name_corpus_data: None, date_data: None, monetary_data: None, }) @@ -1521,6 +1528,7 @@ fn prepared_search_redacts_static_entities_end_to_end() { zone_data: None, address_context_data: None, coreference_data: None, + name_corpus_data: None, date_data: None, monetary_data: None, }) @@ -1966,6 +1974,7 @@ fn prepared_search_reports_static_redaction_diagnostics() { zone_data: None, address_context_data: None, coreference_data: None, + name_corpus_data: None, date_data: None, monetary_data: None, }) @@ -2052,6 +2061,7 @@ fn prepared_search_redacts_custom_deny_list_entities() { zone_data: None, address_context_data: None, coreference_data: None, + name_corpus_data: None, date_data: None, monetary_data: None, }) diff --git a/crates/anonymize-core/tests/trigger_parity.rs b/crates/anonymize-core/tests/trigger_parity.rs index 0bf8f8ff..e9f73586 100644 --- a/crates/anonymize-core/tests/trigger_parity.rs +++ b/crates/anonymize-core/tests/trigger_parity.rs @@ -31,6 +31,7 @@ fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { zone_data: None, address_context_data: None, coreference_data: None, + name_corpus_data: None, date_data: None, monetary_data: None, } diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 7207ea94..a23c4fcd 100644 
--- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -1345,7 +1345,7 @@ function describeUnsupportedPipelineStages( if (config.enableTriggerPhrases && !nativeRuntime) { stages.push("triggers"); } - if (config.enableNameCorpus) { + if (config.enableNameCorpus && (!nativeRuntime || !config.enableDenyList)) { stages.push( config.enableDenyList ? "name-corpus-supplemental" : "name-corpus", ); diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 52ef4da8..936249a3 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -1298,6 +1298,70 @@ describe("native adapter parity", () => { }); }); + test("native pipeline package matches TS supplemental name corpus", async () => { + const adapters = getAdapters(); + const fullText = "The agreement is signed by Sato Kenji."; + const config: PipelineConfig = { + threshold: 0.85, + enableTriggerPhrases: false, + enableRegex: false, + enableLegalForms: false, + enableNameCorpus: true, + enableDenyList: true, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["person"], + workspaceId: "native-pipeline-name-corpus-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: 
"[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual([ + expect.objectContaining({ + label: "person", + text: "Sato Kenji", + score: 0.9, + }), + ]); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + test("native pipeline compatibility rejects TS-only contextual passes", () => { const config: PipelineConfig = { threshold: 0.3, @@ -1318,7 +1382,7 @@ describe("native adapter parity", () => { expect(getNativePipelineCompatibility(config)).toEqual({ status: "unsupported", - unsupportedFeatures: ["enableNer", "enableNameCorpus"], + unsupportedFeatures: ["enableNer"], }); }); diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 64767784..fff8a79e 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -19,6 +19,9 @@ import type { PatternEntry, TextSearch } from "@stll/text-search"; import legalFormRuleWords from "./data/legal-form-rule-words.json"; +import nameCorpusCjk from "./data/name-corpus-cjk.json"; +import nameCorpusParticles from "./data/name-corpus-particles.json"; +import organizationIndicators from "./data/organization-indicators.json"; import { getTextSearch } from "./search-engine"; @@ -246,6 +249,23 @@ export type NativeCoreferenceData = { legal_form_aliases: string[]; organization_determiners: string[]; }; +export type NativeNameCorpusData = { + first_names: string[]; + surnames: string[]; + title_tokens: string[]; + title_abbreviations: string[]; + excluded_words: string[]; + common_words: string[]; + non_western_names: string[]; + excluded_all_caps: string[]; + ja_suffixes: string[]; 
+ arabic_connectors: string[]; + relation_connectors: string[]; + hyphenated_prefixes: string[]; + cjk_non_person_terms: string[]; + cjk_surname_starters: string[]; + organization_terms: string[]; +}; export type NativeZonePatternData = { pattern: string; flags: string; @@ -316,6 +336,7 @@ export type NativePreparedSearchConfig = { zone_data?: NativeZoneData; address_context_data?: NativeAddressContextData; coreference_data?: NativeCoreferenceData; + name_corpus_data?: NativeNameCorpusData; date_data?: NativeDateData; monetary_data?: NativeMonetaryData; }; @@ -383,6 +404,30 @@ type CoreferenceConfigRow = { flags: string; }; +type NameCorpusCjkLanguageData = { + nonPersonTerms: string[]; + surnameStarters: string[]; +}; + +type NameCorpusCjkData = Record< + string, + NameCorpusCjkLanguageData | string | undefined +>; + +type NameCorpusParticleLanguageData = { + connectors?: string[]; + relationConnectors?: string[]; + suffixes?: string[]; + hyphenatedPrefixes?: string[]; +}; + +type NameCorpusParticleData = Record< + string, + NameCorpusParticleLanguageData | string | undefined +>; + +type OrganizationIndicatorData = Record; + type SectionHeadingsConfig = { patterns: Array<{ re: string; flags: string }>; }; @@ -418,6 +463,7 @@ type UnifiedSearchSources = { nativeZoneData: NativeZoneData | null; nativeAddressContextData: NativeAddressContextData | null; nativeCoreferenceData: NativeCoreferenceData | null; + nativeNameCorpusData: NativeNameCorpusData | null; nativeSigningPatterns: readonly string[]; partyPositionTerms: string[]; hotwordRules: readonly HotwordRule[]; @@ -442,6 +488,118 @@ export type NativeStaticSearchBundle = { falsePositiveFilters: DenyListFilterData; }; +// eslint-disable-next-line no-unsafe-type-assertion -- JSON config module shape. +const NAME_CORPUS_CJK = nameCorpusCjk as NameCorpusCjkData; +// eslint-disable-next-line no-unsafe-type-assertion -- JSON config module shape. 
+const NAME_CORPUS_PARTICLES = nameCorpusParticles as NameCorpusParticleData; +// eslint-disable-next-line no-unsafe-type-assertion -- JSON config module shape. +const ORGANIZATION_INDICATORS = + organizationIndicators as OrganizationIndicatorData; + +const CJK_LANGUAGE_ALIASES: Record = { + zh: ["zh", "zh-latn", "zh-hans", "zh-hant"], + ja: ["ja", "ja-latn"], + ko: ["ko", "ko-latn"], +}; + +const buildNativeNameCorpusData = ( + config: PipelineConfig, + ctx: PipelineContext, +): NativeNameCorpusData | null => { + if (!config.enableNameCorpus || !config.enableDenyList || !ctx.nameCorpus) { + return null; + } + + const languages = config.nameCorpusLanguages?.map((language) => + language.toLowerCase(), + ); + const cjkNonPersonTerms: string[] = []; + const cjkSurnameStarters: string[] = []; + for (const [language, value] of Object.entries(NAME_CORPUS_CJK)) { + if (!isNameCorpusCjkLanguageData(value)) continue; + if (!languageIsSelected(language, languages, CJK_LANGUAGE_ALIASES)) { + continue; + } + cjkNonPersonTerms.push(...value.nonPersonTerms); + cjkSurnameStarters.push(...value.surnameStarters); + } + + const jaSuffixes: string[] = []; + const arabicConnectors: string[] = []; + const relationConnectors: string[] = []; + const hyphenatedPrefixes: string[] = []; + for (const [language, value] of Object.entries(NAME_CORPUS_PARTICLES)) { + if (!isNameCorpusParticleLanguageData(value)) continue; + if (!languageIsSelected(language, languages)) continue; + jaSuffixes.push(...(value.suffixes ?? [])); + arabicConnectors.push(...(value.connectors ?? [])); + relationConnectors.push(...(value.relationConnectors ?? [])); + hyphenatedPrefixes.push(...(value.hyphenatedPrefixes ?? 
[])); + } + + const organizationTerms: string[] = []; + for (const value of Object.values(ORGANIZATION_INDICATORS)) { + if (Array.isArray(value)) { + organizationTerms.push(...value); + } + } + + return { + first_names: [...ctx.nameCorpus.firstNamesList], + surnames: [...ctx.nameCorpus.surnamesList], + title_tokens: [...ctx.nameCorpus.titlesList], + title_abbreviations: [...ctx.nameCorpus.titleAbbreviations], + excluded_words: [...ctx.nameCorpus.excludedList], + common_words: [...ctx.nameCorpus.commonWords], + non_western_names: [...ctx.nameCorpus.nonWesternNamesList], + excluded_all_caps: [...ctx.nameCorpus.excludedAllCapsList], + ja_suffixes: uniqueStrings(jaSuffixes), + arabic_connectors: uniqueStrings(arabicConnectors), + relation_connectors: uniqueStrings(relationConnectors), + hyphenated_prefixes: uniqueStrings(hyphenatedPrefixes), + cjk_non_person_terms: uniqueStrings(cjkNonPersonTerms), + cjk_surname_starters: uniqueStrings(cjkSurnameStarters), + organization_terms: uniqueStrings(organizationTerms), + }; +}; + +const isNameCorpusCjkLanguageData = ( + value: NameCorpusCjkData[string], +): value is NameCorpusCjkLanguageData => + typeof value === "object" && + value !== null && + Array.isArray(value.nonPersonTerms) && + Array.isArray(value.surnameStarters); + +const isNameCorpusParticleLanguageData = ( + value: NameCorpusParticleData[string], +): value is NameCorpusParticleLanguageData => + typeof value === "object" && value !== null; + +const languageIsSelected = ( + language: string, + selectedLanguages: readonly string[] | undefined, + aliases: Record = {}, +): boolean => { + if (selectedLanguages === undefined) { + return true; + } + const normalized = language.toLowerCase(); + const accepted = aliases[normalized] ?? 
[normalized]; + return accepted.some((entry) => selectedLanguages.includes(entry)); +}; + +const uniqueStrings = (values: readonly string[]): string[] => { + const seen = new Set(); + const result: string[] = []; + for (const value of values) { + if (seen.has(value)) continue; + seen.add(value); + result.push(value); + } + return result; +}; + const buildUnifiedSearchSources = async ( config: PipelineConfig, gazetteerEntries: GazetteerEntry[] = [], @@ -647,6 +805,7 @@ const buildUnifiedSearchSources = async ( year_words_by_language: yearWordData ?? {}, }; const nativeMonetaryData = monetaryData; + const nativeNameCorpusData = buildNativeNameCorpusData(config, ctx); let offset = 0; @@ -801,6 +960,7 @@ const buildUnifiedSearchSources = async ( ...coreferenceData, legal_form_aliases: nativeLegalFormSuffixes, }, + nativeNameCorpusData, nativeSigningPatterns, partyPositionTerms, hotwordRules, @@ -854,6 +1014,7 @@ export const buildNativeStaticSearchBundle = async ( zoneData: sources.nativeZoneData, addressContextData: sources.nativeAddressContextData, coreferenceData: sources.nativeCoreferenceData, + nameCorpusData: sources.nativeNameCorpusData, nativeSigningPatterns: sources.nativeSigningPatterns, partyPositionTerms: sources.partyPositionTerms, hotwordRules: sources.hotwordRules, @@ -942,6 +1103,7 @@ export const buildUnifiedSearch = async ( zoneData: sources.nativeZoneData, addressContextData: sources.nativeAddressContextData, coreferenceData: sources.nativeCoreferenceData, + nameCorpusData: sources.nativeNameCorpusData, nativeSigningPatterns: sources.nativeSigningPatterns, partyPositionTerms: sources.partyPositionTerms, hotwordRules: sources.hotwordRules, @@ -994,6 +1156,7 @@ type BuildNativeStaticConfigArgs = { zoneData: NativeZoneData | null; addressContextData: NativeAddressContextData | null; coreferenceData: NativeCoreferenceData | null; + nameCorpusData: NativeNameCorpusData | null; nativeSigningPatterns: readonly string[]; partyPositionTerms: readonly 
string[]; hotwordRules: readonly HotwordRule[]; @@ -1027,6 +1190,7 @@ const buildNativeStaticConfig = ({ zoneData, addressContextData, coreferenceData, + nameCorpusData, nativeSigningPatterns, partyPositionTerms, hotwordRules, @@ -1240,6 +1404,9 @@ const buildNativeStaticConfig = ({ if (coreferenceData) { nativeConfig.coreference_data = coreferenceData; } + if (nameCorpusData) { + nativeConfig.name_corpus_data = nameCorpusData; + } if (dateData) { nativeConfig.date_data = dateData; } diff --git a/packages/anonymize/src/data/name-corpus-cjk.json b/packages/anonymize/src/data/name-corpus-cjk.json new file mode 100644 index 00000000..aa9c3bb4 --- /dev/null +++ b/packages/anonymize/src/data/name-corpus-cjk.json @@ -0,0 +1,86 @@ +{ + "_comment": "CJK name-corpus heuristics organised by script language. Used by supplemental name-corpus detection.", + "zh": { + "nonPersonTerms": [ + "中国", + "中國", + "中文", + "人民", + "公司", + "香港", + "台湾", + "臺灣" + ], + "surnameStarters": [ + "王", + "李", + "张", + "張", + "刘", + "劉", + "陈", + "陳", + "杨", + "楊", + "黄", + "黃", + "赵", + "趙", + "吴", + "吳", + "周", + "徐", + "孙", + "孫", + "马", + "馬", + "朱", + "胡", + "郭", + "何", + "林", + "高", + "梁", + "郑", + "鄭", + "罗", + "羅", + "宋", + "谢", + "謝", + "唐", + "韩", + "韓", + "曹", + "许", + "許", + "邓", + "鄧", + "萧", + "蕭", + "田" + ] + }, + "ja": { + "nonPersonTerms": ["日本"], + "surnameStarters": ["山", "佐", "鈴", "渡", "伊", "中", "小", "吉"] + }, + "ko": { + "nonPersonTerms": ["韩国", "韓國"], + "surnameStarters": [ + "金", + "朴", + "박", + "김", + "이", + "최", + "정", + "강", + "조", + "윤", + "장", + "임", + "한" + ] + } +} diff --git a/packages/anonymize/src/data/name-corpus-particles.json b/packages/anonymize/src/data/name-corpus-particles.json new file mode 100644 index 00000000..da0f14c3 --- /dev/null +++ b/packages/anonymize/src/data/name-corpus-particles.json @@ -0,0 +1,13 @@ +{ + "_comment": "Language-specific particles and suffixes for supplemental name-corpus detection.", + "ar": { + "connectors": ["bin", "bint", 
"ibn", "al", "el"], + "hyphenatedPrefixes": ["al", "el"] + }, + "in": { + "relationConnectors": ["s/o", "d/o", "w/o", "r/o"] + }, + "ja-latn": { + "suffixes": ["san", "sama", "sensei"] + } +} diff --git a/packages/anonymize/src/data/organization-indicators.json b/packages/anonymize/src/data/organization-indicators.json new file mode 100644 index 00000000..033479c5 --- /dev/null +++ b/packages/anonymize/src/data/organization-indicators.json @@ -0,0 +1,36 @@ +{ + "_comment": "Organisation indicator words used to suppress person-name spans.", + "en": [ + "Group", + "Company", + "LLC", + "LLP", + "LP", + "Inc", + "Ltd", + "Corp", + "Corporation", + "Holdings", + "Partners", + "Association", + "University", + "Bank", + "Fund", + "Trust", + "Agency", + "Government", + "Ministry", + "Office", + "Department", + "Council", + "Board", + "Committee", + "Commission", + "Services", + "Solutions", + "Technologies", + "Systems", + "Analytics", + "Software" + ] +} diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts index 97505ccf..05721b83 100644 --- a/packages/anonymize/src/native-pipeline.ts +++ b/packages/anonymize/src/native-pipeline.ts @@ -102,7 +102,9 @@ export const getNativePipelineCompatibility = ( const unsupportedFeatures: NativePipelineUnsupportedFeature[] = []; if (config.enableNer) unsupportedFeatures.push("enableNer"); - if (config.enableNameCorpus) unsupportedFeatures.push("enableNameCorpus"); + if (config.enableNameCorpus && !config.enableDenyList) { + unsupportedFeatures.push("enableNameCorpus"); + } if (unsupportedFeatures.length === 0) { return { status: "supported" }; } diff --git a/packages/data/config/name-corpus-cjk.json b/packages/data/config/name-corpus-cjk.json new file mode 100644 index 00000000..aa9c3bb4 --- /dev/null +++ b/packages/data/config/name-corpus-cjk.json @@ -0,0 +1,86 @@ +{ + "_comment": "CJK name-corpus heuristics organised by script language. 
Used by supplemental name-corpus detection.", + "zh": { + "nonPersonTerms": [ + "中国", + "中國", + "中文", + "人民", + "公司", + "香港", + "台湾", + "臺灣" + ], + "surnameStarters": [ + "王", + "李", + "张", + "張", + "刘", + "劉", + "陈", + "陳", + "杨", + "楊", + "黄", + "黃", + "赵", + "趙", + "吴", + "吳", + "周", + "徐", + "孙", + "孫", + "马", + "馬", + "朱", + "胡", + "郭", + "何", + "林", + "高", + "梁", + "郑", + "鄭", + "罗", + "羅", + "宋", + "谢", + "謝", + "唐", + "韩", + "韓", + "曹", + "许", + "許", + "邓", + "鄧", + "萧", + "蕭", + "田" + ] + }, + "ja": { + "nonPersonTerms": ["日本"], + "surnameStarters": ["山", "佐", "鈴", "渡", "伊", "中", "小", "吉"] + }, + "ko": { + "nonPersonTerms": ["韩国", "韓國"], + "surnameStarters": [ + "金", + "朴", + "박", + "김", + "이", + "최", + "정", + "강", + "조", + "윤", + "장", + "임", + "한" + ] + } +} diff --git a/packages/data/config/name-corpus-particles.json b/packages/data/config/name-corpus-particles.json new file mode 100644 index 00000000..da0f14c3 --- /dev/null +++ b/packages/data/config/name-corpus-particles.json @@ -0,0 +1,13 @@ +{ + "_comment": "Language-specific particles and suffixes for supplemental name-corpus detection.", + "ar": { + "connectors": ["bin", "bint", "ibn", "al", "el"], + "hyphenatedPrefixes": ["al", "el"] + }, + "in": { + "relationConnectors": ["s/o", "d/o", "w/o", "r/o"] + }, + "ja-latn": { + "suffixes": ["san", "sama", "sensei"] + } +} diff --git a/packages/data/config/organization-indicators.json b/packages/data/config/organization-indicators.json new file mode 100644 index 00000000..033479c5 --- /dev/null +++ b/packages/data/config/organization-indicators.json @@ -0,0 +1,36 @@ +{ + "_comment": "Organisation indicator words used to suppress person-name spans.", + "en": [ + "Group", + "Company", + "LLC", + "LLP", + "LP", + "Inc", + "Ltd", + "Corp", + "Corporation", + "Holdings", + "Partners", + "Association", + "University", + "Bank", + "Fund", + "Trust", + "Agency", + "Government", + "Ministry", + "Office", + "Department", + "Council", + "Board", + "Committee", + 
"Commission", + "Services", + "Solutions", + "Technologies", + "Systems", + "Analytics", + "Software" + ] +} From b55644b3a9bf0b97040bfcaea002ac4499713a90 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 09:32:59 +0200 Subject: [PATCH 072/130] feat: expand native validator coverage --- Cargo.lock | 94 ++++++++++++++++++- crates/anonymize-core/Cargo.toml | 2 +- .../src/__test__/pipeline-config.test.ts | 4 +- .../anonymize/src/build-unified-search.ts | 3 +- packages/anonymize/src/detectors/regex.ts | 40 ++++++++ 5 files changed, 138 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0eb37809..30eeef42 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -78,6 +78,15 @@ dependencies = [ "cpufeatures", ] +[[package]] +name = "block-buffer" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" +dependencies = [ + "hybrid-array", +] + [[package]] name = "cc" version = "1.2.65" @@ -96,6 +105,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "const-oid" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" + [[package]] name = "constant_time_eq" version = "0.4.2" @@ -120,6 +135,15 @@ dependencies = [ "libc", ] +[[package]] +name = "crypto-common" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6e4c961d6cd6c9a86db418387425e8bdeaf05b3c8bc1411e6dca4c252f1453" +dependencies = [ + "hybrid-array", +] + [[package]] name = "ctor" version = "1.0.7" @@ -132,6 +156,17 @@ version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99251f238b74cd219a86fe6ea9328308ebb223fcbb5b8eb5aa400b847a41dded" +[[package]] +name = 
"digest" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", +] + [[package]] name = "fancy-regex" version = "0.18.0" @@ -255,6 +290,15 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hybrid-array" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9155a582abd142abc056962c29e3ce5ff2ad5469f4246b537ed42c5deba857da" +dependencies = [ + "typenum", +] + [[package]] name = "itoa" version = "1.0.18" @@ -271,6 +315,16 @@ dependencies = [ "libc", ] +[[package]] +name = "keccak" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e24a010dd405bd7ed803e5253182815b41bf2e6a80cc3bfc066658e03a198aa" +dependencies = [ + "cfg-if", + "cpufeatures", +] + [[package]] name = "libc" version = "0.2.186" @@ -547,6 +601,28 @@ dependencies = [ "zmij", ] +[[package]] +name = "sha2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha3" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc9bad02c26382724b2d2692c6f179285e4b54eeecd7968f52a50059c3c11759" +dependencies = [ + "digest", + "keccak", + "sponge-cursor", +] + [[package]] name = "shlex" version = "2.0.1" @@ -559,6 +635,12 @@ version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" +[[package]] +name = "sponge-cursor" +version = "0.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a0219bd7d979d58245a4f41f695e1ac9f8befdffadd7f61f1bae9e39abc6620" + [[package]] name = "stella-aho-corasick-core" version = "1.0.4" @@ -640,7 +722,11 @@ dependencies = [ [[package]] name = "stella-stdnum-core" version = "2.1.1" -source = "git+https://github.com/stella/stdnum?rev=b4949ece8981b84c53a21c26f7a5068dba553142#b4949ece8981b84c53a21c26f7a5068dba553142" +source = "git+https://github.com/stella/stdnum?rev=2f3c3f107e3976ac059cc438d77916a592595d59#2f3c3f107e3976ac059cc438d77916a592595d59" +dependencies = [ + "sha2", + "sha3", +] [[package]] name = "stella-text-search-core" @@ -684,6 +770,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "typenum" +version = "1.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" + [[package]] name = "unicode-case-mapping" version = "1.0.0" diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index ebe00d77..dd44fc81 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -13,7 +13,7 @@ categories = ["text-processing"] fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } -stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "b4949ece8981b84c53a21c26f7a5068dba553142" } +stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "2f3c3f107e3976ac059cc438d77916a592595d59" } stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "8a42c28a8e7c5a32c838ae9dd443c21deab391ed" } [lints] diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index e1ae94c0..7b960ea0 
100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -297,7 +297,7 @@ describe("pipeline config semantics", () => { ).toBeGreaterThan(0); }); - test("native config keeps unsupported validator regexes fail-fast", async () => { + test("native config carries stdnum validator metadata", async () => { const search = await buildUnifiedSearch( { ...BASE_CONFIG, @@ -317,8 +317,8 @@ describe("pipeline config semantics", () => { expect(meta).toMatchObject({ label: "national identification number", requires_validation: true, + validator_id: "cn.ric", }); - expect(meta?.validator_id).toBeUndefined(); }); test("content language scopes deny-list search build", async () => { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index fff8a79e..fa54227e 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -1675,7 +1675,8 @@ const nativeSupportsRegexMeta = (meta: RegexMeta): boolean => { meta.validatorId !== undefined && NATIVE_REGEX_VALIDATOR_IDS.has(meta.validatorId) && (meta.validatorInputKind === undefined || - meta.validatorInputKind === "digits-only") + meta.validatorInputKind === "digits-only" || + meta.validatorInputKind === "crypto-wallet-candidate") ); }; diff --git a/packages/anonymize/src/detectors/regex.ts b/packages/anonymize/src/detectors/regex.ts index 384c9c0e..1f8e60d8 100644 --- a/packages/anonymize/src/detectors/regex.ts +++ b/packages/anonymize/src/detectors/regex.ts @@ -283,27 +283,67 @@ const VALIDATOR_IDS = new Map([ export const NATIVE_REGEX_VALIDATOR_IDS: ReadonlySet = new Set([ "au.abn", "au.acn", + "at.businessid", "at.tin", + "at.uid", + "be.nn", + "be.vat", + "bg.vat", "br.cnpj", "br.cpf", + "ch.uid", + "cn.ric", + "crypto.wallet", + "cy.vat", "cz.dic", "cz.rc", "de.idnr", "de.stnr", + "de.svnr", + "de.vat", "dk.cpr", + "dk.vat", + "ee.ik", + "ee.vat", 
"es.cif", "es.dni", "es.nie", "es.nss", + "es.vat", "fi.hetu", "fi.vat", "fi.ytunnus", + "fr.nir", "fr.siren", + "fr.siret", + "fr.tva", "gb.nhs", "gb.nino", + "gb.vat", + "gr.vat", + "hr.vat", + "hu.vat", "ie.pps", + "ie.vat", + "it.codiceFiscale", + "it.iva", + "lt.asmens", + "lt.vat", + "lu.vat", + "lv.vat", + "mt.vat", + "nl.vat", "no.mva", "no.orgnr", + "pl.nip", + "pl.pesel", + "pt.cc", + "pt.vat", + "ro.cnp", + "ro.vat", + "se.personnummer", + "si.vat", + "sk.dic", "us.ein", "us.rtn", ]); From 273c0604c25419b7e4e068d7edeca3ca2b5c3162 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 09:33:06 +0200 Subject: [PATCH 073/130] perf: prewarm prepared package cache --- crates/anonymize-napi/src/lib.rs | 22 ++++++++++++++----- .../__test__/native-adapter-parity.test.ts | 20 +++++++++++++++++ 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index c05bde15..c569b5ae 100644 --- a/crates/anonymize-napi/src/lib.rs +++ b/crates/anonymize-napi/src/lib.rs @@ -346,16 +346,26 @@ fn prepare_static_search_package_bytes_with( let core_config = prepared_search_config_from_binding(binding_config) .map_err(|error| to_napi_contract_error(&error))?; let artifacts = PreparedSearch::prepare_artifacts(core_config.clone()) - .and_then(|artifacts| artifacts.to_bytes()) + .map_err(|error| to_napi_core_error(&error))?; + let artifact_bytes = artifacts + .to_bytes() .map_err(|error| to_napi_core_error(&error))?; let package = if compressed { - prepared_search_core_package_to_compressed_bytes(&core_config, &artifacts) + prepared_search_core_package_to_compressed_bytes( + &core_config, + &artifact_bytes, + ) } else { - prepared_search_core_package_to_bytes(&core_config, &artifacts) + prepared_search_core_package_to_bytes(&core_config, &artifact_bytes) }; - package - .map(Buffer::from) - .map_err(|error| to_napi_contract_error(&error)) + let package = package.map_err(|error| 
to_napi_contract_error(&error))?; + let prepared = PreparedSearch::new_with_artifacts(core_config, &artifacts) + .map_err(|error| to_napi_core_error(&error))?; + prepared_search_cache_insert( + prepared_search_package_cache_key(&package), + Arc::new(prepared), + ); + Ok(Buffer::from(package)) } #[napi] diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 936249a3..8857a422 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -700,10 +700,20 @@ describe("native adapter parity", () => { adapters.native.NativePreparedSearch.fromPreparedPackageBytes( packageBytes, ); + const diagnosticsJson = prepared.prepareDiagnosticsJson?.(); + if (diagnosticsJson === undefined) { + throw new Error("missing prepare diagnostics"); + } + const diagnostics = JSON.parse(diagnosticsJson); expect(prepared.redactStaticEntities(text)).toEqual( direct.redactStaticEntities(text), ); + expect( + diagnostics.events.some( + (event: { stage?: unknown }) => event.stage === "prepare.cache.hit", + ), + ).toBe(true); const expectedJson = JSON.parse( adapters.native.redactStaticEntitiesJson(CONFIG_JSON, text), ); @@ -759,10 +769,20 @@ describe("native adapter parity", () => { adapters.native.NativePreparedSearch.fromPreparedPackageBytes( packageBytes, ); + const diagnosticsJson = prepared.prepareDiagnosticsJson?.(); + if (diagnosticsJson === undefined) { + throw new Error("missing prepare diagnostics"); + } + const diagnostics = JSON.parse(diagnosticsJson); expect(prepared.redactStaticEntities(text)).toEqual( direct.redactStaticEntities(text), ); + expect( + diagnostics.events.some( + (event: { stage?: unknown }) => event.stage === "prepare.cache.hit", + ), + ).toBe(true); const expectedJson = JSON.parse( adapters.native.redactStaticEntitiesJson(CONFIG_JSON, text), ); From e5816fe87f58111e8c1934e22dbfbc518e750f5a Mon 
Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 09:59:56 +0200 Subject: [PATCH 074/130] fix: align native pipeline review edges --- crates/anonymize-core/src/prepared.rs | 19 +++--- .../__test__/native-adapter-parity.test.ts | 66 +++++++++++++++++++ .../src/__test__/pipeline-config.test.ts | 24 +++++++ .../anonymize/src/build-unified-search.ts | 32 ++++++--- 4 files changed, 122 insertions(+), 19 deletions(-) diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index f9fabcb8..73a0b7ca 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -693,6 +693,9 @@ impl PreparedSearch { let legal_form = self.process_legal_form_entities(matches, full_text)?; + let name_corpus = + self.process_name_corpus_entities(full_text, &deny_list.entities)?; + let address_seed = self.process_address_seed_entities( matches, full_text, @@ -705,12 +708,10 @@ impl PreparedSearch { &legal_form.entities, &deny_list.entities, &gazetteer.entities, + &name_corpus.entities, ], )?; - let name_corpus = - self.process_name_corpus_entities(full_text, &deny_list.entities)?; - Ok(StaticEntityPasses { regex, custom_regex, @@ -1317,18 +1318,18 @@ fn record_static_entity_diagnostics( full_text, Some(passes.legal_form.elapsed_us), ); - diagnostics.record_entities( - DiagnosticStage::EntityAddressSeed, - &passes.address_seed.entities, - full_text, - Some(passes.address_seed.elapsed_us), - ); diagnostics.record_entities( DiagnosticStage::EntityNameCorpus, &passes.name_corpus.entities, full_text, Some(passes.name_corpus.elapsed_us), ); + diagnostics.record_entities( + DiagnosticStage::EntityAddressSeed, + &passes.address_seed.entities, + full_text, + Some(passes.address_seed.elapsed_us), + ); } fn address_seed_context(layers: &[&[PipelineEntity]]) -> Vec { diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 8857a422..1559772a 
100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -1382,6 +1382,72 @@ describe("native adapter parity", () => { }); }); + test("native pipeline keeps supplemental names outside address seeds", async () => { + const adapters = getAdapters(); + const fullText = + "Sato Kenji, address: 100 Main Street, Boston, MA 02101-1234."; + const config: PipelineConfig = { + threshold: 0.85, + enableTriggerPhrases: false, + enableRegex: false, + enableLegalForms: false, + enableNameCorpus: true, + enableDenyList: true, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["person", "address"], + workspaceId: "native-pipeline-name-address-boundary-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + const address = tsEntities.find((entity) => entity.label === "address"); + + expect(tsEntities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ label: "person", text: "Sato Kenji" }), + ]), + ); + expect(address?.text).toContain("100 Main Street"); + expect(address?.text).not.toContain("Sato Kenji"); + expect( + 
toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + test("native pipeline compatibility rejects TS-only contextual passes", () => { const config: PipelineConfig = { threshold: 0.3, diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index 7b960ea0..922da425 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -319,6 +319,30 @@ describe("pipeline config semantics", () => { requires_validation: true, validator_id: "cn.ric", }); + expect( + search.nativeStaticConfig.regex_meta.filter( + (entry) => entry.requires_validation === true && !entry.validator_id, + ), + ).toEqual([]); + }); + + test("native config keeps trigger currency terms separate from monetary detection", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: false, + enableTriggerPhrases: true, + labels: [], + }, + [], + createPipelineContext(), + ); + + expect( + search.nativeStaticConfig.trigger_data?.sentence_terminal_currency_terms + .length, + ).toBeGreaterThan(0); + expect(search.nativeStaticConfig.monetary_data).toBeUndefined(); }); test("content language scopes deny-list search build", async () => { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index fa54227e..a11ec247 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -459,6 +459,7 @@ type UnifiedSearchSources = { nativeLegalFormData: NativeLegalFormData | null; nativeDateData: NativeDateData | null; nativeMonetaryData: NativeMonetaryData | null; + nativeSentenceTerminalCurrencyTerms: string[]; nativeAddressSeedData: NativeAddressSeedData | null; nativeZoneData: NativeZoneData 
| null; nativeAddressContextData: NativeAddressContextData | null; @@ -614,6 +615,8 @@ const buildUnifiedSearchSources = async ( ? expandLabelsForHotwordRuleSet(config.labels, hotwordRules) : config.labels; const allowedLabels = createAllowedLabelSet(searchLabels); + const regexMonetaryEnabled = + config.enableRegex && labelIsAllowed("monetary amount", allowedLabels); const customRegexes = config.enableRegex ? (config.customRegexes ?? []).filter((entry) => labelIsAllowed(entry.label, allowedLabels), @@ -661,7 +664,7 @@ const buildUnifiedSearchSources = async ( return buildDenyListFilterData(ctx); })(), buildStreetTypePatterns(), - config.enableRegex && labelIsAllowed("monetary amount", allowedLabels) + regexMonetaryEnabled ? getCurrencyPatternEntries() : Promise.resolve([] as PatternEntry[]), config.enableRegex && labelIsAllowed("date", allowedLabels) @@ -679,8 +682,7 @@ const buildUnifiedSearchSources = async ( config.enableRegex && labelIsAllowed("date", allowedLabels) ? getYearWordData() : Promise.resolve(null), - config.enableTriggerPhrases || - (config.enableRegex && labelIsAllowed("monetary amount", allowedLabels)) + config.enableTriggerPhrases || regexMonetaryEnabled ? getMonetaryData() : Promise.resolve(null), labelIsAllowed("address", allowedLabels) @@ -804,7 +806,9 @@ const buildUnifiedSearchSources = async ( month_names_by_language: dateMonthData, year_words_by_language: yearWordData ?? {}, }; - const nativeMonetaryData = monetaryData; + const nativeMonetaryData = regexMonetaryEnabled ? 
monetaryData : null; + const nativeSentenceTerminalCurrencyTerms = + sentenceTerminalCurrencyTerms(monetaryData); const nativeNameCorpusData = buildNativeNameCorpusData(config, ctx); let offset = 0; @@ -950,6 +954,7 @@ const buildUnifiedSearchSources = async ( nativeLegalFormData, nativeDateData, nativeMonetaryData, + nativeSentenceTerminalCurrencyTerms, nativeAddressSeedData: addressSeedData, nativeZoneData: zoneData, nativeAddressContextData: addressContextData, @@ -1010,6 +1015,8 @@ export const buildNativeStaticSearchBundle = async ( legalFormData: sources.nativeLegalFormData, dateData: sources.nativeDateData, monetaryData: sources.nativeMonetaryData, + sentenceTerminalCurrencyTerms: + sources.nativeSentenceTerminalCurrencyTerms, addressSeedData: sources.nativeAddressSeedData, zoneData: sources.nativeZoneData, addressContextData: sources.nativeAddressContextData, @@ -1099,6 +1106,7 @@ export const buildUnifiedSearch = async ( legalFormData: sources.nativeLegalFormData, dateData: sources.nativeDateData, monetaryData: sources.nativeMonetaryData, + sentenceTerminalCurrencyTerms: sources.nativeSentenceTerminalCurrencyTerms, addressSeedData: sources.nativeAddressSeedData, zoneData: sources.nativeZoneData, addressContextData: sources.nativeAddressContextData, @@ -1152,6 +1160,7 @@ type BuildNativeStaticConfigArgs = { legalFormData: NativeLegalFormData | null; dateData: NativeDateData | null; monetaryData: NativeMonetaryData | null; + sentenceTerminalCurrencyTerms: readonly string[]; addressSeedData: NativeAddressSeedData | null; zoneData: NativeZoneData | null; addressContextData: NativeAddressContextData | null; @@ -1186,6 +1195,7 @@ const buildNativeStaticConfig = ({ legalFormData, dateData, monetaryData, + sentenceTerminalCurrencyTerms, addressSeedData, zoneData, addressContextData, @@ -1385,8 +1395,7 @@ const buildNativeStaticConfig = ({ rules: triggerRules.map(toNativeTriggerRule), address_stop_keywords: [...getAddressStopKeywordsSync()], party_position_terms: 
[...partyPositionTerms], - sentence_terminal_currency_terms: - sentenceTerminalCurrencyTerms(monetaryData), + sentence_terminal_currency_terms: [...sentenceTerminalCurrencyTerms], }; } if (legalFormData) { @@ -1653,11 +1662,14 @@ const toNativeRegexMeta = (meta: RegexMeta): NativeRegexMatchMeta => { } if (meta.validator) { const isSupportedValidator = nativeSupportsRegexMeta(meta); - result.requires_validation = true; - if (isSupportedValidator && meta.validatorId) { - result.validator_id = meta.validatorId; + if (!isSupportedValidator || !meta.validatorId) { + throw new Error( + `Native static config does not support regex validator ${meta.validatorId ?? "unknown"}`, + ); } - if (isSupportedValidator && meta.validatorInputKind) { + result.requires_validation = true; + result.validator_id = meta.validatorId; + if (meta.validatorInputKind) { result.validator_input = meta.validatorInputKind; } } From da57db85a5b814656abe553393af6a5b06090d9c Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 10:25:26 +0200 Subject: [PATCH 075/130] feat: add native package file workflow --- packages/anonymize/README.md | 27 ++++ packages/anonymize/package.json | 6 +- .../scripts/build-native-pipeline-package.mjs | 137 ++++++++++++++++++ packages/anonymize/scripts/dist-smoke.mjs | 8 +- .../src/__test__/native-node.test.ts | 50 ++++++- packages/anonymize/src/index-shared.ts | 2 +- packages/anonymize/src/native-node.ts | 33 +++++ packages/anonymize/src/native-pipeline.ts | 53 +------ packages/anonymize/src/native.ts | 40 +++++ 9 files changed, 307 insertions(+), 49 deletions(-) create mode 100755 packages/anonymize/scripts/build-native-pipeline-package.mjs diff --git a/packages/anonymize/README.md b/packages/anonymize/README.md index f411f81d..3651db4b 100644 --- a/packages/anonymize/README.md +++ b/packages/anonymize/README.md @@ -92,6 +92,33 @@ const entities = await runPipeline({ }); ``` +## Native prepared packages + +For Node.js deployments that need low click-time 
latency, prepare the native pipeline package during your build and load the bytes at runtime. + +```bash +bunx stella-anonymize-build-native-package \ + --config ./anonymize-native-config.mjs \ + --out ./dist/anonymize.stlanonpkg +``` + +```ts +import { + createNativePipelineFromPackageFile, + loadNativeAnonymizeBinding, +} from "@stll/anonymize/native-node"; + +const native = loadNativeAnonymizeBinding(); +const pipeline = createNativePipelineFromPackageFile({ + binding: native, + packagePath: "./dist/anonymize.stlanonpkg", +}); + +const result = pipeline.redactText(text); +``` + +The config module may export a `PipelineConfig` directly or `{ config, gazetteerEntries }`. Include `@stll/anonymize-data` dictionaries there if your runtime config uses the deny-list or name-corpus layers; keep the corresponding layers enabled for caller-owned `customDenyList`, `customRegexes`, and gazetteers. Those inputs are part of the prepared package and should be regenerated when they change. + ## Browser setup If you use Vite with the WASM build, exclude the bundle from dependency pre-bundling: diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index 1ae4e57b..192060d7 100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -26,10 +26,14 @@ } }, "types": "dist/index.d.mts", + "bin": { + "stella-anonymize-build-native-package": "./scripts/build-native-pipeline-package.mjs" + }, "files": [ "dist", "index.cjs", - "*.node" + "*.node", + "scripts/build-native-pipeline-package.mjs" ], "publishConfig": { "access": "public" diff --git a/packages/anonymize/scripts/build-native-pipeline-package.mjs b/packages/anonymize/scripts/build-native-pipeline-package.mjs new file mode 100755 index 00000000..dc5f1e48 --- /dev/null +++ b/packages/anonymize/scripts/build-native-pipeline-package.mjs @@ -0,0 +1,137 @@ +#!/usr/bin/env node +import { mkdirSync, writeFileSync } from "node:fs"; +import { dirname, resolve } from "node:path"; +import { 
pathToFileURL } from "node:url"; + +import { + createPipelineContext, + DEFAULT_ENTITY_LABELS, + prepareNativePipelinePackage, +} from "../dist/index.mjs"; +import { loadNativeAnonymizeBinding } from "../dist/native-node.mjs"; + +const args = parseArgs(process.argv.slice(2)); +const outputPath = resolve(args.out ?? "native-pipeline.stlanonpkg"); +const compressed = args.raw !== true; +const { config, gazetteerEntries } = await loadPackageInput(args); +const binding = loadNativeAnonymizeBinding(); +const packageBytes = await prepareNativePipelinePackage({ + binding, + config, + gazetteerEntries, + context: createPipelineContext(), + compressed, +}); + +mkdirSync(dirname(outputPath), { recursive: true }); +writeFileSync(outputPath, packageBytes); + +console.log( + JSON.stringify({ + event: "native-pipeline-package", + outputPath, + bytes: packageBytes.byteLength, + compressed, + nativeVersion: binding.nativePackageVersion(), + }), +); + +function parseArgs(values) { + const result = {}; + for (let index = 0; index < values.length; index += 1) { + const value = values[index]; + switch (value) { + case "--config": { + result.config = requiredValue(values, index, value); + index += 1; + break; + } + case "--export": { + result.exportName = requiredValue(values, index, value); + index += 1; + break; + } + case "--out": { + result.out = requiredValue(values, index, value); + index += 1; + break; + } + case "--raw": { + result.raw = true; + break; + } + case "--help": { + printHelp(); + process.exit(0); + } + default: + throw new Error(`Unknown option: ${value}`); + } + } + return result; +} + +function requiredValue(values, index, option) { + const value = values[index + 1]; + if (value === undefined || value.startsWith("--")) { + throw new Error(`${option} requires a value`); + } + return value; +} + +async function loadPackageInput(options) { + if (!options.config) { + return { config: defaultNativePipelineConfig(), gazetteerEntries: [] }; + } + const moduleUrl = 
pathToFileURL(resolve(options.config)).href; + // eslint-disable-next-line stll/no-dynamic-import-specifier + const loaded = await import(moduleUrl); + const exportName = options.exportName ?? "default"; + const candidate = + exportName === "default" ? loaded.default : loaded[exportName]; + if (candidate === undefined) { + throw new Error(`Config module does not export ${exportName}`); + } + const value = + typeof candidate === "function" ? await candidate() : await candidate; + if (!value || typeof value !== "object") { + throw new TypeError("Native package config export must be an object"); + } + if ("config" in value) { + return { + config: value.config, + gazetteerEntries: value.gazetteerEntries ?? [], + }; + } + return { config: value, gazetteerEntries: [] }; +} + +function defaultNativePipelineConfig() { + return { + threshold: 0.3, + enableTriggerPhrases: true, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableNer: false, + enableConfidenceBoost: true, + enableCoreference: true, + enableHotwordRules: true, + enableZoneClassification: true, + labels: [...DEFAULT_ENTITY_LABELS], + workspaceId: "native-pipeline-package", + }; +} + +function printHelp() { + console.log(`Usage: build-native-pipeline-package [options] + +Options: + --out Output package path. Defaults to native-pipeline.stlanonpkg. + --config ESM module exporting a PipelineConfig or { config, gazetteerEntries }. + --export Export name to read from the config module. Defaults to default. + --raw Write an uncompressed package. 
+`); +} diff --git a/packages/anonymize/scripts/dist-smoke.mjs b/packages/anonymize/scripts/dist-smoke.mjs index 23c73194..8733ba73 100644 --- a/packages/anonymize/scripts/dist-smoke.mjs +++ b/packages/anonymize/scripts/dist-smoke.mjs @@ -11,7 +11,10 @@ */ import { createPipelineContext, runPipeline } from "../dist/index.mjs"; import { createNativeAnonymizerFromPackage } from "../dist/native.mjs"; -import { loadNativeAnonymizeBinding } from "../dist/native-node.mjs"; +import { + createNativePipelineFromPackageFile, + loadNativeAnonymizeBinding, +} from "../dist/native-node.mjs"; if (typeof createNativeAnonymizerFromPackage !== "function") { throw new TypeError("dist native entrypoint is missing its package loader"); @@ -19,6 +22,9 @@ if (typeof createNativeAnonymizerFromPackage !== "function") { if (typeof loadNativeAnonymizeBinding !== "function") { throw new TypeError("dist native-node entrypoint is missing its loader"); } +if (typeof createNativePipelineFromPackageFile !== "function") { + throw new TypeError("dist native-node entrypoint is missing file loading"); +} const warnings = []; const originalWarn = console.warn; diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index 8d10ed12..2a15dacf 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -1,9 +1,14 @@ import { describe, expect, test } from "bun:test"; +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; import type { NativeAnonymizeBinding } from "../native"; import { + createNativePipelineFromPackageFile, loadNativeAnonymizeBinding, nativePlatformPackageName, + readNativePipelinePackageFile, } from "../native-node"; describe("native node loader", () => { @@ -110,10 +115,50 @@ describe("native node loader", () => { }), ).toThrow("does not match 1.5.0"); }); + + test("loads native 
pipeline package bytes from a file", () => { + const dir = mkdtempSync(join(tmpdir(), "anonymize-native-package-")); + const packagePath = join(dir, "pipeline.stlanonpkg"); + try { + writeFileSync(packagePath, Uint8Array.of(1, 2, 3, 4)); + + expect([...readNativePipelinePackageFile(packagePath)]).toEqual([ + 1, 2, 3, 4, + ]); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + test("creates a native pipeline from a package file", () => { + const dir = mkdtempSync(join(tmpdir(), "anonymize-native-pipeline-")); + const packagePath = join(dir, "pipeline.stlanonpkg"); + const capturedBytes: number[][] = []; + try { + writeFileSync(packagePath, Uint8Array.of(7, 8, 9)); + const binding = fakeNativeBinding("1.5.0", { + onPreparedPackageBytes: (bytes) => { + capturedBytes.push([...bytes]); + }, + }); + + const pipeline = createNativePipelineFromPackageFile({ + binding, + expectedVersion: "1.5.0", + packagePath, + }); + + expect(capturedBytes).toEqual([[7, 8, 9]]); + expect(pipeline.redactText("x").redaction.redactedText).toBe(""); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); }); type FakeNativeBindingOptions = { preparedSearchAsConstructor?: boolean; + onPreparedPackageBytes?: (bytes: Uint8Array) => void; }; const fakeNativeBinding = ( @@ -122,7 +167,10 @@ const fakeNativeBinding = ( ): NativeAnonymizeBinding => { const preparedSearch = { fromConfigJsonBytes: () => fakePreparedSearch(), - fromPreparedPackageBytes: () => fakePreparedSearch(), + fromPreparedPackageBytes: (bytes: Uint8Array) => { + options.onPreparedPackageBytes?.(bytes); + return fakePreparedSearch(); + }, }; const NativePreparedSearch = options.preparedSearchAsConstructor ? 
Object.assign(function NativePreparedSearch() {}, preparedSearch) diff --git a/packages/anonymize/src/index-shared.ts b/packages/anonymize/src/index-shared.ts index 4befc7a8..9d520e94 100644 --- a/packages/anonymize/src/index-shared.ts +++ b/packages/anonymize/src/index-shared.ts @@ -66,6 +66,7 @@ export type { NativeBindingVersionOptions, NativeOperatorConfig, NativePipelineEntity, + NativePipelineFromPackageOptions, NativePreparedSearchBinding, NativeRedactionResult, NativeSearchPackageOptions, @@ -83,7 +84,6 @@ export { export type { NativePipelineBuildOptions, NativePipelineCompatibility, - NativePipelineFromPackageOptions, NativePipelinePackageOptions, NativePipelineUnsupportedFeature, } from "./native-pipeline"; diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts index 4a022dde..810d5427 100644 --- a/packages/anonymize/src/native-node.ts +++ b/packages/anonymize/src/native-node.ts @@ -1,9 +1,12 @@ import { createRequire } from "node:module"; +import { readFileSync } from "node:fs"; import process from "node:process"; import { assertNativeBindingVersion, + createNativePipelineFromPackage, type NativeAnonymizeBinding, + type PreparedNativePipeline, } from "./native"; export * from "./native"; @@ -27,6 +30,11 @@ export type LoadNativeBindingOptions = { requireModule?: NativeRequire; }; +export type NativePipelinePackageFileOptions = LoadNativeBindingOptions & { + binding?: NativeAnonymizeBinding; + packagePath: string; +}; + const LOCAL_NATIVE_LOADER = "../index.cjs"; const PACKAGE_SPECIFIC_NATIVE_PATH = "STELLA_ANONYMIZE_NATIVE_LIBRARY_PATH"; @@ -92,6 +100,31 @@ export const loadNativeAnonymizeBinding = ( throw new Error(`${platformMessage}:\n${errors.join("\n")}`); }; +export const readNativePipelinePackageFile = ( + packagePath: string, +): Uint8Array => new Uint8Array(readFileSync(packagePath)); + +export const createNativePipelineFromPackageFile = ({ + binding, + packagePath, + expectedVersion, + ...loadOptions +}: 
NativePipelinePackageFileOptions): PreparedNativePipeline => { + const resolvedBinding = + binding ?? + loadNativeAnonymizeBinding({ + ...loadOptions, + ...(expectedVersion !== undefined ? { expectedVersion } : {}), + }); + if (binding && expectedVersion !== undefined) { + assertNativeBindingVersion({ binding, expectedVersion }); + } + return createNativePipelineFromPackage({ + binding: resolvedBinding, + packageBytes: readNativePipelinePackageFile(packagePath), + }); +}; + type NativeBindingSpecifiersOptions = { platform: string; arch: string; diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts index 05721b83..75d51dde 100644 --- a/packages/anonymize/src/native-pipeline.ts +++ b/packages/anonymize/src/native-pipeline.ts @@ -8,12 +8,15 @@ import { applyPipelineLanguageScope } from "./language-scope"; import { pipelineConfigKey } from "./pipeline-cache-key"; import type { Dictionaries, GazetteerEntry, PipelineConfig } from "./types"; import { - createNativeAnonymizerFromPackage, + createNativePipelineFromPackage, prepareNativeSearchPackage, - PreparedNativeAnonymizer, + PreparedNativePipeline, type NativeAnonymizeBinding, - type NativeOperatorConfig, - type NativeStaticRedactionResult, +} from "./native"; + +export { + PreparedNativePipeline, + createNativePipelineFromPackage, } from "./native"; export type NativePipelineUnsupportedFeature = "enableNer" | "enableNameCorpus"; @@ -36,10 +39,7 @@ export type NativePipelinePackageOptions = NativePipelineBuildOptions & { compressed?: boolean; }; -export type NativePipelineFromPackageOptions = { - binding: NativeAnonymizeBinding; - packageBytes: Uint8Array; -}; +export type { NativePipelineFromPackageOptions } from "./native"; type NativePipelinePackageCacheValue = Promise | Uint8Array; @@ -67,35 +67,6 @@ const sharedPackageCacheFor = ( return created; }; -export class PreparedNativePipeline { - readonly #anonymizer: PreparedNativeAnonymizer; - - constructor(anonymizer: 
PreparedNativeAnonymizer) { - this.#anonymizer = anonymizer; - } - - prepareDiagnosticsJson(): string | null { - return this.#anonymizer.prepareDiagnosticsJson(); - } - - redactText( - fullText: string, - operators?: NativeOperatorConfig, - ): NativeStaticRedactionResult { - return this.#anonymizer.redactStaticEntities(fullText, operators); - } - - redactTextDiagnosticsJson( - fullText: string, - operators?: NativeOperatorConfig, - ): string | null { - return this.#anonymizer.redactStaticEntitiesDiagnosticsJson( - fullText, - operators, - ); - } -} - export const getNativePipelineCompatibility = ( config: PipelineConfig, ): NativePipelineCompatibility => { @@ -170,14 +141,6 @@ export const createNativePipelineFromConfig = async ({ return createNativePipelineFromPackage({ binding, packageBytes }); }; -export const createNativePipelineFromPackage = ({ - binding, - packageBytes, -}: NativePipelineFromPackageOptions): PreparedNativePipeline => - new PreparedNativePipeline( - createNativeAnonymizerFromPackage({ binding, packageBytes }), - ); - const getCachedNativePipelinePackage = async ({ binding, config, diff --git a/packages/anonymize/src/native.ts b/packages/anonymize/src/native.ts index be58ed60..d13b1894 100644 --- a/packages/anonymize/src/native.ts +++ b/packages/anonymize/src/native.ts @@ -109,6 +109,9 @@ export type NativeAnonymizerFromPackageOptions = { packageBytes: Uint8Array; }; +export type NativePipelineFromPackageOptions = + NativeAnonymizerFromPackageOptions; + export type NativeBindingVersionOptions = { binding: NativeAnonymizeBinding; expectedVersion: string; @@ -149,6 +152,35 @@ export class PreparedNativeAnonymizer { } } +export class PreparedNativePipeline { + readonly #anonymizer: PreparedNativeAnonymizer; + + constructor(anonymizer: PreparedNativeAnonymizer) { + this.#anonymizer = anonymizer; + } + + prepareDiagnosticsJson(): string | null { + return this.#anonymizer.prepareDiagnosticsJson(); + } + + redactText( + fullText: string, + operators?: 
NativeOperatorConfig, + ): NativeStaticRedactionResult { + return this.#anonymizer.redactStaticEntities(fullText, operators); + } + + redactTextDiagnosticsJson( + fullText: string, + operators?: NativeOperatorConfig, + ): string | null { + return this.#anonymizer.redactStaticEntitiesDiagnosticsJson( + fullText, + operators, + ); + } +} + export const encodeNativeSearchConfig = ( config: NativePreparedSearchConfig, ): Uint8Array => new TextEncoder().encode(JSON.stringify(config)); @@ -198,6 +230,14 @@ export const createNativeAnonymizerFromPackage = ({ binding.NativePreparedSearch.fromPreparedPackageBytes(packageBytes), ); +export const createNativePipelineFromPackage = ({ + binding, + packageBytes, +}: NativePipelineFromPackageOptions): PreparedNativePipeline => + new PreparedNativePipeline( + createNativeAnonymizerFromPackage({ binding, packageBytes }), + ); + const toBindingOperatorConfig = ( config: NativeOperatorConfig | undefined, ): NativeBindingOperatorConfig | undefined => { From e603420981ccfa0861fd52b444f3daa0da1b300c Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 10:59:12 +0200 Subject: [PATCH 076/130] fix: align native static review edges --- crates/anonymize-adapter-contract/src/lib.rs | 3 + crates/anonymize-core/src/hotwords.rs | 83 +++++++++++++++--- crates/anonymize-core/src/prepared.rs | 70 ++++++++-------- crates/anonymize-core/tests/prepared.rs | 84 +++++++++++++++---- .../scripts/migration-fixture-perf.mjs | 8 +- .../src/__test__/pipeline-config.test.ts | 72 +++++++++++++--- .../anonymize/src/build-unified-search.ts | 23 +---- packages/anonymize/src/detectors/regex.ts | 4 +- packages/anonymize/src/native-pipeline.ts | 16 ++++ 9 files changed, 270 insertions(+), 93 deletions(-) diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 981f0c09..cef22d5e 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -169,6 +169,8 
@@ pub struct BindingHotwordRuleData { #[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] pub struct BindingHotwordRule { + #[serde(default)] + pub hotwords: Vec, #[serde(default)] pub target_labels: Vec, pub score_adjustment: f64, @@ -1644,6 +1646,7 @@ fn hotword_data_from_binding(data: BindingHotwordRuleData) -> HotwordRuleData { .rules .into_iter() .map(|rule| HotwordRule { + hotwords: rule.hotwords, target_labels: rule.target_labels, score_adjustment: rule.score_adjustment, reclassify_to: rule.reclassify_to, diff --git a/crates/anonymize-core/src/hotwords.rs b/crates/anonymize-core/src/hotwords.rs index f2f489d8..eceb5e29 100644 --- a/crates/anonymize-core/src/hotwords.rs +++ b/crates/anonymize-core/src/hotwords.rs @@ -1,16 +1,21 @@ use crate::byte_offsets::ByteOffsets; -use crate::processors::PatternSlice; use crate::resolution::{PipelineEntity, SourceDetail}; +use crate::search::{ + LiteralSearchOptions, SearchIndex, SearchOptions, SearchPattern, +}; use crate::types::{Error, Result, SearchMatch}; #[derive(Clone, Debug, PartialEq, serde::Deserialize, serde::Serialize)] pub struct HotwordRuleData { pub rules: Vec, + #[serde(default)] pub pattern_rule_indices: Vec, } #[derive(Clone, Debug, PartialEq, serde::Deserialize, serde::Serialize)] pub struct HotwordRule { + #[serde(default)] + pub hotwords: Vec, pub target_labels: Vec, pub score_adjustment: f64, pub reclassify_to: Option, @@ -18,15 +23,65 @@ pub struct HotwordRule { pub proximity_after: u32, } +pub(crate) struct PreparedHotwordData { + rules: Vec, + pattern_rule_indices: Vec, + search: SearchIndex, +} + +impl PreparedHotwordData { + pub(crate) fn new(data: HotwordRuleData) -> Result { + let mut patterns = Vec::new(); + let mut pattern_rule_indices = Vec::new(); + + for (rule_index, rule) in data.rules.iter().enumerate() { + let rule_index = + u32::try_from(rule_index).map_err(|_| Error::InvalidStaticData { + field: "hotword_data.rules", + reason: String::from("rule index exceeds u32 
range"), + })?; + for hotword in &rule.hotwords { + if hotword.is_empty() { + return Err(Error::InvalidStaticData { + field: "hotword_data.rules.hotwords", + reason: String::from("hotword must not be empty"), + }); + } + patterns.push(SearchPattern::LiteralWithOptions { + pattern: hotword.clone(), + case_insensitive: Some(true), + whole_words: Some(true), + }); + pattern_rule_indices.push(rule_index); + } + } + + let search = SearchIndex::new( + patterns, + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: true, + }, + ..SearchOptions::default() + }, + )?; + + Ok(Self { + rules: data.rules, + pattern_rule_indices, + search, + }) + } +} + pub(crate) fn apply_hotword_rules( entities: Vec, full_text: &str, - matches: &[SearchMatch], - slice: PatternSlice, - data: &HotwordRuleData, + data: &PreparedHotwordData, allowed_labels: &[String], ) -> Result> { - let hits_by_rule = collect_hits_by_rule(matches, slice, data)?; + let hits_by_rule = collect_hits_by_rule(full_text, data)?; let offsets = ByteOffsets::new(full_text); let mut result = Vec::with_capacity(entities.len()); @@ -46,15 +101,17 @@ pub(crate) fn apply_hotword_rules( } fn collect_hits_by_rule( - matches: &[SearchMatch], - slice: PatternSlice, - data: &HotwordRuleData, + full_text: &str, + data: &PreparedHotwordData, ) -> Result>> { let mut hits_by_rule = vec![Vec::new(); data.rules.len()]; - for found in matches { - let Some(local_index) = slice.local_index(found.pattern()) else { - continue; + for found in data.search.find_iter(full_text)? 
{ + let Ok(local_index) = usize::try_from(found.pattern()) else { + return Err(Error::InvalidStaticData { + field: "hotword_data.pattern_rule_indices", + reason: String::from("pattern index exceeds usize range"), + }); }; let Some(rule_index) = data.pattern_rule_indices.get(local_index) else { continue; @@ -71,7 +128,7 @@ fn collect_hits_by_rule( reason: String::from("rule index out of range"), }); }; - bucket.push(*found); + bucket.push(found); } Ok(hits_by_rule) @@ -80,7 +137,7 @@ fn collect_hits_by_rule( fn apply_entity_rules( mut entity: PipelineEntity, offsets: &ByteOffsets<'_>, - data: &HotwordRuleData, + data: &PreparedHotwordData, hits_by_rule: &[Vec], ) -> Result { let mut best = None::; diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 73a0b7ca..25aafe6f 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -8,7 +8,9 @@ use crate::coreference::{CoreferenceData, PreparedCoreferenceData}; use crate::dates::{DateData, PreparedDateData}; use crate::diagnostics::{DiagnosticStage, StaticRedactionDiagnostics}; use crate::false_positives::filter_entity_false_positives; -use crate::hotwords::{HotwordRuleData, apply_hotword_rules}; +use crate::hotwords::{ + HotwordRuleData, PreparedHotwordData, apply_hotword_rules, +}; use crate::legal_forms::{ LegalFormData, PreparedLegalFormData, process_legal_form_matches, }; @@ -67,7 +69,7 @@ pub struct PreparedSearch { false_positive_filters: Option, gazetteer_data: Option, country_data: Option, - hotword_data: Option, + hotword_data: Option, trigger_data: Option, legal_form_data: Option, address_seed_data: Option, @@ -453,11 +455,8 @@ impl PreparedSearch { false_positive_filters: config.false_positive_filters, gazetteer_data: config.gazetteer_data, country_data: config.country_data, - hotword_data: config.hotword_data, - trigger_data: config - .trigger_data - .map(PreparedTriggerData::new) - .transpose()?, + hotword_data: 
prepare_hotword_data(config.hotword_data)?, + trigger_data: prepare_trigger_data(config.trigger_data)?, legal_form_data: config.legal_form_data.map(PreparedLegalFormData::new), address_seed_data: prepare_address_seed_data(config.address_seed_data)?, zone_data: prepare_zone_data(config.zone_data.as_ref())?, @@ -1013,19 +1012,12 @@ impl PreparedSearch { &self, entities: Vec, full_text: &str, - literal_matches: &[SearchMatch], + _literal_matches: &[SearchMatch], ) -> Result> { let Some(data) = &self.hotword_data else { return Ok(entities); }; - apply_hotword_rules( - entities, - full_text, - literal_matches, - self.slices.hotwords, - data, - &self.allowed_labels, - ) + apply_hotword_rules(entities, full_text, data, &self.allowed_labels) } fn apply_zone_adjustments( @@ -1557,6 +1549,18 @@ fn prepare_address_seed_data( data.map(PreparedAddressSeedData::new).transpose() } +fn prepare_hotword_data( + data: Option, +) -> Result> { + data.map(PreparedHotwordData::new).transpose() +} + +fn prepare_trigger_data( + data: Option, +) -> Result> { + data.map(PreparedTriggerData::new).transpose() +} + fn prepare_address_context_data( data: Option, ) -> Result> { @@ -1917,7 +1921,11 @@ fn validate_country_config(config: &PreparedSearchConfig) -> Result<()> { } fn validate_hotword_config(config: &PreparedSearchConfig) -> Result<()> { - if config.slices.hotwords.is_empty() { + if !config.slices.hotwords.is_empty() { + return Err(Error::UnsupportedStaticSlice { slice: "hotwords" }); + } + + if config.hotword_data.is_none() { return Ok(()); } @@ -1927,25 +1935,21 @@ fn validate_hotword_config(config: &PreparedSearchConfig) -> Result<()> { }); }; - validate_static_data_length( - "hotword_data.pattern_rule_indices", - config.slices.hotwords, - data.pattern_rule_indices.len(), - )?; - - for rule_index in &data.pattern_rule_indices { - let Ok(rule_index) = usize::try_from(*rule_index) else { + for rule in &data.rules { + if rule.hotwords.is_empty() { return Err(Error::InvalidStaticData { - 
field: "hotword_data.pattern_rule_indices", - reason: String::from("rule index exceeds usize range"), - }); - }; - if rule_index >= data.rules.len() { - return Err(Error::InvalidStaticData { - field: "hotword_data.pattern_rule_indices", - reason: String::from("rule index out of range"), + field: "hotword_data.rules.hotwords", + reason: String::from("native hotword rules require hotword strings"), }); } + for hotword in &rule.hotwords { + if hotword.is_empty() { + return Err(Error::InvalidStaticData { + field: "hotword_data.rules.hotwords", + reason: String::from("hotword must not be empty"), + }); + } + } } Ok(()) diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index cb1ea756..70226c5d 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -1751,28 +1751,23 @@ fn prepared_search_hotword_distance_uses_utf16_offsets() { regex_patterns: vec![SearchPattern::Regex(String::from( r"\b\d{2}\.\d{2}\.\d{4}\b", ))], - literal_patterns: vec![SearchPattern::LiteralWithOptions { - pattern: String::from("born"), - case_insensitive: Some(true), - whole_words: Some(true), - }], allowed_labels: vec![String::from("date of birth")], threshold: 0.8, slices: PreparedSearchSlices { regex: PatternSlice { start: 0, end: 1 }, - hotwords: PatternSlice { start: 0, end: 1 }, ..PreparedSearchSlices::default() }, regex_meta: vec![RegexMatchMeta::new("date", 0.7)], hotword_data: Some(HotwordRuleData { rules: vec![HotwordRule { + hotwords: vec![String::from("born")], target_labels: vec![String::from("date")], score_adjustment: 1.0, reclassify_to: Some(String::from("date of birth")), proximity_before: 40, proximity_after: 40, }], - pattern_rule_indices: vec![0], + pattern_rule_indices: vec![], }), ..empty_config(PreparedSearchSlices::default()) }) @@ -1786,34 +1781,95 @@ fn prepared_search_hotword_distance_uses_utf16_offsets() { assert!(result.resolved_entities.is_empty()); } +#[test] +fn 
prepared_search_hotword_searches_original_text() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\b\d{2}\.\d{2}\.\d{4}\b", + ))], + allowed_labels: vec![String::from("date")], + threshold: 0.96, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("date", 0.95)], + hotword_data: Some(HotwordRuleData { + rules: vec![HotwordRule { + hotwords: vec![String::from("tax ID")], + target_labels: vec![String::from("date")], + score_adjustment: 0.1, + reclassify_to: None, + proximity_before: 60, + proximity_after: 60, + }], + pattern_rule_indices: vec![], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "tax\u{00a0}ID 12.03.1990", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.resolved_entities.is_empty()); +} + +#[test] +fn prepared_search_rejects_legacy_hotword_slice() { + let result = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("born"))], + slices: PreparedSearchSlices { + hotwords: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + hotword_data: Some(HotwordRuleData { + rules: vec![HotwordRule { + hotwords: vec![String::from("born")], + target_labels: vec![String::from("date")], + score_adjustment: 0.1, + reclassify_to: None, + proximity_before: 60, + proximity_after: 60, + }], + pattern_rule_indices: vec![0], + }), + ..empty_config(PreparedSearchSlices::default()) + }); + + assert!(matches!( + result, + Err(Error::UnsupportedStaticSlice { slice: "hotwords" }) + )); +} + #[test] fn prepared_search_applies_hotword_reclassification_before_threshold() { let prepared = PreparedSearch::new(PreparedSearchConfig { regex_patterns: vec![SearchPattern::Regex(String::from( r"\b\d{2}\.\d{2}\.\d{4}\b", ))], - 
literal_patterns: vec![SearchPattern::LiteralWithOptions { - pattern: String::from("narozen"), - case_insensitive: Some(true), - whole_words: Some(true), - }], allowed_labels: vec![String::from("date of birth")], threshold: 0.8, slices: PreparedSearchSlices { regex: PatternSlice { start: 0, end: 1 }, - hotwords: PatternSlice { start: 0, end: 1 }, ..PreparedSearchSlices::default() }, regex_meta: vec![RegexMatchMeta::new("date", 0.7)], hotword_data: Some(HotwordRuleData { rules: vec![HotwordRule { + hotwords: vec![String::from("narozen")], target_labels: vec![String::from("date")], score_adjustment: 0.15, reclassify_to: Some(String::from("date of birth")), proximity_before: 60, proximity_after: 60, }], - pattern_rule_indices: vec![0], + pattern_rule_indices: vec![], }), ..empty_config(PreparedSearchSlices::default()) }) diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index a23c4fcd..cc517cdf 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -107,11 +107,17 @@ const ACCEPTED_NATIVE_STATIC_DELTAS = new Map( }, { fixture: "en/software-license-agreement.txt", - reason: "phone-leading-parenthesis", + reason: "wider-notice-address-spans", candidateExtra: [ + { start: 506, end: 541, label: "address", source: "regex" }, + { start: 1624, end: 1664, label: "address", source: "regex" }, + { start: 1813, end: 1848, label: "address", source: "regex" }, { start: 1857, end: 1871, label: "phone number", source: "regex" }, ], candidateMissing: [ + { start: 515, end: 531, label: "address", source: "deny-list" }, + { start: 1629, end: 1654, label: "address", source: "deny-list" }, + { start: 1822, end: 1838, label: "address", source: "deny-list" }, { start: 1858, end: 1871, label: "phone number", source: "trigger" }, ], }, diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts 
b/packages/anonymize/src/__test__/pipeline-config.test.ts index 922da425..26870ec4 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -10,7 +10,11 @@ import { runPipeline, } from "../index"; import { buildUnifiedSearch } from "../build-unified-search"; -import { REGEX_META } from "../detectors/regex"; +import { + REGEX_META, + getNativeSigningClausePatterns, + getSigningClausePatterns, +} from "../detectors/regex"; import { applyPipelineLanguageScope } from "../language-scope"; import type { NativeAnonymizeBinding } from "../native"; import type { Dictionaries, PipelineConfig } from "../types"; @@ -210,18 +214,66 @@ describe("pipeline config semantics", () => { ); expect(search.nativeStaticConfig.allowed_labels).toEqual(["date of birth"]); - expect(search.nativeStaticConfig.slices.hotwords?.end).toBeGreaterThan( - search.nativeStaticConfig.slices.hotwords?.start ?? 0, - ); + expect(search.nativeStaticConfig.slices.hotwords).toEqual({ + start: search.nativeStaticConfig.slices.hotwords?.start ?? 0, + end: search.nativeStaticConfig.slices.hotwords?.start ?? 0, + }); expect( - search.nativeStaticConfig.hotword_data?.rules.length, - ).toBeGreaterThan(0); + search.nativeStaticConfig.hotword_data?.rules.some((rule) => + rule.hotwords.includes("born"), + ), + ).toBe(true); expect( - search.nativeStaticConfig.hotword_data?.pattern_rule_indices.length, - ).toBe( - (search.nativeStaticConfig.slices.hotwords?.end ?? 0) - - (search.nativeStaticConfig.slices.hotwords?.start ?? 
0), + search.nativeStaticConfig.literal_patterns.some( + (pattern) => pattern.pattern === "born", + ), + ).toBe(false); + expect( + search.nativeStaticConfig.hotword_data?.pattern_rule_indices, + ).toEqual([]); + }); + + test("native signing-place patterns match TypeScript signing patterns", async () => { + const [tsPatterns, nativePatterns] = await Promise.all([ + getSigningClausePatterns(), + getNativeSigningClausePatterns(), + ]); + + expect(nativePatterns).toEqual(tsPatterns); + expect(nativePatterns.some((pattern) => pattern.includes("Signed"))).toBe( + true, ); + expect(nativePatterns.some((pattern) => pattern.includes("À"))).toBe(true); + }); + + test("native pipeline package context cache is scoped by dictionary identity", async () => { + const { binding, counts } = createCountingNativeBinding( + "native-cache-context-dictionaries", + ); + const context = createPipelineContext(); + const cacheDictionaries = { + firstNames: { + en: ["Ada"], + }, + } satisfies Dictionaries; + const config = { + ...BASE_CONFIG, + dictionaries: cacheDictionaries, + enableCountries: false, + labels: ["person"], + }; + + await prepareNativePipelinePackage({ binding, config, context }); + await prepareNativePipelinePackage({ + binding, + config: { + ...config, + dictionaries: { ...cacheDictionaries }, + }, + context, + }); + + expect(counts().compressedPrepare).toBe(2); }); test("native config carries coreference definition data", async () => { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index a11ec247..6a1d1996 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -289,6 +289,7 @@ export type NativeGazetteerData = { }; export type NativeHotwordRule = { + hotwords: string[]; target_labels: string[]; score_adjustment: number; reclassify_to?: string; @@ -1276,15 +1277,6 @@ const buildNativeStaticConfig = ({ ? 
toNativeGlobalLiteralPattern(patternEntryText(pattern)) : toNativeLiteralPattern(pattern), ); - const nativeHotwordPatterns: NativeSearchPattern[] = []; - const nativeHotwordPatternRuleIndices: number[] = []; - for (const [ruleIndex, rule] of hotwordRules.entries()) { - for (const hotword of rule.hotwords) { - nativeHotwordPatterns.push(toNativeHotwordPattern(hotword)); - nativeHotwordPatternRuleIndices.push(ruleIndex); - } - } - let literalOffset = 0; const denyListPatternCount = denyListPatternsFromData ? (denyListData?.originals.length ?? 0) @@ -1311,7 +1303,7 @@ const buildNativeStaticConfig = ({ literalOffset = countriesSlice.end; const hotwordsSlice = { start: literalOffset, - end: literalOffset + nativeHotwordPatterns.length, + end: literalOffset, }; const hasGazetteerFuzzyPatterns = gazetteerData?.isFuzzy.some((isFuzzy) => isFuzzy) ?? false; @@ -1324,7 +1316,6 @@ const buildNativeStaticConfig = ({ ...streetTypeNativePatterns, ...gazetteerNativePatterns, ...countryNativePatterns, - ...nativeHotwordPatterns, ], regex_options: { literal_case_insensitive: true, @@ -1387,7 +1378,7 @@ const buildNativeStaticConfig = ({ if (hotwordRules.length > 0) { nativeConfig.hotword_data = { rules: hotwordRules.map(toNativeHotwordRule), - pattern_rule_indices: nativeHotwordPatternRuleIndices, + pattern_rule_indices: [], }; } if (triggerRules.length > 0) { @@ -1441,15 +1432,9 @@ const toNativeTriggerPattern = (pattern: string): NativeSearchPattern => ({ case_insensitive: true, }); -const toNativeHotwordPattern = (pattern: string): NativeSearchPattern => ({ - kind: "literal-with-options", - pattern, - case_insensitive: true, - whole_words: true, -}); - const toNativeHotwordRule = (rule: HotwordRule): NativeHotwordRule => { const result: NativeHotwordRule = { + hotwords: [...rule.hotwords], target_labels: [...rule.targetLabels], score_adjustment: rule.scoreAdjustment, proximity_before: rule.proximityBefore, diff --git a/packages/anonymize/src/detectors/regex.ts 
b/packages/anonymize/src/detectors/regex.ts index 1f8e60d8..914f8e9e 100644 --- a/packages/anonymize/src/detectors/regex.ts +++ b/packages/anonymize/src/detectors/regex.ts @@ -2186,9 +2186,7 @@ const loadSigningPatterns = async (): Promise => { const loadNativeSigningPatterns = async (): Promise => { const mod = await import("../data/signing-clauses.json"); const data: SigningClauseConfig = mod.default ?? mod; - return buildSigningClausePatterns({ - patterns: data.patterns.filter((entry) => entry.lang === "de"), - }); + return buildSigningClausePatterns(data); }; export const getSigningClausePatterns = (): Promise => { diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts index 75d51dde..630b6bc1 100644 --- a/packages/anonymize/src/native-pipeline.ts +++ b/packages/anonymize/src/native-pipeline.ts @@ -51,6 +51,21 @@ const sharedPackageWithoutDictionaries = new Map< string, NativePipelinePackageCacheValue >(); +const dictionaryCacheIds = new WeakMap(); +let nextDictionaryCacheId = 0; + +const dictionaryCacheKey = (dictionaries: Dictionaries | undefined): string => { + if (dictionaries === undefined) { + return "none"; + } + const existing = dictionaryCacheIds.get(dictionaries); + if (existing !== undefined) { + return `dict:${existing}`; + } + nextDictionaryCacheId += 1; + dictionaryCacheIds.set(dictionaries, nextDictionaryCacheId); + return `dict:${nextDictionaryCacheId}`; +}; const sharedPackageCacheFor = ( dictionaries: Dictionaries | undefined, @@ -242,5 +257,6 @@ const nativePackageCacheKey = ({ [ binding.nativePackageVersion(), compressed ? 
"compressed" : "raw", + dictionaryCacheKey(config.dictionaries), pipelineConfigKey(config, gazetteerEntries), ].join(":"); From 1dc2747a610eb0e2499841f63425a0a2248337e4 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 11:18:32 +0200 Subject: [PATCH 077/130] fix: tighten native adapter edges --- crates/anonymize-core/src/money.rs | 50 ++++++++++++++++++- crates/anonymize-core/tests/prepared.rs | 37 ++++++++++++++ crates/anonymize-napi/src/lib.rs | 5 +- .../__test__/native-adapter-parity.test.ts | 15 +++++- .../src/__test__/native-node.test.ts | 10 ++-- packages/anonymize/src/native-node.ts | 22 +------- 6 files changed, 110 insertions(+), 29 deletions(-) diff --git a/crates/anonymize-core/src/money.rs b/crates/anonymize-core/src/money.rs index 985a74a9..5278eae7 100644 --- a/crates/anonymize-core/src/money.rs +++ b/crates/anonymize-core/src/money.rs @@ -9,6 +9,8 @@ use crate::types::Result; const MONEY_LABEL: &str = "monetary amount"; const MONEY_SCORE: f64 = 0.9; const MAX_LEFT_SCAN_BYTES: usize = 96; +const MAX_MONEY_NUMBER_SCAN_BYTES: usize = 48; +const MAX_UNGROUPED_MONEY_DIGITS: usize = 9; #[derive( Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, @@ -433,15 +435,20 @@ fn parse_number_forward(text: &str, index: usize) -> Option { let mut digits = 0usize; let mut end = index; let mut value_end = index; + let mut current_group_digits = 0usize; + let mut first_component_digits = 0usize; + let mut has_separator = false; + let mut has_grouping_separator = false; for (offset, ch) in str_tail(text, index)?.char_indices() { let char_start = index.saturating_add(offset); - if char_start.saturating_sub(index) > 48 { + if char_start.saturating_sub(index) > MAX_MONEY_NUMBER_SCAN_BYTES { break; } if ch.is_ascii_digit() { digits = digits.saturating_add(1); + current_group_digits = current_group_digits.saturating_add(1); end = char_start.saturating_add(ch.len_utf8()); value_end = end; continue; @@ -459,6 +466,19 @@ fn 
parse_number_forward(text: &str, index: usize) -> Option { ch, ) { + if !has_separator { + first_component_digits = current_group_digits; + } + let next_index = char_start.saturating_add(ch.len_utf8()); + let next_group_digits = digit_run_after_separator(text, next_index, ch); + if current_group_digits > 0 + && current_group_digits <= 3 + && next_group_digits == 3 + { + has_grouping_separator = true; + } + has_separator = true; + current_group_digits = 0; end = char_start.saturating_add(ch.len_utf8()); continue; } @@ -469,6 +489,14 @@ fn parse_number_forward(text: &str, index: usize) -> Option { if digits == 0 { return None; } + let leading_digits = if has_separator { + first_component_digits + } else { + digits + }; + if !has_grouping_separator && leading_digits > MAX_UNGROUPED_MONEY_DIGITS { + return None; + } Some(NumberSpan { start: index, @@ -476,6 +504,26 @@ fn parse_number_forward(text: &str, index: usize) -> Option { }) } +fn digit_run_after_separator( + text: &str, + index: usize, + separator: char, +) -> usize { + let mut count = 0usize; + let mut skipping_spaces = separator.is_whitespace(); + for ch in str_tail(text, index).into_iter().flat_map(str::chars) { + if skipping_spaces && ch.is_whitespace() && ch != '\n' && ch != '\r' { + continue; + } + skipping_spaces = false; + if !ch.is_ascii_digit() { + break; + } + count = count.saturating_add(1); + } + count +} + fn number_separator_continues( text: &str, index: usize, diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 70226c5d..2567ccbc 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -1429,6 +1429,43 @@ fn prepared_search_extracts_money_from_anchored_data() { ); } +#[test] +fn prepared_search_rejects_long_ungrouped_money_numbers() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + monetary_data: Some(MonetaryData { + currencies: CurrencyData { + codes: vec![String::from("USD")], + 
symbols: vec![String::from("$")], + local_names: vec![], + }, + amount_words: AmountWordsData { + written_amount_patterns: vec![], + magnitude_suffixes: vec![], + share_quantity_terms: vec![], + }, + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities( + "Reject USD 123456789012345 and $123456789012345. Keep USD 123456789, $123456789.00 and USD 1,234,567,890.", + ) + .unwrap(); + let entities = result + .anchored_entities + .iter() + .map(|entity| entity.text.as_str()) + .collect::>(); + + assert!(!entities.contains(&"USD 123456789012345")); + assert!(!entities.contains(&"$123456789012345")); + assert!(entities.contains(&"USD 123456789")); + assert!(entities.contains(&"$123456789.00")); + assert!(entities.contains(&"USD 1,234,567,890")); +} + #[test] fn prepared_search_extends_money_to_written_amount_parenthetical() { let prepared = PreparedSearch::new(PreparedSearchConfig { diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index c569b5ae..b3083e95 100644 --- a/crates/anonymize-napi/src/lib.rs +++ b/crates/anonymize-napi/src/lib.rs @@ -631,10 +631,13 @@ impl NativePreparedSearch { let operators = operator_config_from_binding(operators.map(to_binding_operator_config)) .map_err(|error| to_napi_contract_error(&error))?; - let result = self + let mut result = self .inner .redact_static_entities_with_diagnostics(&full_text, &operators) .map_err(|error| to_napi_core_error(&error))?; + let mut diagnostics = self.prepare_diagnostics.clone(); + diagnostics.extend(result.diagnostics); + result.diagnostics = diagnostics; let result = static_redaction_diagnostic_result_to_utf16_binding(result, &full_text) .map_err(|error| to_napi_contract_error(&error))?; diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 1559772a..90abf193 100644 --- 
a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -42,7 +42,7 @@ import { applyPipelineLanguageScope } from "../language-scope"; import { contractTestConfig } from "./contract-config"; import { loadTestDictionaries } from "./load-dictionaries"; -setDefaultTimeout(120_000); +setDefaultTimeout(240_000); type NativeAdapter = Omit< NativeAnonymizeBinding, @@ -714,6 +714,19 @@ describe("native adapter parity", () => { (event: { stage?: unknown }) => event.stage === "prepare.cache.hit", ), ).toBe(true); + const runDiagnosticsJson = + prepared.redactStaticEntitiesDiagnosticsJson?.(text); + if (runDiagnosticsJson === undefined) { + throw new Error("missing prepared run diagnostics"); + } + const runDiagnostics = JSON.parse( + runDiagnosticsJson, + ) as StaticRedactionDiagnosticResult; + expect( + runDiagnostics.diagnostics.events.some( + (event) => event.stage === "prepare.cache.hit", + ), + ).toBe(true); const expectedJson = JSON.parse( adapters.native.redactStaticEntitiesJson(CONFIG_JSON, text), ); diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index 2a15dacf..04a90e8e 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -37,7 +37,7 @@ describe("native node loader", () => { ).toBeNull(); }); - test("loads the platform package after the local loader", () => { + test("loads the bundled native loader", () => { const calls: string[] = []; const binding = fakeNativeBinding("1.5.0"); const loaded = loadNativeAnonymizeBinding({ @@ -47,7 +47,7 @@ describe("native node loader", () => { env: {}, requireModule: (specifier) => { calls.push(specifier); - if (specifier === "@stll/anonymize-darwin-arm64") { + if (specifier === "../index.cjs") { return binding; } throw new Error("not found"); @@ -55,7 +55,7 @@ describe("native node loader", () => { }); 
expect(loaded).toBe(binding); - expect(calls).toEqual(["../index.cjs", "@stll/anonymize-darwin-arm64"]); + expect(calls).toEqual(["../index.cjs"]); }); test("loads an explicit native library path first", () => { @@ -89,7 +89,7 @@ describe("native node loader", () => { env: {}, requireModule: (specifier) => { calls.push(specifier); - if (specifier === "@stll/anonymize-darwin-arm64") { + if (specifier === "../index.cjs") { return binding; } throw new Error("not found"); @@ -107,7 +107,7 @@ describe("native node loader", () => { arch: "arm64", env: {}, requireModule: (specifier) => { - if (specifier === "@stll/anonymize-darwin-arm64") { + if (specifier === "../index.cjs") { return fakeNativeBinding("1.4.0"); } throw new Error("not found"); diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts index 810d5427..3b966038 100644 --- a/packages/anonymize/src/native-node.ts +++ b/packages/anonymize/src/native-node.ts @@ -62,12 +62,7 @@ export const loadNativeAnonymizeBinding = ( const platform = options.platform ?? process.platform; const arch = options.arch ?? process.arch; const env = options.env ?? process.env; - const specifiers = nativeBindingSpecifiers({ - platform, - arch, - env, - ...(options.libc !== undefined ? { libc: options.libc } : {}), - }); + const specifiers = nativeBindingSpecifiers({ env }); const errors: string[] = []; for (const specifier of specifiers) { @@ -126,16 +121,10 @@ export const createNativePipelineFromPackageFile = ({ }; type NativeBindingSpecifiersOptions = { - platform: string; - arch: string; - libc?: NativeLibc; env: Record; }; const nativeBindingSpecifiers = ({ - platform, - arch, - libc, env, }: NativeBindingSpecifiersOptions): string[] => { const specifiers: string[] = []; @@ -144,15 +133,6 @@ const nativeBindingSpecifiers = ({ specifiers.push(overridePath); } specifiers.push(LOCAL_NATIVE_LOADER); - - const packageName = nativePlatformPackageName({ - platform, - arch, - ...(libc !== undefined ? 
{ libc } : {}), - }); - if (packageName) { - specifiers.push(packageName); - } return specifiers; }; From 31527f3450246ccacc38583bfe8d9ecc64e83824 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 11:56:06 +0200 Subject: [PATCH 078/130] fix: load city dictionaries in node builds --- packages/data/dictionaries/index.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/data/dictionaries/index.ts b/packages/data/dictionaries/index.ts index 150e36ae..2f54316b 100644 --- a/packages/data/dictionaries/index.ts +++ b/packages/data/dictionaries/index.ts @@ -847,9 +847,9 @@ export const loadCityDictionary = async ( if (!/^[A-Z]{2}$/.test(cc)) { return []; } - const mod = (await import( - `../dictionaries/cities/${cc}.json` - )) as JsonModule; + const mod = (await import(`../dictionaries/cities/${cc}.json`, { + with: { type: "json" }, + })) as JsonModule; const entries = mod.default; cityCache.set(cc, entries); return entries; From 41861594a400ce24b501410c63db7d3d1d901835 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 11:56:14 +0200 Subject: [PATCH 079/130] feat: build default native pipeline package --- .gitignore | 1 + packages/anonymize/package.json | 1 + .../anonymize/scripts/build-native-node.mjs | 14 ++++ .../scripts/build-native-pipeline-package.mjs | 68 ++++++++++++++----- packages/anonymize/scripts/dist-smoke.mjs | 21 +++++- .../src/__test__/native-node.test.ts | 26 +++++++ packages/anonymize/src/index-shared.ts | 1 + .../anonymize/src/native-default-config.ts | 20 ++++++ packages/anonymize/src/native-node.ts | 46 +++++++++++++ 9 files changed, 179 insertions(+), 19 deletions(-) create mode 100644 packages/anonymize/src/native-default-config.ts diff --git a/.gitignore b/.gitignore index 7f14bb34..a7d4bb93 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ node_modules dist .turbo target/ +*.stlanonpkg # Claude Code local worktrees. 
.claude/worktrees/ diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index 192060d7..75890fa1 100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -33,6 +33,7 @@ "dist", "index.cjs", "*.node", + "native-pipeline.stlanonpkg", "scripts/build-native-pipeline-package.mjs" ], "publishConfig": { diff --git a/packages/anonymize/scripts/build-native-node.mjs b/packages/anonymize/scripts/build-native-node.mjs index ae79c1e6..cbf5a3be 100644 --- a/packages/anonymize/scripts/build-native-node.mjs +++ b/packages/anonymize/scripts/build-native-node.mjs @@ -32,3 +32,17 @@ if (!existsSync(source)) { } copyFileSync(source, join(packageRoot, "stella_anonymize_napi.node")); + +execFileSync( + process.execPath, + [ + join(packageRoot, "scripts", "build-native-pipeline-package.mjs"), + "--out", + join(packageRoot, "native-pipeline.stlanonpkg"), + "--default-dictionaries", + ], + { + cwd: packageRoot, + stdio: "inherit", + }, +); diff --git a/packages/anonymize/scripts/build-native-pipeline-package.mjs b/packages/anonymize/scripts/build-native-pipeline-package.mjs index dc5f1e48..bc1ca0b3 100755 --- a/packages/anonymize/scripts/build-native-pipeline-package.mjs +++ b/packages/anonymize/scripts/build-native-pipeline-package.mjs @@ -5,7 +5,7 @@ import { pathToFileURL } from "node:url"; import { createPipelineContext, - DEFAULT_ENTITY_LABELS, + DEFAULT_NATIVE_PIPELINE_CONFIG, prepareNativePipelinePackage, } from "../dist/index.mjs"; import { loadNativeAnonymizeBinding } from "../dist/native-node.mjs"; @@ -60,6 +60,10 @@ function parseArgs(values) { result.raw = true; break; } + case "--default-dictionaries": { + result.defaultDictionaries = true; + break; + } case "--help": { printHelp(); process.exit(0); @@ -80,6 +84,20 @@ function requiredValue(values, index, option) { } async function loadPackageInput(options) { + const input = await loadBasePackageInput(options); + if (!options.defaultDictionaries || 
input.config.dictionaries !== undefined) { + return input; + } + return { + ...input, + config: { + ...input.config, + dictionaries: await loadDefaultDictionaries(), + }, + }; +} + +async function loadBasePackageInput(options) { if (!options.config) { return { config: defaultNativePipelineConfig(), gazetteerEntries: [] }; } @@ -108,30 +126,44 @@ async function loadPackageInput(options) { function defaultNativePipelineConfig() { return { - threshold: 0.3, - enableTriggerPhrases: true, - enableRegex: true, - enableLegalForms: true, - enableNameCorpus: false, - enableDenyList: false, - enableGazetteer: false, - enableNer: false, - enableConfidenceBoost: true, - enableCoreference: true, - enableHotwordRules: true, - enableZoneClassification: true, - labels: [...DEFAULT_ENTITY_LABELS], + ...DEFAULT_NATIVE_PIPELINE_CONFIG, + labels: [...DEFAULT_NATIVE_PIPELINE_CONFIG.labels], workspaceId: "native-pipeline-package", }; } +async function loadDefaultDictionaries() { + let loaded; + try { + loaded = await import("@stll/anonymize-data/dictionaries"); + } catch (error) { + throw new Error( + `--default-dictionaries requires @stll/anonymize-data: ${formatError(error)}`, + ); + } + if (typeof loaded.loadDictionaryBundle !== "function") { + throw new TypeError( + "@stll/anonymize-data/dictionaries does not export loadDictionaryBundle", + ); + } + return loaded.loadDictionaryBundle(); +} + +function formatError(error) { + if (error instanceof Error) { + return error.message; + } + return String(error); +} + function printHelp() { console.log(`Usage: build-native-pipeline-package [options] Options: - --out Output package path. Defaults to native-pipeline.stlanonpkg. - --config ESM module exporting a PipelineConfig or { config, gazetteerEntries }. - --export Export name to read from the config module. Defaults to default. - --raw Write an uncompressed package. + --out Output package path. Defaults to native-pipeline.stlanonpkg. 
+ --config ESM module exporting a PipelineConfig or { config, gazetteerEntries }. + --export Export name to read from the config module. Defaults to default. + --default-dictionaries Load @stll/anonymize-data into configs that do not provide dictionaries. + --raw Write an uncompressed package. `); } diff --git a/packages/anonymize/scripts/dist-smoke.mjs b/packages/anonymize/scripts/dist-smoke.mjs index 8733ba73..833c1ba1 100644 --- a/packages/anonymize/scripts/dist-smoke.mjs +++ b/packages/anonymize/scripts/dist-smoke.mjs @@ -12,6 +12,7 @@ import { createPipelineContext, runPipeline } from "../dist/index.mjs"; import { createNativeAnonymizerFromPackage } from "../dist/native.mjs"; import { + createNativePipelineFromDefaultPackage, createNativePipelineFromPackageFile, loadNativeAnonymizeBinding, } from "../dist/native-node.mjs"; @@ -25,6 +26,11 @@ if (typeof loadNativeAnonymizeBinding !== "function") { if (typeof createNativePipelineFromPackageFile !== "function") { throw new TypeError("dist native-node entrypoint is missing file loading"); } +if (typeof createNativePipelineFromDefaultPackage !== "function") { + throw new TypeError( + "dist native-node entrypoint is missing default package loading", + ); +} const warnings = []; const originalWarn = console.warn; @@ -77,6 +83,19 @@ if (!person) { ); } +const nativePipeline = createNativePipelineFromDefaultPackage(); +const nativeResult = nativePipeline.redactText( + "A contract was signed by Jan Novak at Praha on 1. 1. 
2025.", +); +if (nativeResult.resolvedEntities.length === 0) { + throw new Error("default native pipeline package did not detect any entity"); +} + console.log( - JSON.stringify({ event: "dist-smoke", ok: true, detected: person.text }), + JSON.stringify({ + event: "dist-smoke", + ok: true, + detected: person.text, + nativeEntityCount: nativeResult.resolvedEntities.length, + }), ); diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index 04a90e8e..15fe9080 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -5,6 +5,7 @@ import { join } from "node:path"; import type { NativeAnonymizeBinding } from "../native"; import { + createNativePipelineFromDefaultPackage, createNativePipelineFromPackageFile, loadNativeAnonymizeBinding, nativePlatformPackageName, @@ -154,6 +155,31 @@ describe("native node loader", () => { rmSync(dir, { recursive: true, force: true }); } }); + + test("creates a native pipeline from the default package path override", () => { + const dir = mkdtempSync(join(tmpdir(), "anonymize-default-pipeline-")); + const packagePath = join(dir, "native-pipeline.stlanonpkg"); + const capturedBytes: number[][] = []; + try { + writeFileSync(packagePath, Uint8Array.of(10, 11, 12)); + const binding = fakeNativeBinding("1.5.0", { + onPreparedPackageBytes: (bytes) => { + capturedBytes.push([...bytes]); + }, + }); + + const pipeline = createNativePipelineFromDefaultPackage({ + binding, + packagePath, + expectedVersion: "1.5.0", + }); + + expect(capturedBytes).toEqual([[10, 11, 12]]); + expect(pipeline.redactText("x").redaction.redactedText).toBe(""); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); }); type FakeNativeBindingOptions = { diff --git a/packages/anonymize/src/index-shared.ts b/packages/anonymize/src/index-shared.ts index 9d520e94..f12d97c6 100644 --- a/packages/anonymize/src/index-shared.ts +++ 
b/packages/anonymize/src/index-shared.ts @@ -72,6 +72,7 @@ export type { NativeSearchPackageOptions, NativeStaticRedactionResult, } from "./native"; +export { DEFAULT_NATIVE_PIPELINE_CONFIG } from "./native-default-config"; export { PreparedNativePipeline, assertNativePipelineSupported, diff --git a/packages/anonymize/src/native-default-config.ts b/packages/anonymize/src/native-default-config.ts new file mode 100644 index 00000000..54118ca3 --- /dev/null +++ b/packages/anonymize/src/native-default-config.ts @@ -0,0 +1,20 @@ +import { DEFAULT_ENTITY_LABELS } from "./constants"; +import type { PipelineConfig } from "./types"; + +export const DEFAULT_NATIVE_PIPELINE_CONFIG: PipelineConfig = { + threshold: 0.3, + enableTriggerPhrases: true, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: true, + enableDenyList: true, + enableGazetteer: false, + enableCountries: true, + enableNer: false, + enableConfidenceBoost: true, + enableCoreference: true, + enableHotwordRules: true, + enableZoneClassification: true, + labels: [...DEFAULT_ENTITY_LABELS], + workspaceId: "native-pipeline-default", +}; diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts index 3b966038..c49a40ad 100644 --- a/packages/anonymize/src/native-node.ts +++ b/packages/anonymize/src/native-node.ts @@ -35,8 +35,19 @@ export type NativePipelinePackageFileOptions = LoadNativeBindingOptions & { packagePath: string; }; +export type DefaultNativePipelinePackageOptions = LoadNativeBindingOptions & { + binding?: NativeAnonymizeBinding; + packagePath?: string; +}; + const LOCAL_NATIVE_LOADER = "../index.cjs"; const PACKAGE_SPECIFIC_NATIVE_PATH = "STELLA_ANONYMIZE_NATIVE_LIBRARY_PATH"; +const DEFAULT_NATIVE_PIPELINE_PACKAGE_URL = new URL( + "../native-pipeline.stlanonpkg", + import.meta.url, +); + +export { DEFAULT_NATIVE_PIPELINE_CONFIG } from "./native-default-config"; export const nativePlatformPackageName = ({ platform, @@ -99,6 +110,16 @@ export const 
readNativePipelinePackageFile = ( packagePath: string, ): Uint8Array => new Uint8Array(readFileSync(packagePath)); +export const readDefaultNativePipelinePackageFile = (): Uint8Array => { + try { + return new Uint8Array(readFileSync(DEFAULT_NATIVE_PIPELINE_PACKAGE_URL)); + } catch (error) { + throw new Error( + `Default native pipeline package is unavailable: ${formatLoadError(error)}`, + ); + } +}; + export const createNativePipelineFromPackageFile = ({ binding, packagePath, @@ -120,6 +141,31 @@ export const createNativePipelineFromPackageFile = ({ }); }; +export const createNativePipelineFromDefaultPackage = ({ + binding, + packagePath, + expectedVersion, + ...loadOptions +}: DefaultNativePipelinePackageOptions = {}): PreparedNativePipeline => { + const resolvedBinding = + binding ?? + loadNativeAnonymizeBinding({ + ...loadOptions, + ...(expectedVersion !== undefined ? { expectedVersion } : {}), + }); + if (binding && expectedVersion !== undefined) { + assertNativeBindingVersion({ binding, expectedVersion }); + } + const packageBytes = + packagePath === undefined + ? 
readDefaultNativePipelinePackageFile() + : readNativePipelinePackageFile(packagePath); + return createNativePipelineFromPackage({ + binding: resolvedBinding, + packageBytes, + }); +}; + type NativeBindingSpecifiersOptions = { env: Record; }; From 162a16b31e9b6ae3810eb72a4b36de59f1599ee6 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 11:56:19 +0200 Subject: [PATCH 080/130] chore: filter prepare stages from fixture diagnostics --- packages/anonymize/scripts/migration-fixture-perf.mjs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index cc517cdf..40cd961a 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -582,7 +582,9 @@ function collectNativeDiagnostics({ runner, fixtures }) { ); fixtureDiagnostics.push({ fixture: relative(FIXTURES_DIR, fixturePath), - stages: diagnosticStageSummaries(report.diagnostics.events), + stages: diagnosticStageSummaries(report.diagnostics.events).filter( + isRunStage, + ), }); } @@ -608,6 +610,10 @@ function collectNativeDiagnostics({ runner, fixtures }) { }; } +function isRunStage(stage) { + return !stage.stage.startsWith("prepare."); +} + function summarizeFixtureDiagnostics(fixtureDiagnostics) { const stageBuckets = new Map(); const byFixture = []; From d8677b05bc5e1a9ed29e7d48acd43f795632a902 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 11:58:47 +0200 Subject: [PATCH 081/130] feat: cache default native pipeline --- .../src/__test__/native-node.test.ts | 38 +++++++++ packages/anonymize/src/native-node.ts | 77 ++++++++++++++++++- 2 files changed, 114 insertions(+), 1 deletion(-) diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index 15fe9080..f8288574 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ 
b/packages/anonymize/src/__test__/native-node.test.ts @@ -7,8 +7,10 @@ import type { NativeAnonymizeBinding } from "../native"; import { createNativePipelineFromDefaultPackage, createNativePipelineFromPackageFile, + getDefaultNativePipeline, loadNativeAnonymizeBinding, nativePlatformPackageName, + preloadDefaultNativePipeline, readNativePipelinePackageFile, } from "../native-node"; @@ -180,6 +182,42 @@ describe("native node loader", () => { rmSync(dir, { recursive: true, force: true }); } }); + + test("caches the default native pipeline per binding and package path", () => { + const dir = mkdtempSync(join(tmpdir(), "anonymize-default-cache-")); + const packagePath = join(dir, "native-pipeline.stlanonpkg"); + const capturedBytes: number[][] = []; + try { + writeFileSync(packagePath, Uint8Array.of(13, 14, 15)); + const binding = fakeNativeBinding("1.5.0", { + onPreparedPackageBytes: (bytes) => { + capturedBytes.push([...bytes]); + }, + }); + + const first = getDefaultNativePipeline({ + binding, + packagePath, + expectedVersion: "1.5.0", + }); + const second = getDefaultNativePipeline({ + binding, + packagePath, + expectedVersion: "1.5.0", + }); + const preloaded = preloadDefaultNativePipeline({ + binding, + packagePath, + expectedVersion: "1.5.0", + }); + + expect(second).toBe(first); + expect(preloaded).toBe(first); + expect(capturedBytes).toEqual([[13, 14, 15]]); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); }); type FakeNativeBindingOptions = { diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts index c49a40ad..d701101b 100644 --- a/packages/anonymize/src/native-node.ts +++ b/packages/anonymize/src/native-node.ts @@ -40,12 +40,22 @@ export type DefaultNativePipelinePackageOptions = LoadNativeBindingOptions & { packagePath?: string; }; +type ResolvedDefaultNativePipelineOptions = { + binding: NativeAnonymizeBinding; + packagePath?: string; +}; + const LOCAL_NATIVE_LOADER = "../index.cjs"; const 
PACKAGE_SPECIFIC_NATIVE_PATH = "STELLA_ANONYMIZE_NATIVE_LIBRARY_PATH"; const DEFAULT_NATIVE_PIPELINE_PACKAGE_URL = new URL( "../native-pipeline.stlanonpkg", import.meta.url, ); +const DEFAULT_NATIVE_PIPELINE_PACKAGE_CACHE_KEY = ""; +const defaultNativePipelineCache = new WeakMap< + NativeAnonymizeBinding, + Map +>(); export { DEFAULT_NATIVE_PIPELINE_CONFIG } from "./native-default-config"; @@ -156,16 +166,81 @@ export const createNativePipelineFromDefaultPackage = ({ if (binding && expectedVersion !== undefined) { assertNativeBindingVersion({ binding, expectedVersion }); } + return createNativePipelineFromResolvedDefaultPackage({ + binding: resolvedBinding, + ...(packagePath !== undefined ? { packagePath } : {}), + }); +}; + +export const getDefaultNativePipeline = ({ + binding, + packagePath, + expectedVersion, + ...loadOptions +}: DefaultNativePipelinePackageOptions = {}): PreparedNativePipeline => { + const resolvedBinding = + binding ?? + loadNativeAnonymizeBinding({ + ...loadOptions, + ...(expectedVersion !== undefined ? { expectedVersion } : {}), + }); + if (binding && expectedVersion !== undefined) { + assertNativeBindingVersion({ binding, expectedVersion }); + } + const cache = defaultPipelineCacheFor(resolvedBinding); + const key = defaultPipelineCacheKey({ + binding: resolvedBinding, + ...(packagePath !== undefined ? { packagePath } : {}), + }); + const cached = cache.get(key); + if (cached !== undefined) { + return cached; + } + const pipeline = createNativePipelineFromResolvedDefaultPackage({ + binding: resolvedBinding, + ...(packagePath !== undefined ? { packagePath } : {}), + }); + cache.set(key, pipeline); + return pipeline; +}; + +export const preloadDefaultNativePipeline = getDefaultNativePipeline; + +const createNativePipelineFromResolvedDefaultPackage = ({ + binding, + packagePath, +}: ResolvedDefaultNativePipelineOptions): PreparedNativePipeline => { const packageBytes = packagePath === undefined ? 
readDefaultNativePipelinePackageFile() : readNativePipelinePackageFile(packagePath); return createNativePipelineFromPackage({ - binding: resolvedBinding, + binding, packageBytes, }); }; +const defaultPipelineCacheFor = ( + binding: NativeAnonymizeBinding, +): Map => { + const cached = defaultNativePipelineCache.get(binding); + if (cached !== undefined) { + return cached; + } + const created = new Map(); + defaultNativePipelineCache.set(binding, created); + return created; +}; + +const defaultPipelineCacheKey = ({ + binding, + packagePath, +}: ResolvedDefaultNativePipelineOptions): string => + [ + binding.nativePackageVersion(), + packagePath ?? DEFAULT_NATIVE_PIPELINE_PACKAGE_CACHE_KEY, + ].join("\0"); + type NativeBindingSpecifiersOptions = { env: Record; }; From 9787f80914b2dd9019603a48a663439744918559 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 12:17:19 +0200 Subject: [PATCH 082/130] fix: align native trigger edge cases --- crates/anonymize-adapter-contract/src/lib.rs | 7 + crates/anonymize-core/src/triggers.rs | 130 ++++++++++-------- .../tests/false_positive_parity.rs | 2 + crates/anonymize-core/tests/prepared.rs | 14 +- crates/anonymize-core/tests/trigger_parity.rs | 102 ++++++++++++++ .../src/__test__/pipeline-config.test.ts | 34 +++++ .../anonymize/src/build-unified-search.ts | 6 +- 7 files changed, 237 insertions(+), 58 deletions(-) diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index cef22d5e..3acddc03 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -187,6 +187,8 @@ pub struct BindingTriggerData { #[serde(default)] pub party_position_terms: Vec, #[serde(default)] + pub post_nominals: Vec, + #[serde(default)] pub sentence_terminal_currency_terms: Vec, } @@ -622,6 +624,8 @@ struct BinaryTriggerData { rules: Vec, address_stop_keywords: Vec, party_position_terms: Vec, + #[serde(default)] + post_nominals: Vec, 
sentence_terminal_currency_terms: Vec, } @@ -869,6 +873,7 @@ impl From for BinaryTriggerData { .collect(), address_stop_keywords: data.address_stop_keywords, party_position_terms: data.party_position_terms, + post_nominals: data.post_nominals, sentence_terminal_currency_terms: data.sentence_terminal_currency_terms, } } @@ -884,6 +889,7 @@ impl From for BindingTriggerData { .collect(), address_stop_keywords: data.address_stop_keywords, party_position_terms: data.party_position_terms, + post_nominals: data.post_nominals, sentence_terminal_currency_terms: data.sentence_terminal_currency_terms, } } @@ -1996,6 +2002,7 @@ fn trigger_data_from_binding( address_stop_keywords: data.address_stop_keywords, party_position_terms: data.party_position_terms, legal_form_suffixes, + post_nominals: data.post_nominals, sentence_terminal_currency_terms: data.sentence_terminal_currency_terms, } } diff --git a/crates/anonymize-core/src/triggers.rs b/crates/anonymize-core/src/triggers.rs index 684d7692..4d70fc9e 100644 --- a/crates/anonymize-core/src/triggers.rs +++ b/crates/anonymize-core/src/triggers.rs @@ -22,6 +22,8 @@ pub struct TriggerData { pub address_stop_keywords: Vec, pub party_position_terms: Vec, pub legal_form_suffixes: Vec, + #[serde(default)] + pub post_nominals: Vec, pub sentence_terminal_currency_terms: Vec, } @@ -75,6 +77,7 @@ pub(crate) struct PreparedTriggerData { address_stop_keywords: Vec, party_position_terms: Vec, legal_form_suffixes: Vec, + post_nominals: Vec, sentence_terminal_currency_terms: Vec, } @@ -124,6 +127,7 @@ struct ExtractedValue { struct TriggerExtractionData<'a> { address_stop_keywords: &'a [String], party_position_terms: &'a [String], + post_nominals: &'a [String], sentence_terminal_currency_terms: &'a [String], } @@ -139,6 +143,11 @@ impl PreparedTriggerData { address_stop_keywords: data.address_stop_keywords, party_position_terms: data.party_position_terms, legal_form_suffixes: data.legal_form_suffixes, + post_nominals: data + .post_nominals + 
.into_iter() + .filter(|term| !term.trim().is_empty()) + .collect(), sentence_terminal_currency_terms: data .sentence_terminal_currency_terms .into_iter() @@ -223,6 +232,7 @@ pub(crate) fn process_trigger_matches( let extraction_data = TriggerExtractionData { address_stop_keywords: &data.address_stop_keywords, party_position_terms: &data.party_position_terms, + post_nominals: &data.post_nominals, sentence_terminal_currency_terms: &data.sentence_terminal_currency_terms, }; @@ -349,10 +359,11 @@ fn extract_value( ) -> Result> { let trigger_end_byte = offsets.validate_offset(trigger_end)?; let lookahead = get_trigger_lookahead(strategy); - let lookahead_end = floor_char_boundary( - text, - text.len().min(trigger_end_byte.saturating_add(lookahead)), - ); + let lookahead_end_offset = offsets.offset_after_utf16_units( + trigger_end, + u32::try_from(lookahead).unwrap_or(u32::MAX), + )?; + let lookahead_end = offsets.validate_offset(lookahead_end_offset)?; let remaining = text .get(trigger_end_byte..lookahead_end) .unwrap_or_default(); @@ -375,6 +386,7 @@ fn extract_value( label, stop_words, max_length.unwrap_or(MAX_TRIGGER_VALUE_LEN), + data.post_nominals, data.sentence_terminal_currency_terms, ), PreparedTriggerStrategy::ToEndOfLine => { @@ -407,6 +419,7 @@ fn extract_to_next_comma( label: &str, stop_words: &[String], length_cap: usize, + post_nominals: &[String], sentence_terminal_currency_terms: &[String], ) -> Option { let mut end = 0; @@ -436,7 +449,7 @@ fn extract_to_next_comma( continue; } if label == "person" - && let Some(skip) = post_nominal_len(after) + && let Some(skip) = post_nominal_len(after, post_nominals) { end = end.saturating_add(skip); continue; @@ -488,7 +501,7 @@ fn extract_n_words( value_text: &str, value_start_byte: usize, count: usize, - label: &str, + _label: &str, ) -> Option { let cell_end = value_text.find('\t').unwrap_or(value_text.len()); let cell = value_text.get(..cell_end)?; @@ -502,11 +515,11 @@ fn extract_n_words( let relative = 
cell.get(search_pos..)?.find(word)?; let start = search_pos.saturating_add(relative); words.push(WordToken { - text: word, + _text: word, start, end: start.saturating_add(word.len()), }); - if words.len() >= date_aware_word_count(label, count, &words) { + if words.len() >= count { break; } } @@ -521,39 +534,11 @@ fn extract_n_words( #[derive(Clone, Copy)] struct WordToken<'a> { - text: &'a str, + _text: &'a str, start: usize, end: usize, } -fn date_aware_word_count( - label: &str, - configured_count: usize, - words: &[WordToken<'_>], -) -> usize { - if configured_count != 1 || !matches!(label, "date" | "date of birth") { - return configured_count; - } - if words - .first() - .is_some_and(|word| starts_written_day_token(word.text)) - { - return 3; - } - configured_count -} - -fn starts_written_day_token(text: &str) -> bool { - let token = text.trim_end_matches(|ch: char| { - matches!(ch, ',' | ';' | ':' | ')' | ']' | '"' | '\'' | '”' | '’') - }); - let Some(day) = token.strip_suffix('.') else { - return false; - }; - let digit_count = day.chars().filter(char::is_ascii_digit).count(); - (1..=2).contains(&digit_count) && day.chars().all(|ch| ch.is_ascii_digit()) -} - fn extract_company_id_value( text: &str, trigger_end_byte: usize, @@ -680,7 +665,7 @@ fn extract_match_pattern( .split_once('\n') .map_or(value_text, |(head, _)| head); let found = regex.find(line).ok().flatten()?; - if found.start() == found.end() { + if found.start() != 0 || found.start() == found.end() { return None; } Some(ByteValue { @@ -920,13 +905,6 @@ fn previous_char_boundary(text: &str, byte: usize) -> usize { .map_or(0, |(index, _)| index) } -const fn floor_char_boundary(text: &str, mut byte: usize) -> usize { - while byte > 0 && !text.is_char_boundary(byte) { - byte = byte.saturating_sub(1); - } - byte -} - fn is_word_byte(text: &str, byte: usize) -> bool { text .get(byte..) 
@@ -985,18 +963,45 @@ fn is_decimal_comma(text: &str) -> bool { .is_some_and(|ch| ch.is_ascii_digit() || matches!(ch, '-' | '–' | '—')) } -fn post_nominal_len(text: &str) -> Option { +fn post_nominal_len(text: &str, post_nominals: &[String]) -> Option { let trimmed = text.strip_prefix(',')?.trim_start(); let len_before = text.len().saturating_sub(trimmed.len()); - let mut token_end = 0; - for (index, ch) in trimmed.char_indices() { - if ch.is_alphabetic() || ch == '.' { - token_end = index.saturating_add(ch.len_utf8()); + post_nominals + .iter() + .filter_map(|term| post_nominal_prefix_len(trimmed, term)) + .max() + .map(|term_len| len_before.saturating_add(term_len)) +} + +fn post_nominal_prefix_len(text: &str, term: &str) -> Option { + let mut text_index = 0usize; + for expected in term.chars() { + if expected == '.' { + let next = text.get(text_index..)?.chars().next()?; + if next != '.' { + return None; + } + text_index = text_index.saturating_add(next.len_utf8()); + let rest = text.get(text_index..)?; + text_index = text_index + .saturating_add(rest.len().saturating_sub(rest.trim_start().len())); continue; } - break; + + let next = text.get(text_index..)?.chars().next()?; + if !next.eq_ignore_ascii_case(&expected) { + return None; + } + text_index = text_index.saturating_add(next.len_utf8()); } - (token_end > 0).then_some(len_before.saturating_add(token_end)) + + if text + .get(text_index..) + .is_some_and(|tail| tail.starts_with('.')) + { + text_index = text_index.saturating_add(1); + } + Some(text_index) } fn is_sentence_terminator( @@ -1126,6 +1131,13 @@ fn phone_shape_end(text: &str) -> Option { } let mut end = first.len_utf8(); for (index, ch) in chars { + if ch == '.' + && text + .get(index.saturating_add(ch.len_utf8())..) + .is_some_and(|tail| tail.starts_with(char::is_whitespace)) + { + break; + } if ch.is_ascii_digit() || ch.is_whitespace() || matches!(ch, '(' | ')' | '.' 
| '/' | '-' | '–' | '—' | '‑') @@ -1343,6 +1355,7 @@ fn number_label_len(text: &str) -> Option { fn id_value_prefix(text: &str) -> Option<&str> { let mut end = 0; let mut digits = 0_usize; + let mut leading_alpha = 0_usize; let mut previous_was_digit = false; for (index, ch) in text.char_indices() { let allowed = if ch.is_ascii_digit() { @@ -1351,6 +1364,9 @@ fn id_value_prefix(text: &str) -> Option<&str> { true } else if ch.is_ascii_alphabetic() { let allow = digits == 0 || previous_was_digit; + if digits == 0 { + leading_alpha = leading_alpha.saturating_add(1); + } previous_was_digit = false; allow } else if matches!(ch, ' ' | '.' | '-' | '/' | '\t') { @@ -1365,8 +1381,11 @@ fn id_value_prefix(text: &str) -> Option<&str> { end = index.saturating_add(ch.len_utf8()); } let candidate = text.get(..end)?; - (digits >= 2 && end >= 5 && !single_digit_dotted_prefix(candidate)) - .then_some(candidate) + (digits >= 2 + && end >= 5 + && leading_alpha <= 3 + && !single_digit_dotted_prefix(candidate)) + .then_some(candidate) } fn single_digit_dotted_prefix(text: &str) -> bool { @@ -1537,6 +1556,7 @@ mod tests { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }) .unwrap(); @@ -1609,6 +1629,7 @@ mod tests { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }) .unwrap(); @@ -1648,6 +1669,7 @@ mod tests { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }) .unwrap(); diff --git a/crates/anonymize-core/tests/false_positive_parity.rs b/crates/anonymize-core/tests/false_positive_parity.rs index 9f729b8b..5303c78e 100644 --- a/crates/anonymize-core/tests/false_positive_parity.rs +++ 
b/crates/anonymize-core/tests/false_positive_parity.rs @@ -88,6 +88,7 @@ fn keeps_trigger_address_with_extra_component_anchor() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }), deny_list_data: Some(empty_deny_list_data(DenyListFilterData { @@ -205,6 +206,7 @@ fn rejects_only_ambiguous_street_type_trigger_addresses() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }), deny_list_data: Some(empty_deny_list_data(DenyListFilterData { diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 2567ccbc..0232fe9b 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -1081,6 +1081,7 @@ fn prepared_search_extracts_written_date_of_birth_trigger() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) @@ -1104,7 +1105,7 @@ fn prepared_search_extracts_written_date_of_birth_trigger() { } #[test] -fn prepared_search_extends_single_word_written_date_trigger() { +fn prepared_search_honors_single_word_written_date_trigger_count() { let prepared = PreparedSearch::new(PreparedSearchConfig { regex_patterns: vec![SearchPattern::LiteralWithOptions { pattern: String::from("geboren am"), @@ -1126,6 +1127,7 @@ fn prepared_search_extends_single_word_written_date_trigger() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) @@ -1143,8 +1145,7 @@ fn 
prepared_search_extends_single_word_written_date_trigger() { result .resolved_entities .iter() - .any(|entity| entity.label == "date of birth" - && entity.text == "21. März 1968") + .any(|entity| entity.label == "date of birth" && entity.text == "21.") ); } @@ -1187,6 +1188,7 @@ fn prepared_search_extracts_year_after_duplicate_year_word_noise() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) @@ -1230,6 +1232,7 @@ fn prepared_search_trigger_caps_by_characters_not_bytes() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) @@ -1274,6 +1277,7 @@ fn prepared_search_trigger_validations_count_characters_not_bytes() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) @@ -1315,6 +1319,7 @@ fn prepared_search_rejects_lowercase_acronym_trigger_collisions() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) @@ -1363,6 +1368,7 @@ fn prepared_search_trims_party_position_before_triggered_address() { address_stop_keywords: Vec::new(), party_position_terms: vec![String::from("prodávajícího")], legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) @@ -1973,6 +1979,7 @@ fn prepared_search_keeps_person_name_particles_after_trigger() { address_stop_keywords: Vec::new(), 
party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }), ..empty_config(PreparedSearchSlices::default()) @@ -2479,6 +2486,7 @@ fn prepared_search_expands_plain_postal_city_addresses() { address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), sentence_terminal_currency_terms: Vec::new(), }), deny_list_data: Some(DenyListMatchData { diff --git a/crates/anonymize-core/tests/trigger_parity.rs b/crates/anonymize-core/tests/trigger_parity.rs index e9f73586..7830b9a7 100644 --- a/crates/anonymize-core/tests/trigger_parity.rs +++ b/crates/anonymize-core/tests/trigger_parity.rs @@ -63,6 +63,11 @@ fn prepared_for_trigger( address_stop_keywords: Vec::new(), party_position_terms: Vec::new(), legal_form_suffixes: Vec::new(), + post_nominals: vec![ + String::from("Ph.D."), + String::from("CSc."), + String::from("MBA"), + ], sentence_terminal_currency_terms: vec![String::from("Kč")], }), ..empty_config(PreparedSearchSlices::default()) @@ -134,6 +139,64 @@ fn labelled_phone_trigger_keeps_extension_suffixes() { } } +#[test] +fn labelled_phone_trigger_stops_before_numbered_sentences() { + let prepared = + prepared_for_trigger("PHONE", "phone number", TriggerStrategy::ToEndOfLine); + + let result = prepared + .detect_static_entities("PHONE: +36 1 234 5678. 1. 
Definitions") + .expect("static detection should succeed"); + + assert_eq!(trigger_texts(&result), ["+36 1 234 5678"]); +} + +#[test] +fn person_trigger_only_skips_known_post_nominals_after_comma() { + let prepared = prepared_for_trigger( + "represented by", + "person", + TriggerStrategy::ToNextComma { + stop_words: Vec::new(), + max_length: Some(100), + }, + ); + + let prose = prepared + .detect_static_entities("represented by John Smith, and shall continue.") + .expect("static detection should succeed"); + let degree = prepared + .detect_static_entities( + "represented by John Smith, Ph.D., and shall continue.", + ) + .expect("static detection should succeed"); + + assert_eq!(trigger_texts(&prose), ["John Smith"]); + assert_eq!(trigger_texts(&degree), ["John Smith, Ph.D."]); +} + +#[test] +fn match_pattern_trigger_requires_match_at_value_start() { + let prepared = prepared_for_trigger( + "Telephone", + "phone number", + TriggerStrategy::MatchPattern { + pattern: String::from(r"\d+"), + flags: None, + }, + ); + + let rejected = prepared + .detect_static_entities("Telephone : non communique SIREN : 123456789") + .expect("static detection should succeed"); + let accepted = prepared + .detect_static_entities("Telephone : 123456789 SIREN") + .expect("static detection should succeed"); + + assert!(rejected.trigger_entities.is_empty()); + assert_eq!(trigger_texts(&accepted), ["123456789"]); +} + #[test] fn to_next_comma_stops_after_short_currency_abbreviation_sentence_tail() { let prepared = prepared_for_trigger( @@ -189,6 +252,25 @@ fn company_id_trigger_rejects_single_digit_dotted_date() { assert!(result.trigger_entities.is_empty()); } +#[test] +fn company_id_trigger_caps_leading_alpha_prefixes() { + let prepared = prepared_for_trigger( + "Company No.", + "registration number", + TriggerStrategy::CompanyIdValue, + ); + + let rejected = prepared + .detect_static_entities("Company No. 
ReferenceCode12345") + .expect("static detection should succeed"); + let accepted = prepared + .detect_static_entities("Company No. AB12345") + .expect("static detection should succeed"); + + assert!(rejected.trigger_entities.is_empty()); + assert_eq!(trigger_texts(&accepted), ["AB12345"]); +} + #[test] fn address_trigger_stops_after_short_proper_noun_before_real_sentence() { let prepared = prepared_for_trigger( @@ -209,3 +291,23 @@ fn address_trigger_stops_after_short_proper_noun_before_real_sentence() { result.trigger_entities, ); } + +#[test] +fn trigger_lookahead_counts_text_units_not_utf8_bytes() { + let prepared = prepared_for_trigger( + "residing at", + "address", + TriggerStrategy::Address { + max_chars: Some(120), + }, + ); + let dense_prefix = "京".repeat(90); + let expected = format!("{dense_prefix} Main Street 1"); + let text = format!("residing at {expected}\nNext line."); + + let result = prepared + .detect_static_entities(&text) + .expect("static detection should succeed"); + + assert_eq!(trigger_texts(&result), [expected]); +} diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index 26870ec4..69b29129 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -397,6 +397,40 @@ describe("pipeline config semantics", () => { expect(search.nativeStaticConfig.monetary_data).toBeUndefined(); }); + test("native date data gates year words on trigger phrases", async () => { + const regexOnly = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: true, + enableTriggerPhrases: false, + labels: ["date"], + }, + [], + createPipelineContext(), + ); + const withTriggers = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: true, + enableTriggerPhrases: true, + labels: ["date"], + }, + [], + createPipelineContext(), + ); + + expect( + Object.values( + 
regexOnly.nativeStaticConfig.date_data?.year_words_by_language ?? {}, + ).flat(), + ).toEqual([]); + expect( + Object.values( + withTriggers.nativeStaticConfig.date_data?.year_words_by_language ?? {}, + ).flat().length, + ).toBeGreaterThan(0); + }); + test("content language scopes deny-list search build", async () => { const testDictionaries = await getDictionaries(); const config = { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 6a1d1996..f06804e6 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -37,6 +37,7 @@ import type { TriggerRule } from "./types"; import type { DenyListData, DenyListFilterData } from "./detectors/deny-list"; import type { PipelineContext } from "./context"; import { defaultContext } from "./context"; +import { POST_NOMINALS } from "./config/titles"; import { loadLanguageConfigs } from "./util/lang-loader"; import { @@ -209,6 +210,7 @@ export type NativeTriggerData = { rules: NativeTriggerRule[]; address_stop_keywords: string[]; party_position_terms: string[]; + post_nominals: string[]; sentence_terminal_currency_terms: string[]; }; @@ -805,7 +807,8 @@ const buildUnifiedSearchSources = async ( ? null : { month_names_by_language: dateMonthData, - year_words_by_language: yearWordData ?? {}, + year_words_by_language: + config.enableTriggerPhrases === true ? (yearWordData ?? {}) : {}, }; const nativeMonetaryData = regexMonetaryEnabled ? 
monetaryData : null; const nativeSentenceTerminalCurrencyTerms = @@ -1386,6 +1389,7 @@ const buildNativeStaticConfig = ({ rules: triggerRules.map(toNativeTriggerRule), address_stop_keywords: [...getAddressStopKeywordsSync()], party_position_terms: [...partyPositionTerms], + post_nominals: [...POST_NOMINALS], sentence_terminal_currency_terms: [...sentenceTerminalCurrencyTerms], }; } From 4de1e8d5275fd07a717dd601fd447e4de912efb1 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 12:17:30 +0200 Subject: [PATCH 083/130] fix: simplify native binding loading --- .../src/__test__/native-node.test.ts | 26 -------------- packages/anonymize/src/native-node.ts | 36 ++----------------- 2 files changed, 3 insertions(+), 59 deletions(-) diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index f8288574..a75a076f 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -9,37 +9,11 @@ import { createNativePipelineFromPackageFile, getDefaultNativePipeline, loadNativeAnonymizeBinding, - nativePlatformPackageName, preloadDefaultNativePipeline, readNativePipelinePackageFile, } from "../native-node"; describe("native node loader", () => { - test("maps supported platform package names", () => { - expect( - nativePlatformPackageName({ platform: "darwin", arch: "arm64" }), - ).toBe("@stll/anonymize-darwin-arm64"); - expect(nativePlatformPackageName({ platform: "darwin", arch: "x64" })).toBe( - "@stll/anonymize-darwin-x64", - ); - expect(nativePlatformPackageName({ platform: "linux", arch: "x64" })).toBe( - "@stll/anonymize-linux-x64-gnu", - ); - expect( - nativePlatformPackageName({ platform: "linux", arch: "arm64" }), - ).toBe("@stll/anonymize-linux-arm64-gnu"); - expect(nativePlatformPackageName({ platform: "win32", arch: "x64" })).toBe( - "@stll/anonymize-win32-x64-msvc", - ); - expect( - nativePlatformPackageName({ - platform: "linux", - 
arch: "x64", - libc: "musl", - }), - ).toBeNull(); - }); - test("loads the bundled native loader", () => { const calls: string[] = []; const binding = fakeNativeBinding("1.5.0"); diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts index d701101b..cc69e991 100644 --- a/packages/anonymize/src/native-node.ts +++ b/packages/anonymize/src/native-node.ts @@ -15,12 +15,6 @@ export type NativeRequire = (specifier: string) => unknown; export type NativeLibc = "gnu" | "musl"; -export type NativePlatformPackageOptions = { - platform: string; - arch: string; - libc?: NativeLibc; -}; - export type LoadNativeBindingOptions = { expectedVersion?: string; platform?: string; @@ -59,23 +53,6 @@ const defaultNativePipelineCache = new WeakMap< export { DEFAULT_NATIVE_PIPELINE_CONFIG } from "./native-default-config"; -export const nativePlatformPackageName = ({ - platform, - arch, - libc = "gnu", -}: NativePlatformPackageOptions): string | null => { - if (platform === "darwin" && (arch === "arm64" || arch === "x64")) { - return `@stll/anonymize-darwin-${arch}`; - } - if (platform === "linux" && (arch === "arm64" || arch === "x64")) { - return libc === "gnu" ? `@stll/anonymize-linux-${arch}-gnu` : null; - } - if (platform === "win32" && arch === "x64") { - return "@stll/anonymize-win32-x64-msvc"; - } - return null; -}; - export const loadNativeAnonymizeBinding = ( options: LoadNativeBindingOptions = {}, ): NativeAnonymizeBinding => { @@ -104,16 +81,9 @@ export const loadNativeAnonymizeBinding = ( return binding; } - const packageName = nativePlatformPackageName({ - platform, - arch, - ...(options.libc !== undefined ? { libc: options.libc } : {}), - }); - const platformMessage = - packageName === null - ? 
`Unsupported native anonymize platform ${platform}/${arch}` - : `Unable to load native anonymize binding for ${platform}/${arch}`; - throw new Error(`${platformMessage}:\n${errors.join("\n")}`); + throw new Error( + `Unable to load native anonymize binding for ${platform}/${arch}:\n${errors.join("\n")}`, + ); }; export const readNativePipelinePackageFile = ( From 5d404fcdf8476991bff632d88318d1196efeaa6c Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 12:32:05 +0200 Subject: [PATCH 084/130] chore: pin rust toolchain --- .github/workflows/ci.yml | 4 ++-- rust-toolchain.toml | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 rust-toolchain.toml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d5bf0670..fe7a8dcb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,8 +45,8 @@ jobs: - name: Setup Rust run: | - rustup toolchain install stable --profile minimal --component rustfmt,clippy - rustup default stable + rustup toolchain install 1.96.0 --profile minimal --component rustfmt,clippy + rustup default 1.96.0 - name: Rust checks run: bun run rust:check diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 00000000..0e9dcbcd --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,4 @@ +[toolchain] +channel = "1.96.0" +components = ["rustfmt", "clippy"] +profile = "minimal" From ba7ded548f74dddfc3e62740296f922c36250efb Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 12:32:13 +0200 Subject: [PATCH 085/130] fix: tighten native review edge cases --- .../examples/native_adapter_perf.rs | 18 ++++-- crates/anonymize-adapter-contract/src/lib.rs | 58 +++++++++++++++---- crates/anonymize-core/src/address_context.rs | 11 +++- crates/anonymize-core/src/address_seeds.rs | 3 + crates/anonymize-core/src/byte_offsets.rs | 10 +--- crates/anonymize-core/src/coreference.rs | 2 +- crates/anonymize-core/src/false_positives.rs | 24 +++++--- 
crates/anonymize-core/src/normalize.rs | 11 ++++ crates/anonymize-core/src/prepared.rs | 8 +-- crates/anonymize-core/src/processors.rs | 32 +++++----- crates/anonymize-core/src/redact.rs | 9 ++- .../anonymize-core/src/resolution/boundary.rs | 50 ++++++---------- .../anonymize-core/src/resolution/sanitize.rs | 2 +- crates/anonymize-core/src/resolution/types.rs | 1 + crates/anonymize-core/src/search.rs | 2 +- crates/anonymize-core/src/triggers.rs | 4 +- .../tests/false_positive_parity.rs | 1 + crates/anonymize-core/tests/prepared.rs | 33 ++++++++++- crates/anonymize-core/tests/redaction.rs | 36 ++++++++++++ 19 files changed, 214 insertions(+), 101 deletions(-) diff --git a/crates/anonymize-adapter-contract/examples/native_adapter_perf.rs b/crates/anonymize-adapter-contract/examples/native_adapter_perf.rs index 6d0a204e..528de76d 100644 --- a/crates/anonymize-adapter-contract/examples/native_adapter_perf.rs +++ b/crates/anonymize-adapter-contract/examples/native_adapter_perf.rs @@ -36,17 +36,25 @@ fn main() -> Result<(), Box> { PreparedSearch::new(prepared_search_config_from_binding(config)?)?; let prepare_ms = elapsed_ms(prepare_start); - let run_start = Instant::now(); - let mut entity_count = 0_usize; - for _ in 0..payload.iterations { - for item in &payload.cases { + let run_cases = payload + .cases + .iter() + .map(|item| -> Result<_, Box> { let operators = item .operators_json .as_deref() .map(serde_json::from_str::) .transpose()?; let operators = operator_config_from_binding(operators)?; - let result = prepared.redact_static_entities(&item.text, &operators)?; + Ok((item.text.as_str(), operators)) + }) + .collect::, _>>()?; + + let run_start = Instant::now(); + let mut entity_count = 0_usize; + for _ in 0..payload.iterations { + for (text, operators) in &run_cases { + let result = prepared.redact_static_entities(text, operators)?; entity_count = entity_count.saturating_add(result.redaction.entity_count); } } diff --git 
a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 3acddc03..1908a41d 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -1819,7 +1819,7 @@ pub fn static_redaction_diagnostic_result_to_utf16_binding( &mut result.result.resolved_entities, &offsets, )?; - convert_diagnostic_offsets(&mut result.diagnostics.events, &offsets); + convert_diagnostic_offsets(&mut result.diagnostics.events, &offsets)?; Ok(result) } @@ -1842,7 +1842,7 @@ pub fn static_redaction_diagnostics_to_utf16_binding( ) -> Result { let offsets = Utf16OffsetMap::new(full_text)?; let mut diagnostics = static_redaction_diagnostics_to_binding(diagnostics); - convert_diagnostic_offsets(&mut diagnostics.events, &offsets); + convert_diagnostic_offsets(&mut diagnostics.events, &offsets)?; Ok(diagnostics) } @@ -1883,19 +1883,16 @@ fn convert_pipeline_entity_offsets( fn convert_diagnostic_offsets( events: &mut [BindingDiagnosticEvent], offsets: &Utf16OffsetMap, -) { +) -> Result<()> { for event in events { - if let Some(start) = event.start - && let Some(converted) = offsets.try_convert(start) - { - event.start = Some(converted); + if let Some(start) = event.start { + event.start = Some(offsets.convert(start)?); } - if let Some(end) = event.end - && let Some(converted) = offsets.try_convert(end) - { - event.end = Some(converted); + if let Some(end) = event.end { + event.end = Some(offsets.convert(end)?); } } + Ok(()) } struct Utf16OffsetMap { @@ -2358,7 +2355,12 @@ mod tests { prepared_search_core_package_to_compressed_bytes, prepared_search_package_from_bytes, prepared_search_package_has_core_payload, prepared_search_package_to_bytes, - prepared_search_package_to_compressed_bytes, write_package_header, + prepared_search_package_to_compressed_bytes, + static_redaction_diagnostics_to_utf16_binding, write_package_header, + }; + use stella_anonymize_core::{ + DiagnosticEvent, DiagnosticEventKind, 
DiagnosticStage, + StaticRedactionDiagnostics, }; #[test] @@ -2424,6 +2426,38 @@ mod tests { assert!(core.custom_regex_options.regex.overlap_all); } + #[test] + fn utf16_diagnostics_reject_invalid_byte_offsets() { + let diagnostics = StaticRedactionDiagnostics { + events: vec![DiagnosticEvent { + stage: DiagnosticStage::EntityRegex, + kind: DiagnosticEventKind::Entity, + count: None, + engine: None, + pattern: None, + source: None, + source_detail: None, + label: None, + start: Some(1), + end: Some(2), + text: None, + score: None, + span_valid: None, + elapsed_us: None, + input_bytes: None, + reason: None, + }], + }; + + let error = static_redaction_diagnostics_to_utf16_binding(diagnostics, "á") + .unwrap_err(); + + assert!(matches!( + error, + ContractError::InvalidBindingOffset { offset: 1 } + )); + } + #[test] fn prepared_search_compressed_package_roundtrips_config_and_artifacts() { let config = package_test_config(); diff --git a/crates/anonymize-core/src/address_context.rs b/crates/anonymize-core/src/address_context.rs index b6e1e2ed..a35c1a65 100644 --- a/crates/anonymize-core/src/address_context.rs +++ b/crates/anonymize-core/src/address_context.rs @@ -48,7 +48,7 @@ impl PreparedAddressContextData { address_prepositions: lowercased_set(data.address_prepositions), temporal_prepositions: lowercased_set(data.temporal_prepositions), street_abbreviations: lowercased_set(data.street_abbreviations), - bare_house_stopwords: data.bare_house_stopwords.into_iter().collect(), + bare_house_stopwords: lowercased_set(data.bare_house_stopwords), slash_house_number: compile_regex( "address_context.slash_house_number", r"(?u)\b(?:\d{1,4}/\d+[A-Za-z]\b|\d{3,4}/\d+\b|(?:1[3-9]|[2-9]\d)/\d{3,}\b)", @@ -228,8 +228,13 @@ impl PreparedAddressContextData { continue; } - let word = captured.as_str().split_whitespace().next().unwrap_or(""); - if self.bare_house_stopwords.contains(word) { + let word = captured + .as_str() + .split_whitespace() + .next() + .unwrap_or("") + 
.to_lowercase(); + if self.bare_house_stopwords.contains(&word) { continue; } if overlaps_any(existing_entities, start, end) diff --git a/crates/anonymize-core/src/address_seeds.rs b/crates/anonymize-core/src/address_seeds.rs index 0da84a37..7651ab9f 100644 --- a/crates/anonymize-core/src/address_seeds.rs +++ b/crates/anonymize-core/src/address_seeds.rs @@ -19,8 +19,11 @@ const US_ZIP_CONTEXT_WINDOW: usize = 120; Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, )] pub struct AddressSeedData { + #[serde(default)] pub boundary_words: Vec, + #[serde(default)] pub br_cep_cue_words: Vec, + #[serde(default)] pub unit_abbreviations: Vec, } diff --git a/crates/anonymize-core/src/byte_offsets.rs b/crates/anonymize-core/src/byte_offsets.rs index 1e40b586..51c0af67 100644 --- a/crates/anonymize-core/src/byte_offsets.rs +++ b/crates/anonymize-core/src/byte_offsets.rs @@ -39,12 +39,7 @@ impl<'a> ByteOffsets<'a> { .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }) } - pub(crate) fn slice( - &self, - full_text: &str, - start: u32, - end: u32, - ) -> Result { + pub(crate) fn slice(&self, start: u32, end: u32) -> Result { if start > end { return Err(Error::InvalidSpan { start, end }); } @@ -53,7 +48,8 @@ impl<'a> ByteOffsets<'a> { let end_byte = self.validate_offset(end)?; Ok( - full_text + self + .text .get(start_byte..end_byte) .ok_or(Error::InvalidSpan { start, end })? 
.to_owned(), diff --git a/crates/anonymize-core/src/coreference.rs b/crates/anonymize-core/src/coreference.rs index cc7a2a7c..5209a27e 100644 --- a/crates/anonymize-core/src/coreference.rs +++ b/crates/anonymize-core/src/coreference.rs @@ -139,7 +139,7 @@ impl PreparedCoreferenceData { else { continue; }; - let gap = offsets.slice(full_text, source.end, definition_start)?; + let gap = offsets.slice(source.end, definition_start)?; if has_clause_boundary(&gap) { continue; } diff --git a/crates/anonymize-core/src/false_positives.rs b/crates/anonymize-core/src/false_positives.rs index ef0acaf4..02554372 100644 --- a/crates/anonymize-core/src/false_positives.rs +++ b/crates/anonymize-core/src/false_positives.rs @@ -37,9 +37,7 @@ pub(crate) fn filter_entity_false_positives( continue; } - let Some(normalized) = - normalize_entity(&entity, full_text, &offsets, filters)? - else { + let Some(normalized) = normalize_entity(&entity, &offsets, filters)? else { continue; }; if should_reject_entity(&normalized, full_text, &offsets, filters)? 
{ @@ -53,11 +51,10 @@ pub(crate) fn filter_entity_false_positives( fn normalize_entity( entity: &PipelineEntity, - full_text: &str, offsets: &ByteOffsets<'_>, filters: Option<&DenyListFilterData>, ) -> Result> { - let raw_text = offsets.slice(full_text, entity.start, entity.end)?; + let raw_text = offsets.slice(entity.start, entity.end)?; let mut start_byte = 0usize; let mut end_byte = raw_text.len(); @@ -637,8 +634,9 @@ fn find_ambiguous_component_occurrence( text: &str, term: &str, ) -> Option<(usize, usize)> { - text.match_indices(term).find_map(|(start, _)| { - let end = start.saturating_add(term.len()); + text.char_indices().find_map(|(start, _)| { + let match_len = case_insensitive_prefix_len(text.get(start..)?, term)?; + let end = start.saturating_add(match_len); let left_ok = text .get(..start) .and_then(|prefix| prefix.chars().next_back()) @@ -651,6 +649,18 @@ fn find_ambiguous_component_occurrence( }) } +fn case_insensitive_prefix_len(text: &str, prefix: &str) -> Option { + let mut consumed = 0usize; + for expected in prefix.chars() { + let actual = text.get(consumed..)?.chars().next()?; + if !actual.eq_ignore_ascii_case(&expected) { + return None; + } + consumed = consumed.saturating_add(actual.len_utf8()); + } + Some(consumed) +} + fn starts_with_capitalized_token_after_space(text: &str) -> bool { let leading = leading_whitespace_len(text); if leading == 0 { diff --git a/crates/anonymize-core/src/normalize.rs b/crates/anonymize-core/src/normalize.rs index 68321de3..1085c6c3 100644 --- a/crates/anonymize-core/src/normalize.rs +++ b/crates/anonymize-core/src/normalize.rs @@ -384,12 +384,14 @@ fn compact_ascii_identifier_from( predicate: &impl Fn(&str) -> bool, ) -> Option { let mut compact = String::new(); + let mut token = String::new(); let mut last_valid = None; let tail = text.get(start..)?; for ch in tail.chars() { if ch.is_ascii_alphanumeric() { compact.push(ch.to_ascii_uppercase()); + token.push(ch.to_ascii_uppercase()); continue; } @@ -397,18 
+399,27 @@ fn compact_ascii_identifier_from( if predicate(&compact) { last_valid = Some(compact.clone()); } + token.clear(); continue; } break; } + if allow_whitespace && token_is_trailing_prose(&token) && last_valid.is_some() + { + return last_valid; + } if predicate(&compact) { return Some(compact); } last_valid } +fn token_is_trailing_prose(token: &str) -> bool { + token.len() >= 3 && token.chars().all(|ch| ch.is_ascii_alphabetic()) +} + fn is_identifier_start(text: &str, index: usize, ch: char) -> bool { ch.is_ascii_alphanumeric() && text diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 25aafe6f..1832e5b6 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -1925,14 +1925,8 @@ fn validate_hotword_config(config: &PreparedSearchConfig) -> Result<()> { return Err(Error::UnsupportedStaticSlice { slice: "hotwords" }); } - if config.hotword_data.is_none() { - return Ok(()); - } - let Some(data) = &config.hotword_data else { - return Err(Error::MissingStaticData { - field: "hotword_data", - }); + return Ok(()); }; for rule in &data.rules { diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index c2b901b7..75110345 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -302,7 +302,7 @@ pub fn process_regex_matches( let Some(entry) = meta.get(local_index) else { continue; }; - let text = offsets.slice(full_text, found.start(), found.end())?; + let text = offsets.slice(found.start(), found.end())?; if let Some(validator_id) = &entry.validator_id { if !validate_id(validator_id, &text, entry.validator_input.as_deref()) { continue; @@ -484,7 +484,7 @@ fn collect_deny_list_matches( }; validate_deny_list_sources(sources)?; - let match_text = offsets.slice(full_text, found.start(), found.end())?; + let match_text = offsets.slice(found.start(), found.end())?; let keyword = 
match_text.to_lowercase(); let pattern = data.originals.get(local_index).map_or("", String::as_str); let custom_pattern_labels = data @@ -730,7 +730,7 @@ fn append_person_name_hits( let Some(prev) = chain.last() else { break; }; - let gap = offsets.slice(full_text, prev.end, next.start)?; + let gap = offsets.slice(prev.end, next.start)?; if person_chain_breaks(prev.text.as_str(), gap.as_str()) { break; } @@ -809,13 +809,13 @@ pub fn process_gazetteer_matches( let Some(label) = data.labels.get(local_index) else { continue; }; - let extended = try_gazetteer_prefix_extension(full_text, &offsets, found)?; + let extended = try_gazetteer_prefix_extension(&offsets, found)?; let (end, text, source_detail) = if let Some(extension) = extended { extension } else { ( found.end(), - offsets.slice(full_text, found.start(), found.end())?, + offsets.slice(found.start(), found.end())?, None, ) }; @@ -858,7 +858,7 @@ pub fn process_gazetteer_matches( found.start(), found.end(), label.clone(), - offsets.slice(full_text, found.start(), found.end())?, + offsets.slice(found.start(), found.end())?, GAZETTEER_FUZZY_SCORE, DetectionSource::Gazetteer, )); @@ -891,7 +891,7 @@ pub fn process_country_matches( found.start(), found.end(), label.clone(), - offsets.slice(full_text, found.start(), found.end())?, + offsets.slice(found.start(), found.end())?, COUNTRY_SCORE, DetectionSource::Country, )); @@ -1074,7 +1074,7 @@ fn has_adjacent_address_evidence( let window_start = offsets.floor_offset(start.saturating_sub(40))?; let window_end = offsets.floor_offset(end.saturating_add(40).min(full_len))?; - let window = offsets.slice(full_text, window_start, window_end)?; + let window = offsets.slice(window_start, window_end)?; Ok(has_address_format(&window) || has_street_type(&window, filters)) } @@ -1369,7 +1369,7 @@ fn extend_person_name( Ok(ExtendedName { end: new_end, - text: offsets.slice(full_text, start, new_end)?, + text: offsets.slice(start, new_end)?, }) } @@ -1605,24 +1605,23 @@ fn 
extend_city_districts( match_district_suffix(slice_from(full_text, offsets, entity.end)?) { entity.end = entity.end.saturating_add(byte_len(suffix)); - entity.text = offsets.slice(full_text, entity.start, entity.end)?; + entity.text = offsets.slice(entity.start, entity.end)?; } if let Some(suffix) = match_dash_district(slice_from(full_text, offsets, entity.end)?) { entity.end = entity.end.saturating_add(byte_len(suffix)); - entity.text = offsets.slice(full_text, entity.start, entity.end)?; + entity.text = offsets.slice(entity.start, entity.end)?; } let before = offsets.slice( - full_text, offsets.floor_offset(entity.start.saturating_sub(10))?, entity.start, )?; if let Some(prefix) = postal_prefix(&before) { entity.start = entity.start.saturating_sub(byte_len(prefix)); - entity.text = offsets.slice(full_text, entity.start, entity.end)?; + entity.text = offsets.slice(entity.start, entity.end)?; } if let Some(filters) = filters @@ -1632,7 +1631,7 @@ fn extend_city_districts( ) { entity.end = entity.end.saturating_add(byte_len(suffix)); - entity.text = offsets.slice(full_text, entity.start, entity.end)?; + entity.text = offsets.slice(entity.start, entity.end)?; } } @@ -1828,7 +1827,6 @@ fn consume_whitespace_no_newline( } fn try_gazetteer_prefix_extension( - full_text: &str, offsets: &ByteOffsets<'_>, found: &SearchMatch, ) -> Result)>> { @@ -1838,7 +1836,7 @@ fn try_gazetteer_prefix_extension( return Ok(None); } - let after = offsets.slice(full_text, found.end(), max_end)?; + let after = offsets.slice(found.end(), max_end)?; if !after.starts_with(' ') { return Ok(None); } @@ -1851,7 +1849,7 @@ fn try_gazetteer_prefix_extension( let new_end = found.end().saturating_add(suffix_end); Ok(Some(( new_end, - offsets.slice(full_text, found.start(), new_end)?, + offsets.slice(found.start(), new_end)?, Some(SourceDetail::GazetteerExtension), ))) } diff --git a/crates/anonymize-core/src/redact.rs b/crates/anonymize-core/src/redact.rs index 79ab16f6..f30fdfee 100644 --- 
a/crates/anonymize-core/src/redact.rs +++ b/crates/anonymize-core/src/redact.rs @@ -24,7 +24,7 @@ pub fn redact_text( validate_spans(entities, &offsets)?; let placeholder_map = build_placeholder_map(entities, full_text); - let mut sorted = redaction_spans(full_text, entities, &offsets)?; + let mut sorted = redaction_spans(entities, &offsets)?; sorted.sort_by_key(|span| span.entity.start); // Existing contract: first accepted span wins overlaps. @@ -45,7 +45,7 @@ pub fn redact_text( for span in &non_overlapping { let entity = &span.entity; if entity.start > cursor { - parts.push(offsets.slice(full_text, cursor, entity.start)?); + parts.push(offsets.slice(cursor, entity.start)?); } let placeholder = placeholder_map @@ -74,7 +74,7 @@ pub fn redact_text( let full_text_len = offsets.len()?; if cursor < full_text_len { - parts.push(offsets.slice(full_text, cursor, full_text_len)?); + parts.push(offsets.slice(cursor, full_text_len)?); } Ok(RedactionResult { @@ -125,7 +125,6 @@ struct RedactionSpan { } fn redaction_spans( - full_text: &str, entities: &[Entity], offsets: &ByteOffsets<'_>, ) -> Result> { @@ -134,7 +133,7 @@ fn redaction_spans( for entity in entities { resolved.push(RedactionSpan { entity: entity.clone(), - source_text: offsets.slice(full_text, entity.start, entity.end)?, + source_text: offsets.slice(entity.start, entity.end)?, }); } diff --git a/crates/anonymize-core/src/resolution/boundary.rs b/crates/anonymize-core/src/resolution/boundary.rs index 53f3b396..0bff96a6 100644 --- a/crates/anonymize-core/src/resolution/boundary.rs +++ b/crates/anonymize-core/src/resolution/boundary.rs @@ -13,11 +13,10 @@ pub fn enforce_boundary_consistency( let offsets = ByteOffsets::new(full_text); let spans = char_spans(full_text); let boundaries = word_boundaries(&spans); - let fixed = - fix_partial_words(entities, full_text, &offsets, &spans, &boundaries)?; - let resolved = resolve_cross_label_overlaps(&fixed, full_text, &offsets)?; + let fixed = 
fix_partial_words(entities, &offsets, &spans, &boundaries)?; + let resolved = resolve_cross_label_overlaps(&fixed, &offsets)?; let deduped = deduplicate_spans(&resolved); - let merged = merge_adjacent(&deduped, full_text, &offsets)?; + let merged = merge_adjacent(&deduped, &offsets)?; Ok(remove_nested_same_label(&merged)) } @@ -30,7 +29,6 @@ struct CharSpan { fn fix_partial_words( entities: &[PipelineEntity], - full_text: &str, offsets: &ByteOffsets<'_>, spans: &[CharSpan], boundaries: &BTreeSet, @@ -45,7 +43,7 @@ fn fix_partial_words( continue; } - if entity.text != offsets.slice(full_text, entity.start, entity.end)? { + if entity.text != offsets.slice(entity.start, entity.end)? { fixed.push(entity.clone()); continue; } @@ -73,7 +71,7 @@ fn fix_partial_words( let mut adjusted = entity.clone(); adjusted.start = new_start; adjusted.end = new_end; - adjusted.text = offsets.slice(full_text, new_start, new_end)?; + adjusted.text = offsets.slice(new_start, new_end)?; fixed.push(adjusted); } @@ -82,7 +80,6 @@ fn fix_partial_words( fn resolve_cross_label_overlaps( entities: &[PipelineEntity], - full_text: &str, offsets: &ByteOffsets<'_>, ) -> Result> { let mut sorted = entities.to_vec(); @@ -127,8 +124,7 @@ fn resolve_cross_label_overlaps( let new_start = left.end; if let Some(right_mut) = sorted.get_mut(right_index) { right_mut.start = new_start; - right_mut.text = - offsets.slice(full_text, new_start, right_mut.end)?; + right_mut.text = offsets.slice(new_start, right_mut.end)?; } right_index = right_index.saturating_add(1); continue; @@ -137,7 +133,7 @@ fn resolve_cross_label_overlaps( let new_end = right.start; if let Some(left_mut) = sorted.get_mut(left_index) { left_mut.end = new_end; - left_mut.text = offsets.slice(full_text, left_mut.start, new_end)?; + left_mut.text = offsets.slice(left_mut.start, new_end)?; } break; } @@ -171,7 +167,6 @@ fn deduplicate_spans(entities: &[PipelineEntity]) -> Vec { fn merge_adjacent( entities: &[PipelineEntity], - full_text: &str, 
offsets: &ByteOffsets<'_>, ) -> Result> { let mut sorted = entities.to_vec(); @@ -200,17 +195,11 @@ fn merge_adjacent( }; if !has_locked_boundary(previous) && entity.start < previous.end { - merge_into_previous( - &mut result, - previous_index, - entity, - full_text, - offsets, - )?; + merge_into_previous(&mut result, previous_index, entity, offsets)?; continue; } - let gap = offsets.slice(full_text, previous.end, entity.start)?; + let gap = offsets.slice(previous.end, entity.start)?; let gap_start = previous.end; let gap_end = entity.start; let gap_occupied = sorted.iter().any(|other| { @@ -228,13 +217,7 @@ fn merge_adjacent( && !gap_occupied && is_mergeable_gap(&gap) { - merge_into_previous( - &mut result, - previous_index, - entity, - full_text, - offsets, - )?; + merge_into_previous(&mut result, previous_index, entity, offsets)?; continue; } @@ -365,8 +348,11 @@ fn word_start_at( ) -> u32 { let mut cursor = position; while cursor > 0 && !boundaries.contains(&cursor) { - let Some(previous) = spans.iter().rev().find(|span| span.end <= cursor) - else { + let index = spans.partition_point(|span| span.end <= cursor); + if index == 0 { + return cursor; + } + let Some(previous) = spans.get(index.saturating_sub(1)) else { return cursor; }; if is_word_start_stop(previous.ch) { @@ -385,7 +371,8 @@ fn word_end_at( let mut cursor = position; let text_end = spans.last().map_or(0, |span| span.end); while cursor < text_end && !boundaries.contains(&cursor) { - let Some(next) = spans.iter().find(|span| span.start >= cursor) else { + let index = spans.partition_point(|span| span.start < cursor); + let Some(next) = spans.get(index) else { return cursor; }; if is_word_end_stop(next.ch) { @@ -400,12 +387,11 @@ fn merge_into_previous( entities: &mut [PipelineEntity], previous_index: usize, entity: &PipelineEntity, - full_text: &str, offsets: &ByteOffsets<'_>, ) -> Result<()> { if let Some(previous) = entities.get_mut(previous_index) { previous.end = previous.end.max(entity.end); - 
previous.text = offsets.slice(full_text, previous.start, previous.end)?; + previous.text = offsets.slice(previous.start, previous.end)?; if entity.score.total_cmp(&previous.score).is_gt() { previous.score = entity.score; } diff --git a/crates/anonymize-core/src/resolution/sanitize.rs b/crates/anonymize-core/src/resolution/sanitize.rs index df41c1f1..0d4cf379 100644 --- a/crates/anonymize-core/src/resolution/sanitize.rs +++ b/crates/anonymize-core/src/resolution/sanitize.rs @@ -41,7 +41,7 @@ pub(crate) fn sanitize_entities_with_source( continue; } - let raw_text = offsets.slice(full_text, entity.start, entity.end)?; + let raw_text = offsets.slice(entity.start, entity.end)?; let Some(cleaned) = clean_entity_text(entity, &raw_text) else { continue; }; diff --git a/crates/anonymize-core/src/resolution/types.rs b/crates/anonymize-core/src/resolution/types.rs index bd838074..3b1b5b53 100644 --- a/crates/anonymize-core/src/resolution/types.rs +++ b/crates/anonymize-core/src/resolution/types.rs @@ -34,6 +34,7 @@ pub enum SourceDetail { AddressContext, } +/// Internal pipeline entity span. `start` and `end` are UTF-8 byte offsets. 
#[derive(Clone, Debug, PartialEq)] pub struct PipelineEntity { pub start: u32, diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs index 820ffaea..94b269d7 100644 --- a/crates/anonymize-core/src/search.rs +++ b/crates/anonymize-core/src/search.rs @@ -126,7 +126,7 @@ impl SearchIndexArtifacts { "search_index_artifacts", )?; let count = reader.read_usize()?; - let mut slots = Vec::with_capacity(count); + let mut slots = Vec::new(); for _ in 0..count { slots.push( text_search::PreparedTextSearchArtifacts::from_bytes( diff --git a/crates/anonymize-core/src/triggers.rs b/crates/anonymize-core/src/triggers.rs index 4d70fc9e..88588cf5 100644 --- a/crates/anonymize-core/src/triggers.rs +++ b/crates/anonymize-core/src/triggers.rs @@ -296,7 +296,7 @@ pub(crate) fn process_trigger_matches( value.start }; let mut entity_end = value.end; - let mut entity_text = offsets.slice(full_text, entity_start, entity_end)?; + let mut entity_text = offsets.slice(entity_start, entity_end)?; let mut label = if rule.label == "person" && has_known_legal_form_suffix(&entity_text, &data.legal_form_suffixes) { @@ -311,7 +311,7 @@ pub(crate) fn process_trigger_matches( && let Some(head) = value.text.get(..end) { entity_end = value.start.saturating_add(u32_len(head)); - entity_text = offsets.slice(full_text, entity_start, entity_end)?; + entity_text = offsets.slice(entity_start, entity_end)?; } if label.is_empty() { diff --git a/crates/anonymize-core/tests/false_positive_parity.rs b/crates/anonymize-core/tests/false_positive_parity.rs index 5303c78e..83e621f8 100644 --- a/crates/anonymize-core/tests/false_positive_parity.rs +++ b/crates/anonymize-core/tests/false_positive_parity.rs @@ -221,6 +221,7 @@ fn rejects_only_ambiguous_street_type_trigger_addresses() { assert!( resolved_texts(&prepared, "demeurant au cours du contrat.").is_empty() ); + assert!(resolved_texts(&prepared, "demeurant Cours.").is_empty()); assert_eq!( resolved_texts(&prepared, "demeurant Cours 
Mirabeau."), [String::from("Cours Mirabeau")] diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 0232fe9b..a3b3ed0f 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -118,7 +118,7 @@ fn address_context_data() -> AddressContextData { address_prepositions: vec![String::from("na"), String::from("mezi")], temporal_prepositions: vec![String::from("od"), String::from("do")], street_abbreviations: vec![String::from("ul.")], - bare_house_stopwords: vec![String::from("Section")], + bare_house_stopwords: vec![String::from("section")], } } @@ -389,6 +389,37 @@ fn prepared_search_measures_bare_house_context_in_text_offsets() { ); } +#[test] +fn prepared_search_filters_capitalized_bare_house_stopwords() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bPraha 10\b"))], + regex_meta: vec![RegexMatchMeta::new("address", 1.0)], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Praha 10 Section 183 follows.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + !result + .resolved_entities + .iter() + .any(|entity| entity.text == "Section 183") + ); +} + #[test] fn prepared_search_measures_slash_address_context_in_text_offsets() { let prepared = PreparedSearch::new(PreparedSearchConfig { diff --git a/crates/anonymize-core/tests/redaction.rs b/crates/anonymize-core/tests/redaction.rs index 8ec5688d..bc71d38e 100644 --- a/crates/anonymize-core/tests/redaction.rs +++ b/crates/anonymize-core/tests/redaction.rs @@ -124,6 +124,42 @@ fn contextual_identifier_cues_share_identifier_placeholder() { ); } 
+#[test] +fn identifier_normalization_stops_before_trailing_prose() { + let text = "Reg AB12345 expires. Reg AB12345 repeats."; + let second_start = text + .rfind("AB12345") + .expect("fixture should contain repeated identifier"); + let second_start = byte_len( + text + .get(..second_start) + .expect("fixture boundary should be valid"), + ); + let entities = vec![ + entity_with_display_text( + text, + "registration number", + "AB12345 expires", + "AB12345 expires", + ), + Entity::detected( + second_start, + second_start.saturating_add(byte_len("AB12345")), + "registration number", + "AB12345", + ), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 1); + assert_eq!( + result.redaction_map[0].placeholder, + "[REGISTRATION_NUMBER_1]" + ); +} + #[test] fn spaced_identifier_values_still_share_placeholder() { let text = From aa4e1fcb2f153faedddc7d1396359fc4c17c9345 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 12:37:57 +0200 Subject: [PATCH 086/130] fix: restore native build outputs --- .github/tools/check-packlist.mjs | 1 + turbo.json | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/tools/check-packlist.mjs b/.github/tools/check-packlist.mjs index a34f2c6c..2b710655 100644 --- a/.github/tools/check-packlist.mjs +++ b/.github/tools/check-packlist.mjs @@ -13,6 +13,7 @@ const PACKAGES = [ "dist/native-node.mjs", "index.cjs", "stella_anonymize_napi.node", + "native-pipeline.stlanonpkg", // Dynamically imported corpus chunk; missing means the // bundler stopped resolving the non-Western name imports. 
"dist/names-nw-in.mjs", diff --git a/turbo.json b/turbo.json index 5582dd51..880f04d0 100644 --- a/turbo.json +++ b/turbo.json @@ -3,7 +3,12 @@ "globalDependencies": [".oxfmtrc.json"], "tasks": { "build": { - "outputs": ["dist/**", "wasm/dist/**"] + "outputs": [ + "dist/**", + "wasm/dist/**", + "stella_anonymize_napi.node", + "native-pipeline.stlanonpkg" + ] }, "typecheck": { "dependsOn": ["^build"], From 15c3bc5d9e173184d93cf8e9ea5e6228bc575617 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 12:55:13 +0200 Subject: [PATCH 087/130] fix: guard overlapping deny-list names --- crates/anonymize-core/src/processors.rs | 3 ++ crates/anonymize-core/tests/processors.rs | 40 +++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index 75110345..213640b1 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -730,6 +730,9 @@ fn append_person_name_hits( let Some(prev) = chain.last() else { break; }; + if next.start < prev.end { + break; + } let gap = offsets.slice(prev.end, next.start)?; if person_chain_breaks(prev.text.as_str(), gap.as_str()) { break; diff --git a/crates/anonymize-core/tests/processors.rs b/crates/anonymize-core/tests/processors.rs index c39081fb..cac14c80 100644 --- a/crates/anonymize-core/tests/processors.rs +++ b/crates/anonymize-core/tests/processors.rs @@ -355,6 +355,46 @@ fn deny_list_processor_suppresses_shorter_contained_curated_matches() { assert_eq!(entities[0].text, "Nemocnice Blansko"); } +#[test] +fn deny_list_processor_handles_overlapping_person_name_hits() { + let text = "John Smith Jr arrived."; + let matches = vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 10, + }, + SearchMatch::Literal { + pattern: 1, + start: 5, + end: 13, + }, + ]; + let data = DenyListMatchData { + labels: vec![vec![String::from("person")], vec![String::from("person")]] + .into(), + 
custom_labels: vec![vec![], vec![]].into(), + originals: vec![String::from("John Smith"), String::from("Smith Jr")], + sources: vec![ + vec![String::from("first-name")], + vec![String::from("surname")], + ] + .into(), + filters: Some(DenyListFilterData::default()), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 2 }, + text, + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "John Smith Jr"); +} + #[test] fn deny_list_processor_suppresses_signing_place_address() { let text = "Podepsano V Brně dne 1. ledna 2026."; From c6a9c007409e90f969aecb249000d214da175045 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 12:55:44 +0200 Subject: [PATCH 088/130] test: add rust primitive properties --- Cargo.lock | 198 +++++++++++++ crates/anonymize-core/Cargo.toml | 3 + .../tests/primitives_properties.rs | 263 ++++++++++++++++++ 3 files changed, 464 insertions(+) create mode 100644 crates/anonymize-core/tests/primitives_properties.rs diff --git a/Cargo.lock b/Cargo.lock index 30eeef42..7c082a55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,6 +23,12 @@ version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f02882884d3e1bc524fb12c79f107f6ad0e1cfd498c536ffb494301740995dfe" +[[package]] +name = "autocfg" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + [[package]] name = "bincode" version = "2.0.1" @@ -167,6 +173,16 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys", +] + [[package]] name = "fancy-regex" version = "0.18.0" @@ -178,12 +194,24 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = 
"fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + [[package]] name = "find-msvc-tools" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "futures" version = "0.3.32" @@ -341,6 +369,12 @@ dependencies = [ "windows-link", ] +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + [[package]] name = "memchr" version = "2.8.2" @@ -412,6 +446,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -436,6 +479,15 @@ version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -445,6 +497,25 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proptest" +version = "1.11.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + [[package]] name = "pyo3" version = "0.29.0" @@ -502,6 +573,12 @@ dependencies = [ "syn", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.46" @@ -517,6 +594,44 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core", +] + [[package]] name = "regex" version = "1.12.4" @@ -552,6 +667,31 @@ version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "semver" version = "1.0.28" @@ -667,6 +807,7 @@ name = "stella-anonymize-core" version = "1.5.0" dependencies = [ "fancy-regex", + "proptest", "regex", "serde", "stella-stdnum-core", @@ -755,6 +896,19 @@ version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + [[package]] name = "tinyvec" version = "1.11.0" @@ -776,6 +930,12 @@ version = "1.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-case-mapping" version = "1.0.0" @@ -815,6 +975,15 @@ version = "0.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" +[[package]] 
+name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + [[package]] name = "wasip2" version = "1.0.4+wasi-0.2.12" @@ -830,12 +999,41 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" +[[package]] +name = "zerocopy" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zmij" version = "1.0.21" diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index dd44fc81..9b910fea 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -16,5 +16,8 @@ serde = { version = "1", features = ["derive"] } stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "2f3c3f107e3976ac059cc438d77916a592595d59" } stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "8a42c28a8e7c5a32c838ae9dd443c21deab391ed" } 
+[dev-dependencies] +proptest = "1" + [lints] workspace = true diff --git a/crates/anonymize-core/tests/primitives_properties.rs b/crates/anonymize-core/tests/primitives_properties.rs new file mode 100644 index 00000000..e4edf518 --- /dev/null +++ b/crates/anonymize-core/tests/primitives_properties.rs @@ -0,0 +1,263 @@ +#![allow( + clippy::arithmetic_side_effects, + clippy::expect_used, + clippy::indexing_slicing, + clippy::panic, + clippy::unwrap_used +)] + +use proptest::prelude::{ProptestConfig, Strategy, any}; +use proptest::{collection, prop_assert, prop_assert_eq, proptest, sample}; +use stella_anonymize_core::{ + DetectionSource, Entity, Error, LiteralSearchOptions, OperatorConfig, + PipelineEntity, SearchIndex, SearchMatch, SearchOptions, SearchPattern, + deanonymise, merge_and_dedup, redact_text, sanitize_entities, +}; + +const PROPERTY_CASES: u32 = 128; + +fn byte_len(text: &str) -> u32 { + u32::try_from(text.len()).unwrap_or(u32::MAX) +} + +fn text_char() -> impl Strategy { + sample::select(vec![ + 'a', 'b', 'Z', '0', '9', ' ', '-', '.', ',', ':', '\u{00a0}', 'á', 'ř', + '界', '🦀', '\u{0301}', + ]) +} + +fn search_char() -> impl Strategy { + sample::select(vec!['a', 'b', 'Z', '0', '9', 'á', 'ř', '界', '🦀']) +} + +fn text_fragment(max_len: usize) -> impl Strategy { + collection::vec(text_char(), 0..max_len) + .prop_map(|chars| chars.into_iter().collect()) +} + +fn entity_text() -> impl Strategy { + collection::vec(search_char(), 1..8) + .prop_map(|chars| chars.into_iter().collect()) +} + +fn trim_text() -> impl Strategy { + collection::vec( + sample::select(vec![ + ' ', '\t', '\n', ',', ';', ':', '"', '\'', '“', '”', '‘', '’', '«', '»', + '!', '?', + ]), + 0..6, + ) + .prop_map(|chars| chars.into_iter().collect()) +} + +fn source_strategy() -> impl Strategy { + sample::select(vec![ + DetectionSource::Trigger, + DetectionSource::Regex, + DetectionSource::DenyList, + DetectionSource::LegalForm, + DetectionSource::Gazetteer, + DetectionSource::Country, + 
DetectionSource::Ner, + DetectionSource::Coreference, + ]) +} + +fn label_strategy() -> impl Strategy { + sample::select(vec![ + "person", + "organization", + "address", + "date", + "registration number", + ]) +} + +fn redaction_case() -> impl Strategy)> { + collection::vec((text_fragment(8), entity_text()), 1..8).prop_map( + |segments| { + let mut text = String::new(); + let mut entities = Vec::new(); + + for (index, (prefix, value)) in segments.into_iter().enumerate() { + text.push_str(&prefix); + let start = byte_len(&text); + text.push_str(&value); + let end = byte_len(&text); + entities.push(Entity::detected( + start, + end, + format!("generated label {index}"), + value, + )); + } + text.push_str(" tail"); + + (text, entities) + }, + ) +} + +fn pipeline_entity_strategy() -> impl Strategy { + ( + 0_u32..80, + 1_u32..24, + label_strategy(), + source_strategy(), + 0.0_f64..1.0, + ) + .prop_map(|(start, len, label, source, score)| { + let end = start.saturating_add(len); + PipelineEntity::detected( + start, + end, + label, + "x".repeat(usize::try_from(len).unwrap_or(0)), + score, + source, + ) + }) +} + +proptest! 
{ + #![proptest_config(ProptestConfig { + cases: PROPERTY_CASES, + ..ProptestConfig::default() + })] + + #[test] + fn generated_redactions_round_trip_on_utf8_boundaries( + (text, entities) in redaction_case(), + ) { + let result = redact_text(&text, &entities, &OperatorConfig::default()) + .unwrap(); + + prop_assert_eq!(result.entity_count, entities.len()); + let restored = deanonymise(&result.redacted_text, &result.redaction_map); + prop_assert_eq!(restored.as_str(), text.as_str()); + for entry in &result.redaction_map { + prop_assert!(!text.contains(&entry.placeholder)); + } + } + + #[test] + fn invalid_interior_utf8_offsets_are_rejected( + ch in any::().prop_filter( + "multi-byte scalar", + |candidate| candidate.len_utf8() > 1, + ), + ) { + let text = format!("a{ch}z"); + let end = 1_u32.saturating_add( + u32::try_from(ch.len_utf8()).unwrap_or(u32::MAX), + ); + let entities = vec![Entity::detected(2, end, "person", ch.to_string())]; + + let error = redact_text(&text, &entities, &OperatorConfig::default()) + .unwrap_err(); + + prop_assert_eq!(error, Error::ByteOffsetInsideCodepoint { offset: 2 }); + } + + #[test] + fn merge_and_dedup_never_leaves_partial_overlaps( + entities in collection::vec(pipeline_entity_strategy(), 0..32), + ) { + let result = merge_and_dedup(&entities); + + for entity in &result { + prop_assert!(entity.start < entity.end); + } + + for pair in result.windows(2) { + let left = &pair[0]; + let right = &pair[1]; + prop_assert!(left.start <= right.start); + } + + for (index, left) in result.iter().enumerate() { + for right in result.iter().skip(index.saturating_add(1)) { + let overlaps = left.end > right.start && left.start < right.end; + let same_span = left.start == right.start && left.end == right.end; + prop_assert!( + !overlaps || same_span, + "partial overlap survived: {left:?} / {right:?}", + ); + } + } + } + + #[test] + fn sanitize_entities_keeps_trimmed_spans_inside_original_span( + leading in trim_text(), + core in entity_text(), + 
trailing in trim_text(), + label in label_strategy(), + base_start in 0_u32..20, + ) { + let raw = format!("{leading}{core}{trailing}"); + let original = PipelineEntity::detected( + base_start, + base_start.saturating_add(byte_len(&raw)), + label, + raw, + 0.5, + DetectionSource::Ner, + ); + + let result = sanitize_entities(std::slice::from_ref(&original)); + + for entity in &result { + prop_assert!(entity.start >= original.start); + prop_assert!(entity.end <= original.end); + prop_assert!(entity.start < entity.end); + prop_assert!(entity.text.chars().any(char::is_alphanumeric)); + prop_assert_eq!(entity.text.trim(), entity.text.as_str()); + } + } + + #[test] + fn literal_search_matches_are_valid_utf8_slices( + prefix in text_fragment(12), + needle in entity_text(), + suffix in text_fragment(12), + ) { + let haystack = format!("{prefix}{needle}{suffix}"); + let expected_start = byte_len(&prefix); + let expected_end = expected_start.saturating_add(byte_len(&needle)); + let index = SearchIndex::new( + vec![SearchPattern::Literal(needle.clone())], + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }, + ) + .unwrap(); + + let matches = index.find_iter(&haystack).unwrap(); + + let includes_expected = matches.iter().any(|found| matches!( + found, + SearchMatch::Literal { pattern: 0, start, end } + if *start == expected_start && *end == expected_end + )); + prop_assert!(includes_expected); + for found in matches { + let SearchMatch::Literal { start, end, .. 
} = found else { + continue; + }; + let start = usize::try_from(start).unwrap(); + let end = usize::try_from(end).unwrap(); + let Some(slice) = haystack.get(start..end) else { + prop_assert!(false, "literal match was not a valid UTF-8 slice"); + continue; + }; + prop_assert_eq!(slice, needle.as_str()); + } + } +} From 9afba9328b7f4e6193a0bb7f78a1e01c9b6e7a8c Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 13:12:52 +0200 Subject: [PATCH 089/130] test: strengthen rust primitive properties --- .../tests/primitives_properties.rs | 288 +++++++++++++++++- 1 file changed, 284 insertions(+), 4 deletions(-) diff --git a/crates/anonymize-core/tests/primitives_properties.rs b/crates/anonymize-core/tests/primitives_properties.rs index e4edf518..a6285c1d 100644 --- a/crates/anonymize-core/tests/primitives_properties.rs +++ b/crates/anonymize-core/tests/primitives_properties.rs @@ -6,12 +6,15 @@ clippy::unwrap_used )] -use proptest::prelude::{ProptestConfig, Strategy, any}; -use proptest::{collection, prop_assert, prop_assert_eq, proptest, sample}; +use proptest::prelude::{Just, ProptestConfig, Strategy, any}; +use proptest::{ + collection, prop_assert, prop_assert_eq, prop_assume, proptest, sample, +}; use stella_anonymize_core::{ DetectionSource, Entity, Error, LiteralSearchOptions, OperatorConfig, - PipelineEntity, SearchIndex, SearchMatch, SearchOptions, SearchPattern, - deanonymise, merge_and_dedup, redact_text, sanitize_entities, + PipelineEntity, RegexSearchOptions, SearchIndex, SearchIndexArtifacts, + SearchMatch, SearchOptions, SearchPattern, deanonymise, merge_and_dedup, + redact_text, sanitize_entities, }; const PROPERTY_CASES: u32 = 128; @@ -41,6 +44,11 @@ fn entity_text() -> impl Strategy { .prop_map(|chars| chars.into_iter().collect()) } +fn fuzzy_text() -> impl Strategy { + collection::vec(search_char(), 2..8) + .prop_map(|chars| chars.into_iter().collect()) +} + fn trim_text() -> impl Strategy { collection::vec( sample::select(vec![ @@ -121,6 
+129,167 @@ fn pipeline_entity_strategy() -> impl Strategy<Value = PipelineEntity> { }) } +fn literal_search_case() +-> impl Strategy<Value = (Vec<SearchPattern>, SearchOptions, String)> { + collection::vec(entity_text(), 1..6) + .prop_flat_map(|needles| { + let patterns = needles + .iter() + .map(|needle| SearchPattern::LiteralWithOptions { + pattern: needle.clone(), + case_insensitive: Some(false), + whole_words: Some(false), + }) + .collect::<Vec<_>>(); + ( + Just(patterns), + collection::vec((text_fragment(8), sample::select(needles)), 1..10), + text_fragment(8), + ) + }) + .prop_map(|(patterns, segments, suffix)| { + let mut haystack = String::new(); + for (prefix, needle) in segments { + haystack.push_str(&prefix); + haystack.push_str(&needle); + } + haystack.push_str(&suffix); + + ( + patterns, + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }, + haystack, + ) + }) +} + +fn mixed_search_case() +-> impl Strategy<Value = (Vec<SearchPattern>, SearchOptions, String)> { + ( + entity_text(), + entity_text(), + fuzzy_text(), + text_fragment(8), + text_fragment(8), + text_fragment(8), + ) + .prop_map(|(literal, regex_literal, fuzzy, prefix, middle, suffix)| { + let patterns = vec![ + SearchPattern::LiteralWithOptions { + pattern: literal.clone(), + case_insensitive: Some(false), + whole_words: Some(false), + }, + SearchPattern::Regex(regex::escape(&regex_literal)), + SearchPattern::Fuzzy { + pattern: fuzzy.clone(), + distance: Some(1), + }, + ]; + let haystack = + format!("{prefix}{literal} {middle}{regex_literal} {fuzzy}{suffix}"); + + ( + patterns, + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + regex: RegexSearchOptions { + whole_words: false, + overlap_all: true, + }, + ..SearchOptions::default() + }, + haystack, + ) + }) +} + +#[derive(Clone, Copy, Debug)] +enum ArtifactCorruption { + Header, + Version, + TrailingData, + Truncated, +} + +fn artifact_corruption() -> impl Strategy<Value = ArtifactCorruption> { + sample::select(vec![
+ ArtifactCorruption::Header, + ArtifactCorruption::Version, + ArtifactCorruption::TrailingData, + ArtifactCorruption::Truncated, + ]) +} + +fn corrupt_artifact( + mut bytes: Vec, + corruption: ArtifactCorruption, +) -> Vec { + match corruption { + ArtifactCorruption::Header => { + let first = bytes.first_mut().expect("artifact header byte"); + *first ^= 0xff; + } + ArtifactCorruption::Version => { + let version_byte = bytes.get_mut(8).expect("artifact version byte"); + *version_byte ^= 0xff; + } + ArtifactCorruption::TrailingData => bytes.push(0), + ArtifactCorruption::Truncated => { + bytes.pop(); + } + } + bytes +} + +fn search_output_is_valid( + haystack: &str, + pattern_count: usize, + matches: &[SearchMatch], +) -> bool { + let mut previous: Option<(u32, u32, u32)> = None; + + for found in matches { + if found.start() >= found.end() { + return false; + } + + let Ok(pattern) = usize::try_from(found.pattern()) else { + return false; + }; + if pattern >= pattern_count { + return false; + } + + let Ok(start) = usize::try_from(found.start()) else { + return false; + }; + let Ok(end) = usize::try_from(found.end()) else { + return false; + }; + if haystack.get(start..end).is_none() { + return false; + } + + let current = (found.start(), found.end(), found.pattern()); + if previous.is_some_and(|last| last > current) { + return false; + } + previous = Some(current); + } + + true +} + proptest! { #![proptest_config(ProptestConfig { cases: PROPERTY_CASES, @@ -142,6 +311,29 @@ proptest! 
{ } } + #[test] + fn generated_entity_spans_fail_or_round_trip( + text in text_fragment(32), + spans in collection::vec((0_u32..80, 0_u32..80, label_strategy()), 0..16), + ) { + let entities = spans + .into_iter() + .map(|(start, end, label)| { + Entity::detected(start, end, label, String::from("generated")) + }) + .collect::>(); + + let result = redact_text(&text, &entities, &OperatorConfig::default()); + if let Ok(redacted) = result { + let restored = + deanonymise(&redacted.redacted_text, &redacted.redaction_map); + prop_assert_eq!(restored.as_str(), text.as_str()); + for entry in &redacted.redaction_map { + prop_assert!(!text.contains(&entry.placeholder)); + } + } + } + #[test] fn invalid_interior_utf8_offsets_are_rejected( ch in any::().prop_filter( @@ -166,6 +358,8 @@ proptest! { entities in collection::vec(pipeline_entity_strategy(), 0..32), ) { let result = merge_and_dedup(&entities); + let second_pass = merge_and_dedup(&result); + prop_assert_eq!(&second_pass, &result); for entity in &result { prop_assert!(entity.start < entity.end); @@ -213,6 +407,7 @@ proptest! { prop_assert!(entity.start >= original.start); prop_assert!(entity.end <= original.end); prop_assert!(entity.start < entity.end); + prop_assert!(byte_len(&entity.text) <= entity.end.saturating_sub(entity.start)); prop_assert!(entity.text.chars().any(char::is_alphanumeric)); prop_assert_eq!(entity.text.trim(), entity.text.as_str()); } @@ -260,4 +455,89 @@ proptest! 
{ prop_assert_eq!(slice, needle.as_str()); } } + + #[test] + fn prepared_literal_search_artifacts_match_direct_search( + (patterns, options, haystack) in literal_search_case(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + prop_assume!(!artifacts.slots.is_empty()); + let encoded = artifacts.to_bytes().unwrap(); + let decoded = SearchIndexArtifacts::from_bytes(&encoded).unwrap(); + prop_assert_eq!(&decoded, &artifacts); + + let direct = SearchIndex::new(patterns.clone(), options).unwrap(); + let prepared = + SearchIndex::new_with_artifacts(patterns.clone(), options, &decoded) + .unwrap(); + let direct_matches = direct.find_iter(&haystack).unwrap(); + let prepared_matches = prepared.find_iter(&haystack).unwrap(); + + prop_assert_eq!(&prepared_matches, &direct_matches); + prop_assert!(search_output_is_valid( + &haystack, + patterns.len(), + &prepared_matches, + )); + } + + #[test] + fn prepared_mixed_search_artifacts_match_direct_search( + (patterns, options, haystack) in mixed_search_case(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + let encoded = artifacts.to_bytes().unwrap(); + let decoded = SearchIndexArtifacts::from_bytes(&encoded).unwrap(); + + let direct = SearchIndex::new(patterns.clone(), options).unwrap(); + let prepared = + SearchIndex::new_with_artifacts(patterns.clone(), options, &decoded) + .unwrap(); + let direct_matches = direct.find_iter(&haystack).unwrap(); + let prepared_matches = prepared.find_iter(&haystack).unwrap(); + + prop_assert_eq!(&prepared_matches, &direct_matches); + prop_assert!(search_output_is_valid( + &haystack, + patterns.len(), + &prepared_matches, + )); + } + + #[test] + fn malformed_search_artifacts_fail_closed( + (patterns, options, _haystack) in literal_search_case(), + corruption in artifact_corruption(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns, options).unwrap(); + let encoded = 
artifacts.to_bytes().unwrap(); + let corrupted = corrupt_artifact(encoded, corruption); + + prop_assert!(SearchIndexArtifacts::from_bytes(&corrupted).is_err()); + } + + #[test] + fn search_artifacts_reject_missing_and_extra_slots( + (patterns, options, _haystack) in literal_search_case(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + prop_assume!(!artifacts.slots.is_empty()); + + let missing = SearchIndexArtifacts::default(); + prop_assert!( + SearchIndex::new_with_artifacts(patterns.clone(), options, &missing) + .is_err() + ); + + let mut extra = artifacts; + let first = extra.slots.first().expect("prepared slot").clone(); + extra.slots.push(first); + prop_assert!( + SearchIndex::new_with_artifacts(patterns, options, &extra).is_err() + ); + } } From 4b813045915c5fbde08e9487b76d1453b666a3e3 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 13:23:11 +0200 Subject: [PATCH 090/130] test: reject stale prepared search artifacts --- Cargo.lock | 4 +-- crates/anonymize-core/Cargo.toml | 2 +- crates/anonymize-core/tests/search.rs | 50 +++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7c082a55..45ca6d91 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -784,7 +784,7 @@ checksum = "3a0219bd7d979d58245a4f41f695e1ac9f8befdffadd7f61f1bae9e39abc6620" [[package]] name = "stella-aho-corasick-core" version = "1.0.4" -source = "git+https://github.com/stella/aho-corasick?rev=28226295ca5df514cd915e7c26af6fd605348b81#28226295ca5df514cd915e7c26af6fd605348b81" +source = "git+https://github.com/stella/aho-corasick?rev=38bdcbf11bfbe389c8f2b7b40eb03ac50371e1e1#38bdcbf11bfbe389c8f2b7b40eb03ac50371e1e1" dependencies = [ "daachorse", "unicode-case-mapping", @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "stella-text-search-core" version = "1.0.6" -source = 
"git+https://github.com/stella/text-search?rev=8a42c28a8e7c5a32c838ae9dd443c21deab391ed#8a42c28a8e7c5a32c838ae9dd443c21deab391ed" +source = "git+https://github.com/stella/text-search?rev=0cfaad48a3df24f918cf52a2d5aaf32f5a031148#0cfaad48a3df24f918cf52a2d5aaf32f5a031148" dependencies = [ "stella-aho-corasick-core", "stella-fuzzy-search-core", diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index 9b910fea..ad464148 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -14,7 +14,7 @@ fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "2f3c3f107e3976ac059cc438d77916a592595d59" } -stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "8a42c28a8e7c5a32c838ae9dd443c21deab391ed" } +stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "0cfaad48a3df24f918cf52a2d5aaf32f5a031148" } [dev-dependencies] proptest = "1" diff --git a/crates/anonymize-core/tests/search.rs b/crates/anonymize-core/tests/search.rs index 8dadd9b5..62cabc0d 100644 --- a/crates/anonymize-core/tests/search.rs +++ b/crates/anonymize-core/tests/search.rs @@ -415,3 +415,53 @@ fn search_index_prepared_artifacts_reject_wrong_slot_count() { "missing prepared slot artifacts should fail" ); } + +#[test] +fn search_index_prepared_artifacts_reject_stale_patterns() { + let options = SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }; + let artifacts = SearchIndex::prepare_artifacts( + vec![SearchPattern::Literal(String::from("Alice"))], + options, + ) + .unwrap(); + let stale_patterns = vec![SearchPattern::Literal(String::from("Bob"))]; + + assert!( + SearchIndex::new_with_artifacts(stale_patterns, options, &artifacts) + .is_err(), + "same-count stale 
prepared artifacts should fail" + ); +} + +#[test] +fn search_index_prepared_artifacts_reject_stale_literal_options() { + let prepare_options = SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }; + let load_options = SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }; + let patterns = vec![SearchPattern::Literal(String::from("Alice"))]; + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), prepare_options).unwrap(); + + assert!( + SearchIndex::new_with_artifacts(patterns, load_options, &artifacts) + .is_err(), + "prepared artifacts should be bound to literal search options" + ); +} From ab23a72b937371c2748d3abe7bfb1a904b3e8de5 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 13:38:02 +0200 Subject: [PATCH 091/130] chore: pin text-search artifact identity fix --- Cargo.lock | 2 +- crates/anonymize-core/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 45ca6d91..acd7dc79 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "stella-text-search-core" version = "1.0.6" -source = "git+https://github.com/stella/text-search?rev=0cfaad48a3df24f918cf52a2d5aaf32f5a031148#0cfaad48a3df24f918cf52a2d5aaf32f5a031148" +source = "git+https://github.com/stella/text-search?rev=aaeded107370c8ac7479536432975c750af0c426#aaeded107370c8ac7479536432975c750af0c426" dependencies = [ "stella-aho-corasick-core", "stella-fuzzy-search-core", diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index ad464148..9a3c578c 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -14,7 +14,7 @@ fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } stella-stdnum-core = { version = "2.1.1", git = 
"https://github.com/stella/stdnum", rev = "2f3c3f107e3976ac059cc438d77916a592595d59" } -stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "0cfaad48a3df24f918cf52a2d5aaf32f5a031148" } +stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "aaeded107370c8ac7479536432975c750af0c426" } [dev-dependencies] proptest = "1" From d72ab1a52d761df8729ce24c62872d9b1a177f42 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 13:43:20 +0200 Subject: [PATCH 092/130] test: strengthen search artifact properties --- .../tests/primitives_properties.rs | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/crates/anonymize-core/tests/primitives_properties.rs b/crates/anonymize-core/tests/primitives_properties.rs index a6285c1d..a155df25 100644 --- a/crates/anonymize-core/tests/primitives_properties.rs +++ b/crates/anonymize-core/tests/primitives_properties.rs @@ -213,6 +213,23 @@ fn mixed_search_case() }) } +fn mutated_search_patterns(patterns: &[SearchPattern]) -> Vec { + let mut result = patterns.to_vec(); + let Some(first) = result.first_mut() else { + return result; + }; + + match first { + SearchPattern::Literal(pattern) + | SearchPattern::Regex(pattern) + | SearchPattern::Fuzzy { pattern, .. } + | SearchPattern::LiteralWithOptions { pattern, .. } + | SearchPattern::RegexWithOptions { pattern, .. } => pattern.push('x'), + } + + result +} + #[derive(Clone, Copy, Debug)] enum ArtifactCorruption { Header, @@ -506,6 +523,36 @@ proptest! 
{ )); } + #[test] + fn direct_mixed_search_match_presence_matches_find_iter( + (patterns, options, haystack) in mixed_search_case(), + ) { + let index = SearchIndex::new(patterns.clone(), options).unwrap(); + let matches = index.find_iter(&haystack).unwrap(); + + prop_assert_eq!(index.is_match(&haystack).unwrap(), !matches.is_empty()); + prop_assert!(search_output_is_valid( + &haystack, + patterns.len(), + &matches, + )); + } + + #[test] + fn prepared_mixed_search_artifacts_reject_same_shape_stale_patterns( + (patterns, options, _haystack) in mixed_search_case(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + let stale_patterns = mutated_search_patterns(&patterns); + prop_assume!(stale_patterns != patterns); + + prop_assert!( + SearchIndex::new_with_artifacts(stale_patterns, options, &artifacts) + .is_err() + ); + } + #[test] fn malformed_search_artifacts_fail_closed( (patterns, options, _haystack) in literal_search_case(), From 84a8ecce46928d52ad6fa10334495340a5f3c5ae Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 13:46:27 +0200 Subject: [PATCH 093/130] chore: pin text-search overlap artifact fix --- Cargo.lock | 2 +- crates/anonymize-core/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index acd7dc79..8d1f4569 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "stella-text-search-core" version = "1.0.6" -source = "git+https://github.com/stella/text-search?rev=aaeded107370c8ac7479536432975c750af0c426#aaeded107370c8ac7479536432975c750af0c426" +source = "git+https://github.com/stella/text-search?rev=8b0e074ea2d4fdb7d21ad02d36f949dbf1e23c77#8b0e074ea2d4fdb7d21ad02d36f949dbf1e23c77" dependencies = [ "stella-aho-corasick-core", "stella-fuzzy-search-core", diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index 9a3c578c..61a9e6a3 100644 --- 
a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -14,7 +14,7 @@ fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "2f3c3f107e3976ac059cc438d77916a592595d59" } -stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "aaeded107370c8ac7479536432975c750af0c426" } +stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "8b0e074ea2d4fdb7d21ad02d36f949dbf1e23c77" } [dev-dependencies] proptest = "1" From 9e2b4761183d0f03ebf1b27a93ce27b073f43a10 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 13:58:35 +0200 Subject: [PATCH 094/130] test: gate native fixture parity --- .github/workflows/ci.yml | 12 ++++++++++++ .../anonymize/scripts/migration-fixture-perf.mjs | 6 +++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fe7a8dcb..9d328153 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -86,6 +86,18 @@ jobs: git fetch origin main --depth=1 bun run --cwd packages/anonymize perf:migration-fixtures + - name: Native fixture parity and performance + env: + ANONYMIZE_MIGRATION_ALLOW_ACCEPTED_MISMATCHES: "1" + ANONYMIZE_MIGRATION_CANDIDATE_RUNTIME: native-static + ANONYMIZE_MIGRATION_FAIL_ON_MISMATCH: "1" + ANONYMIZE_MIGRATION_NATIVE_COMPRESSED_PACKAGE: "1" + ANONYMIZE_MIGRATION_NATIVE_PREPARED_PACKAGE: "1" + ANONYMIZE_MIGRATION_REQUIRE_NATIVE_PIPELINE: "1" + run: | + git fetch origin main --depth=1 + bun run --cwd packages/anonymize perf:migration-fixtures + - name: Contract performance (informational) # Surfaces pipeline-latency regressions against the thresholds in # contract-perf.mjs. 
Non-blocking for now (shared-runner timing is diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 40cd961a..51987b64 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -36,6 +36,8 @@ const CANDIDATE_RUNTIME = const FAIL_ON_MISMATCH = process.env.ANONYMIZE_MIGRATION_FAIL_ON_MISMATCH ?? (CANDIDATE_RUNTIME === "typescript" ? "1" : "0"); +const ALLOW_ACCEPTED_MISMATCHES = + process.env.ANONYMIZE_MIGRATION_ALLOW_ACCEPTED_MISMATCHES === "1"; const WARM_ITERATIONS = positiveIntegerEnv( "ANONYMIZE_MIGRATION_WARM_ITERATIONS", 2, @@ -179,7 +181,9 @@ async function runCoordinator() { if (baseline !== null) { const comparison = compareSnapshots(baseline, candidate); console.log(JSON.stringify(comparison)); - if (!comparison.equal && FAIL_ON_MISMATCH !== "0") { + const acceptedByPolicy = + ALLOW_ACCEPTED_MISMATCHES && comparison.acceptedEqual; + if (!comparison.equal && !acceptedByPolicy && FAIL_ON_MISMATCH !== "0") { throw new Error( `Fixture parity failed for ${comparison.mismatches.length} fixture(s)`, ); From edc60b0aa26215aafc86f93ac4507a14979e6941 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 14:13:49 +0200 Subject: [PATCH 095/130] test: strengthen primitive invariants --- .../tests/primitives_properties.rs | 214 ++++++++++++++++++ 1 file changed, 214 insertions(+) diff --git a/crates/anonymize-core/tests/primitives_properties.rs b/crates/anonymize-core/tests/primitives_properties.rs index a155df25..da402273 100644 --- a/crates/anonymize-core/tests/primitives_properties.rs +++ b/crates/anonymize-core/tests/primitives_properties.rs @@ -108,6 +108,80 @@ fn redaction_case() -> impl Strategy)> { ) } +fn reserved_person_redaction_case() +-> impl Strategy, u32)> { + ( + 1_u32..8, + collection::vec((text_fragment(8), entity_text()), 1..8), + ) + .prop_map(|(reserved_count, segments)| { + let mut text = 
(1..=reserved_count) + .map(|index| format!("[PERSON_{index}]")) + .collect::>() + .join(" "); + text.push(' '); + + let mut entities = Vec::new(); + for (prefix, value) in segments { + text.push_str(&prefix); + let start = byte_len(&text); + text.push_str(&value); + let end = byte_len(&text); + entities.push(Entity::detected(start, end, "person", value)); + } + + (text, entities, reserved_count) + }) +} + +fn displayed_entity_case() -> impl Strategy { + ( + text_fragment(8), + entity_text(), + text_fragment(8), + entity_text(), + ) + .prop_map(|(prefix, value, suffix, display_text)| { + let start = byte_len(&prefix); + let end = start.saturating_add(byte_len(&value)); + let text = format!("{prefix}{value}{suffix}"); + let entity = Entity::detected(start, end, "person", display_text); + (text, entity, value) + }) +} + +fn same_alias_coreference_case() +-> impl Strategy, String, String)> { + (entity_text(), entity_text(), entity_text()).prop_map( + |(alias, first_source_seed, second_source_seed)| { + let source_a = format!("{first_source_seed} source A"); + let source_b = format!("{second_source_seed} source B"); + let text = format!("{alias} met {alias}."); + let first_start = 0; + let first_end = byte_len(&alias); + let second_start = byte_len(&format!("{alias} met ")); + let second_end = second_start.saturating_add(byte_len(&alias)); + let entities = vec![ + Entity::coreference( + first_start, + first_end, + "person", + alias.clone(), + source_a.clone(), + ), + Entity::coreference( + second_start, + second_end, + "person", + alias, + source_b.clone(), + ), + ]; + (text, entities, source_a, source_b) + }, + ) +} + fn pipeline_entity_strategy() -> impl Strategy { ( 0_u32..80, @@ -169,6 +243,42 @@ fn literal_search_case() }) } +fn all_literal_identity_search_case() +-> impl Strategy, SearchOptions, String)> { + collection::vec(entity_text(), 1..6) + .prop_flat_map(|needles| { + let patterns = needles + .iter() + .map(|needle| SearchPattern::Literal(needle.clone())) 
+ .collect::>(); + ( + Just(patterns), + collection::vec((text_fragment(8), sample::select(needles)), 1..10), + text_fragment(8), + ) + }) + .prop_map(|(patterns, segments, suffix)| { + let mut haystack = String::new(); + for (prefix, needle) in segments { + haystack.push_str(&prefix); + haystack.push_str(&needle); + } + haystack.push_str(&suffix); + + ( + patterns, + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }, + haystack, + ) + }) +} + fn mixed_search_case() -> impl Strategy, SearchOptions, String)> { ( @@ -307,6 +417,14 @@ fn search_output_is_valid( true } +fn person_placeholder_number(placeholder: &str) -> Option { + placeholder + .strip_prefix("[PERSON_")? + .strip_suffix(']')? + .parse::() + .ok() +} + proptest! { #![proptest_config(ProptestConfig { cases: PROPERTY_CASES, @@ -328,6 +446,63 @@ proptest! { } } + #[test] + fn generated_redactions_skip_reserved_person_placeholders( + (text, entities, reserved_count) in reserved_person_redaction_case(), + ) { + let result = redact_text(&text, &entities, &OperatorConfig::default()) + .unwrap(); + + prop_assert_eq!(result.entity_count, entities.len()); + let restored = deanonymise(&result.redacted_text, &result.redaction_map); + prop_assert_eq!(restored.as_str(), text.as_str()); + + for entry in &result.redaction_map { + prop_assert!(!text.contains(&entry.placeholder)); + let Some(index) = person_placeholder_number(&entry.placeholder) else { + prop_assert!(false, "unexpected placeholder {}", entry.placeholder); + continue; + }; + prop_assert!(index > reserved_count); + } + } + + #[test] + fn generated_detected_originals_use_source_slice_not_display_text( + (text, entity, source_slice) in displayed_entity_case(), + ) { + let result = redact_text( + &text, + std::slice::from_ref(&entity), + &OperatorConfig::default(), + ) + .unwrap(); + + prop_assert_eq!(result.redaction_map.len(), 1); + 
prop_assert_eq!(result.redaction_map[0].original.as_str(), source_slice.as_str()); + let restored = deanonymise(&result.redacted_text, &result.redaction_map); + prop_assert_eq!(restored.as_str(), text.as_str()); + } + + #[test] + fn generated_same_alias_coreferences_keep_distinct_source_identity( + (text, entities, source_a, source_b) in same_alias_coreference_case(), + ) { + let result = redact_text(&text, &entities, &OperatorConfig::default()) + .unwrap(); + + prop_assert_eq!(result.entity_count, 2); + prop_assert_eq!(result.redaction_map.len(), 2); + prop_assert!( + result.redaction_map[0].placeholder + != result.redaction_map[1].placeholder, + ); + prop_assert_eq!(result.redaction_map[0].original.as_str(), source_a.as_str()); + prop_assert_eq!(result.redaction_map[1].original.as_str(), source_b.as_str()); + prop_assert!(result.redacted_text.contains(&result.redaction_map[0].placeholder)); + prop_assert!(result.redacted_text.contains(&result.redaction_map[1].placeholder)); + } + #[test] fn generated_entity_spans_fail_or_round_trip( text in text_fragment(32), @@ -499,6 +674,45 @@ proptest! 
{ )); } + #[test] + fn prepared_all_literal_artifacts_load_without_original_patterns( + (patterns, options, haystack) in all_literal_identity_search_case(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + let encoded = artifacts.to_bytes().unwrap(); + let decoded = SearchIndexArtifacts::from_bytes(&encoded).unwrap(); + + let direct = SearchIndex::new(patterns.clone(), options).unwrap(); + let prepared = + SearchIndex::new_with_artifacts(Vec::new(), options, &decoded) + .unwrap(); + let direct_matches = direct.find_iter(&haystack).unwrap(); + let prepared_matches = prepared.find_iter(&haystack).unwrap(); + + prop_assert_eq!(prepared.len(), patterns.len()); + prop_assert_eq!(&prepared_matches, &direct_matches); + prop_assert!(search_output_is_valid( + &haystack, + patterns.len(), + &prepared_matches, + )); + } + + #[test] + fn artifact_only_literal_loader_rejects_per_pattern_literal_options( + (patterns, options, _haystack) in literal_search_case(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns, options).unwrap(); + let encoded = artifacts.to_bytes().unwrap(); + let decoded = SearchIndexArtifacts::from_bytes(&encoded).unwrap(); + + prop_assert!( + SearchIndex::new_with_artifacts(Vec::new(), options, &decoded).is_err() + ); + } + #[test] fn prepared_mixed_search_artifacts_match_direct_search( (patterns, options, haystack) in mixed_search_case(), From e3ce0cc148aa296c8b11b94fc39142ce6a6f1d52 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 16:59:44 +0200 Subject: [PATCH 096/130] test: add native package timing harness --- packages/anonymize/package.json | 1 + .../scripts/migration-fixture-perf.mjs | 63 +++++++++++ .../scripts/native-package-ux-perf.mjs | 106 ++++++++++++++++++ 3 files changed, 170 insertions(+) create mode 100644 packages/anonymize/scripts/native-package-ux-perf.mjs diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index 75890fa1..08c508b9 
100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -53,6 +53,7 @@ "perf:contracts": "bun scripts/contract-perf.mjs", "perf:migration-fixtures": "bun scripts/migration-fixture-perf.mjs", "perf:native-adapters": "bun scripts/native-adapter-perf.mjs", + "perf:native-package": "bun scripts/native-package-ux-perf.mjs", "smoke:dist": "bun scripts/dist-smoke.mjs", "format": "oxfmt ." }, diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 51987b64..76b05d16 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -432,6 +432,19 @@ async function runWorker() { const snapshots = Object.fromEntries( coldRun.fixtures.map((fixture) => [fixture.fixture, fixture.snapshot]), ); + const nativeTimingScenario = describeNativeTimingScenario({ + runtime, + usePrebuiltNativePackage, + usePrebuiltNativeConfig, + nativePackageCompressed, + nativePackageBytes, + nativePackageReadMs, + nativePackagePrepareMs, + nativePrepareMs, + nativeCachedPrepareAvgMs, + coldRunMs: coldRun.ms, + warmAvgMs, + }); writeFileSync( resultPath, @@ -458,11 +471,15 @@ async function runWorker() { nativePrepareMs, nativeCachedPrepareMsByIteration, nativeCachedPrepareAvgMs, + nativeFirstTouchMs: nativeTimingScenario.firstTouchMs, + nativeWarmClickMs: nativeTimingScenario.warmClickMs, coldRunMs: coldRun.ms, coldPipelineMs: roundMs( dictionaryMs + prepareMs + nativeConfigReadMs + + nativeConfigParseMs + + nativePackageReadMs + nativeStringifyMs + nativePrepareMs + coldRun.ms, @@ -472,6 +489,8 @@ async function runWorker() { dictionaryMs + prepareMs + nativeConfigReadMs + + nativeConfigParseMs + + nativePackageReadMs + nativeStringifyMs + nativePrepareMs + coldRun.ms, @@ -480,6 +499,7 @@ async function runWorker() { warmRunMs, warmAvgMs, }, + nativeTimingScenario, nativeDiagnostics, fixtureTimings, fixtures: coldRun.fixtures.map( @@ 
-495,6 +515,49 @@ async function runWorker() { ); } +function describeNativeTimingScenario({ + runtime, + usePrebuiltNativePackage, + usePrebuiltNativeConfig, + nativePackageCompressed, + nativePackageBytes, + nativePackageReadMs, + nativePackagePrepareMs, + nativePrepareMs, + nativeCachedPrepareAvgMs, + coldRunMs, + warmAvgMs, +}) { + if (runtime !== "native-static") { + return { + mode: "typescript", + firstTouchMs: 0, + warmClickMs: 0, + }; + } + + const mode = usePrebuiltNativePackage + ? "prebuilt-package" + : usePrebuiltNativeConfig + ? "prebuilt-config" + : nativePackagePrepareMs > 0 + ? "build-package-in-process" + : "build-config-in-process"; + + return { + mode, + packageCompressed: nativePackageCompressed, + packageBytes: nativePackageBytes, + packageReadMs: nativePackageReadMs, + offlinePackageBuildMs: nativePackagePrepareMs, + firstPrepareMs: nativePrepareMs, + cachedPrepareMs: nativeCachedPrepareAvgMs, + firstRunMs: coldRunMs, + firstTouchMs: roundMs(nativePackageReadMs + nativePrepareMs + coldRunMs), + warmClickMs: warmAvgMs, + }; +} + async function prepareNativeStaticSearch({ sourceRoot, variant, diff --git a/packages/anonymize/scripts/native-package-ux-perf.mjs b/packages/anonymize/scripts/native-package-ux-perf.mjs new file mode 100644 index 00000000..b6de6784 --- /dev/null +++ b/packages/anonymize/scripts/native-package-ux-perf.mjs @@ -0,0 +1,106 @@ +import { spawnSync } from "node:child_process"; +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { dirname, join, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; + +const SCRIPT_PATH = fileURLToPath(import.meta.url); +const PACKAGE_DIR = dirname(dirname(SCRIPT_PATH)); +const ROOT_DIR = resolve(join(PACKAGE_DIR, "..", "..")); +const MIGRATION_SCRIPT = join( + PACKAGE_DIR, + "scripts", + "migration-fixture-perf.mjs", +); + +const SCENARIOS = [ + { name: "compressed", compressed: true }, + { name: "raw", compressed: false }, +]; + +const 
tempRoot = mkdtempSync(join(tmpdir(), "stella-anonymize-package-ux-")); + +try { + const scenarios = SCENARIOS.map((scenario) => runScenario(scenario)); + console.log( + JSON.stringify({ + event: "native-package-ux-perf", + scenarios, + }), + ); +} finally { + rmSync(tempRoot, { force: true, recursive: true }); +} + +function runScenario({ name, compressed }) { + const packagePath = join(tempRoot, `${name}.stlanonpkg`); + const build = runMigration({ + ANONYMIZE_MIGRATION_NATIVE_COMPRESSED_PACKAGE: compressed ? "1" : "0", + ANONYMIZE_MIGRATION_NATIVE_PREPARED_PACKAGE: "1", + ANONYMIZE_MIGRATION_WRITE_NATIVE_PACKAGE_PATH: packagePath, + }); + const load = runMigration({ + ANONYMIZE_MIGRATION_NATIVE_PACKAGE_PATH: packagePath, + }); + + return { + name, + compressed, + fixtureCount: load.fixtureCount, + packageBytes: build.timings.nativePackageBytes, + offlinePackageBuildMs: build.timings.nativePackagePrepareMs, + firstPackageReadMs: load.timings.nativePackageReadMs, + firstPrepareMs: load.timings.nativePrepareMs, + cachedPrepareMs: load.timings.nativeCachedPrepareAvgMs, + firstRunMs: load.timings.coldRunMs, + firstTouchMs: load.timings.nativeFirstTouchMs, + warmClickMs: load.timings.nativeWarmClickMs, + fixtureTimings: load.fixtureTimings, + topColdFixtures: load.fixtureTimings.byFixture + .toSorted((left, right) => right.coldMs - left.coldMs) + .slice(0, 5), + }; +} + +function runMigration(extraEnv) { + const child = spawnSync(process.execPath, [MIGRATION_SCRIPT], { + cwd: ROOT_DIR, + env: { + ...process.env, + ...extraEnv, + ANONYMIZE_MIGRATION_CANDIDATE_RUNTIME: "native-static", + ANONYMIZE_MIGRATION_COMPARE_BASELINE: "0", + ANONYMIZE_MIGRATION_REQUIRE_NATIVE_PIPELINE: "1", + }, + encoding: "utf8", + maxBuffer: 64 * 1024 * 1024, + }); + + if (child.status !== 0) { + throw new Error( + [ + "Native package UX benchmark failed", + child.stdout.trim(), + child.stderr.trim(), + ] + .filter(Boolean) + .join("\n"), + ); + } + + return parseVariant(child.stdout); +} + 
+function parseVariant(stdout) { + for (const line of stdout.trim().split("\n").toReversed()) { + try { + const parsed = JSON.parse(line); + if (parsed.event === "fixture-migration-variant") { + return parsed; + } + } catch { + continue; + } + } + throw new Error("Migration benchmark did not emit a variant summary"); +} From ae4a66c82002c0c7b323e5adb841eb93740351f2 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 17:02:53 +0200 Subject: [PATCH 097/130] feat: add typed python sdk facade --- .github/workflows/ci.yml | 11 ++ .github/workflows/release.yml | 9 ++ crates/anonymize-py/pyproject.toml | 3 +- .../python/stella_anonymize/__init__.py | 62 ++++++++ .../python/stella_anonymize/__init__.pyi | 33 +++++ .../python/stella_anonymize/_native.pyi | 93 ++++++++++++ .../python/stella_anonymize/py.typed | 1 + crates/anonymize-py/src/lib.rs | 2 +- crates/anonymize-py/typecheck/sdk_usage.py | 20 +++ package.json | 1 + .../anonymize/scripts/native-adapter-perf.mjs | 8 +- .../__test__/native-adapter-parity.test.ts | 137 ++++++++++++++++-- 12 files changed, 366 insertions(+), 14 deletions(-) create mode 100644 crates/anonymize-py/python/stella_anonymize/__init__.py create mode 100644 crates/anonymize-py/python/stella_anonymize/__init__.pyi create mode 100644 crates/anonymize-py/python/stella_anonymize/_native.pyi create mode 100644 crates/anonymize-py/python/stella_anonymize/py.typed create mode 100644 crates/anonymize-py/typecheck/sdk_usage.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9d328153..da6ca6c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,5 +1,8 @@ name: CI +env: + UV_VERSION: "0.10.1" + on: push: branches: [main] @@ -72,6 +75,14 @@ jobs: - name: Typecheck run: bun run typecheck + - name: Install uv + run: | + python3 -m pip install --user "uv==${{ env.UV_VERSION }}" + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + + - name: Python typecheck + run: bun run python:typecheck + - name: Test run: bun 
run test diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index bd18c3ae..ec032b4d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -2,6 +2,7 @@ name: Release env: NPM_VERSION: "11.11.1" + UV_VERSION: "0.10.1" on: push: @@ -54,6 +55,14 @@ jobs: - name: Typecheck run: bun run typecheck + - name: Install uv + run: | + python3 -m pip install --user "uv==${{ env.UV_VERSION }}" + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + + - name: Python typecheck + run: bun run python:typecheck + - name: Test run: bun run test diff --git a/crates/anonymize-py/pyproject.toml b/crates/anonymize-py/pyproject.toml index d8962e21..9357fcb6 100644 --- a/crates/anonymize-py/pyproject.toml +++ b/crates/anonymize-py/pyproject.toml @@ -17,4 +17,5 @@ classifiers = [ [tool.maturin] manifest-path = "Cargo.toml" -module-name = "stella_anonymize_core_py" +module-name = "stella_anonymize._native" +python-source = "python" diff --git a/crates/anonymize-py/python/stella_anonymize/__init__.py b/crates/anonymize-py/python/stella_anonymize/__init__.py new file mode 100644 index 00000000..2b5a84b9 --- /dev/null +++ b/crates/anonymize-py/python/stella_anonymize/__init__.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from functools import lru_cache +from os import PathLike + +from ._native import ( + OperatorEntry, + PipelineEntity, + PreparedSearch, + RedactionEntry, + RedactionResult, + StaticRedactionResult, + native_package_version, + normalize_for_search, + prepare_static_search_artifacts_bytes, + prepare_static_search_compressed_package_bytes, + prepare_static_search_package_bytes, + redact_static_entities_diagnostics_json, + redact_static_entities_json, +) + +__all__ = [ + "OperatorEntry", + "PipelineEntity", + "PreparedSearch", + "RedactionEntry", + "RedactionResult", + "StaticRedactionResult", + "load_prepared_package", + "load_prepared_package_file", + "native_package_version", + "normalize_for_search", + 
"prepare_search_package", + "prepare_static_search_artifacts_bytes", + "prepare_static_search_compressed_package_bytes", + "prepare_static_search_package_bytes", + "redact_static_entities_diagnostics_json", + "redact_static_entities_json", +] + +BytesLike = bytes | bytearray | memoryview +PathLikeString = str | PathLike[str] + + +def prepare_search_package(config_json: str, *, compressed: bool = True) -> bytes: + if compressed: + return prepare_static_search_compressed_package_bytes(config_json) + return prepare_static_search_package_bytes(config_json) + + +def load_prepared_package(package_bytes: BytesLike) -> PreparedSearch: + return _load_prepared_package(bytes(package_bytes)) + + +def load_prepared_package_file(package_path: PathLikeString) -> PreparedSearch: + with open(package_path, "rb") as handle: + return load_prepared_package(handle.read()) + + +@lru_cache(maxsize=8) +def _load_prepared_package(package_bytes: bytes) -> PreparedSearch: + return PreparedSearch.from_prepared_package_bytes(package_bytes) diff --git a/crates/anonymize-py/python/stella_anonymize/__init__.pyi b/crates/anonymize-py/python/stella_anonymize/__init__.pyi new file mode 100644 index 00000000..24501056 --- /dev/null +++ b/crates/anonymize-py/python/stella_anonymize/__init__.pyi @@ -0,0 +1,33 @@ +from __future__ import annotations + +from os import PathLike +from typing import TypeAlias + +from ._native import ( + OperatorEntry as OperatorEntry, + PipelineEntity as PipelineEntity, + PreparedSearch as PreparedSearch, + RedactionEntry as RedactionEntry, + RedactionResult as RedactionResult, + StaticRedactionResult as StaticRedactionResult, + native_package_version as native_package_version, + normalize_for_search as normalize_for_search, + prepare_static_search_artifacts_bytes as prepare_static_search_artifacts_bytes, + prepare_static_search_compressed_package_bytes as prepare_static_search_compressed_package_bytes, + prepare_static_search_package_bytes as 
prepare_static_search_package_bytes, + redact_static_entities_diagnostics_json as redact_static_entities_diagnostics_json, + redact_static_entities_json as redact_static_entities_json, +) + +BytesLike: TypeAlias = bytes | bytearray | memoryview +PathLikeString: TypeAlias = str | PathLike[str] + +def prepare_search_package( + config_json: str, *, compressed: bool = True +) -> bytes: ... +def load_prepared_package(package_bytes: BytesLike) -> PreparedSearch: ... +def load_prepared_package_file( + package_path: PathLikeString, +) -> PreparedSearch: ... + +__all__: list[str] diff --git a/crates/anonymize-py/python/stella_anonymize/_native.pyi b/crates/anonymize-py/python/stella_anonymize/_native.pyi new file mode 100644 index 00000000..e871a0f2 --- /dev/null +++ b/crates/anonymize-py/python/stella_anonymize/_native.pyi @@ -0,0 +1,93 @@ +from __future__ import annotations + +from typing import TypeAlias + +BytesLike: TypeAlias = bytes | bytearray | memoryview + +class RedactionEntry: + @property + def placeholder(self) -> str: ... + @property + def original(self) -> str: ... + +class OperatorEntry: + @property + def placeholder(self) -> str: ... + @property + def operator(self) -> str: ... + +class RedactionResult: + @property + def redacted_text(self) -> str: ... + @property + def redaction_map(self) -> list[RedactionEntry]: ... + @property + def operator_map(self) -> list[OperatorEntry]: ... + @property + def entity_count(self) -> int: ... + +class PipelineEntity: + @property + def start(self) -> int: ... + @property + def end(self) -> int: ... + @property + def label(self) -> str: ... + @property + def text(self) -> str: ... + @property + def score(self) -> float: ... + @property + def source(self) -> str: ... + @property + def source_detail(self) -> str | None: ... + +class StaticRedactionResult: + @property + def resolved_entities(self) -> list[PipelineEntity]: ... + @property + def redaction(self) -> RedactionResult: ... 
+ +class PreparedSearch: + def __init__(self, config_json: str) -> None: ... + @staticmethod + def from_config_json_and_artifact_bytes( + config_json: str, + artifact_bytes: BytesLike, + ) -> PreparedSearch: ... + @staticmethod + def from_prepared_package_bytes( + package_bytes: BytesLike, + ) -> PreparedSearch: ... + def prepare_diagnostics_json(self) -> str: ... + def redact_static_entities( + self, + full_text: str, + operators_json: str | None = None, + ) -> StaticRedactionResult: ... + def redact_static_entities_json( + self, + full_text: str, + operators_json: str | None = None, + ) -> str: ... + def redact_static_entities_diagnostics_json( + self, + full_text: str, + operators_json: str | None = None, + ) -> str: ... + +def redact_static_entities_json( + config_json: str, + full_text: str, + operators_json: str | None = None, +) -> str: ... +def prepare_static_search_artifacts_bytes(config_json: str) -> bytes: ... +def prepare_static_search_package_bytes(config_json: str) -> bytes: ... +def prepare_static_search_compressed_package_bytes(config_json: str) -> bytes: ... +def redact_static_entities_diagnostics_json( + config_json: str, + full_text: str, + operators_json: str | None = None, +) -> str: ... +def normalize_for_search(text: str) -> str: ... +def native_package_version() -> str: ... 
diff --git a/crates/anonymize-py/python/stella_anonymize/py.typed b/crates/anonymize-py/python/stella_anonymize/py.typed new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/crates/anonymize-py/python/stella_anonymize/py.typed @@ -0,0 +1 @@ + diff --git a/crates/anonymize-py/src/lib.rs b/crates/anonymize-py/src/lib.rs index e4e23a47..4cf4659a 100644 --- a/crates/anonymize-py/src/lib.rs +++ b/crates/anonymize-py/src/lib.rs @@ -449,7 +449,7 @@ fn to_py_serde_error(error: &serde_json::Error) -> PyErr { } #[pymodule(gil_used = false)] -fn stella_anonymize_core_py(module: &Bound<'_, PyModule>) -> PyResult<()> { +fn _native(module: &Bound<'_, PyModule>) -> PyResult<()> { module.add_class::()?; module.add_class::()?; module.add_class::()?; diff --git a/crates/anonymize-py/typecheck/sdk_usage.py b/crates/anonymize-py/typecheck/sdk_usage.py new file mode 100644 index 00000000..0fc2cfbe --- /dev/null +++ b/crates/anonymize-py/typecheck/sdk_usage.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +import stella_anonymize as anonymize + + +def redact_with_prepared_package(config_json: str, text: str) -> str: + package_bytes = anonymize.prepare_search_package(config_json) + prepared = anonymize.load_prepared_package(package_bytes) + result = prepared.redact_static_entities(text) + return result.redaction.redacted_text + + +def redact_with_package_file(package_path: str, text: str) -> int: + prepared = anonymize.load_prepared_package_file(package_path) + result = prepared.redact_static_entities(text) + return result.redaction.entity_count + + +def runtime_version() -> str: + return anonymize.native_package_version() diff --git a/package.json b/package.json index 11458b04..a605f1cc 100644 --- a/package.json +++ b/package.json @@ -21,6 +21,7 @@ "rust:lint": "cargo ci-clippy", "rust:test": "cargo ci-test", "rust:check": "bun run rust:fmt && bun run rust:lint && bun run rust:test", + "python:typecheck": "uvx ty check --extra-search-path crates/anonymize-py/python 
crates/anonymize-py/typecheck", "sync:version": "node .github/tools/sync-runtime-version.mjs", "check:version": "node .github/tools/sync-runtime-version.mjs --check", "check:bun": "node .github/tools/check-bun-workflows.mjs", diff --git a/packages/anonymize/scripts/native-adapter-perf.mjs b/packages/anonymize/scripts/native-adapter-perf.mjs index 38b0ccbd..d5b10fe1 100644 --- a/packages/anonymize/scripts/native-adapter-perf.mjs +++ b/packages/anonymize/scripts/native-adapter-perf.mjs @@ -1,5 +1,5 @@ import { spawnSync } from "node:child_process"; -import { copyFileSync, mkdtempSync } from "node:fs"; +import { copyFileSync, mkdirSync, mkdtempSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { createRequire } from "node:module"; @@ -91,7 +91,7 @@ import time module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) spec = importlib.util.spec_from_file_location( - "stella_anonymize_core_py", + "_native", module_path, ) module = importlib.util.module_from_spec(spec) @@ -123,7 +123,9 @@ runCommand("cargo", [ const tempDir = mkdtempSync(join(tmpdir(), "stella-anonymize-perf-")); const napiPath = join(tempDir, "stella_anonymize_napi.node"); -const pythonModulePath = join(tempDir, "stella_anonymize_core_py.so"); +const pythonPackageDir = join(tempDir, "stella_anonymize"); +mkdirSync(pythonPackageDir); +const pythonModulePath = join(pythonPackageDir, "_native.so"); copyFileSync(nativeLibraryPath("stella_anonymize_napi"), napiPath); copyFileSync(nativeLibraryPath("stella_anonymize_core_py"), pythonModulePath); diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 90abf193..4e5049e3 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -1,6 +1,7 @@ import { spawnSync } from "node:child_process"; import { copyFileSync, + mkdirSync, 
mkdtempSync, readFileSync, readdirSync, @@ -143,6 +144,7 @@ type PythonNativeOffsetSlice = { const ROOT_DIR = join(import.meta.dir, "..", "..", "..", ".."); const TARGET_DIR = join(ROOT_DIR, "target", "debug"); +const PYTHON_SOURCE_DIR = join(ROOT_DIR, "crates", "anonymize-py", "python"); const CONTRACT_FIXTURES_DIR = join( ROOT_DIR, "packages", @@ -239,7 +241,7 @@ import pathlib module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) spec = importlib.util.spec_from_file_location( - "stella_anonymize_core_py", + "_native", module_path, ) module = importlib.util.module_from_spec(spec) @@ -267,7 +269,7 @@ import pathlib module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) spec = importlib.util.spec_from_file_location( - "stella_anonymize_core_py", + "_native", module_path, ) module = importlib.util.module_from_spec(spec) @@ -312,7 +314,7 @@ import pathlib module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) spec = importlib.util.spec_from_file_location( - "stella_anonymize_core_py", + "_native", module_path, ) module = importlib.util.module_from_spec(spec) @@ -330,7 +332,7 @@ module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) artifact_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_ARTIFACTS"]) spec = importlib.util.spec_from_file_location( - "stella_anonymize_core_py", + "_native", module_path, ) module = importlib.util.module_from_spec(spec) @@ -361,7 +363,7 @@ module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) package_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PACKAGE"]) spec = importlib.util.spec_from_file_location( - "stella_anonymize_core_py", + "_native", module_path, ) module = 
importlib.util.module_from_spec(spec) @@ -393,7 +395,7 @@ module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) package_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PACKAGE"]) spec = importlib.util.spec_from_file_location( - "stella_anonymize_core_py", + "_native", module_path, ) module = importlib.util.module_from_spec(spec) @@ -419,6 +421,51 @@ results = [ print(json.dumps(results)) `; +const PYTHON_PACKAGE_FACADE_SCRIPT = ` +import json +import os +import pathlib +import sys + +module_root = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]).parent.parent +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +package_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PACKAGE"]) +sys.path.insert(0, str(module_root)) + +import stella_anonymize as anonymize + +payload = json.loads(payload_path.read_text()) +package_bytes = package_path.read_bytes() +if anonymize.prepare_search_package( + payload["config_json"], + compressed=payload["compressed"], +) != package_bytes: + raise AssertionError("facade package bytes differ") +prepared = anonymize.load_prepared_package(package_bytes) +if prepared is not anonymize.load_prepared_package(package_bytes): + raise AssertionError("facade package cache did not reuse prepared search") +from_file = anonymize.load_prepared_package_file(package_path) +print( + json.dumps( + { + "from_bytes": json.loads( + prepared.redact_static_entities_json( + payload["text"], + payload.get("operators_json"), + ) + ), + "from_file": json.loads( + from_file.redact_static_entities_json( + payload["text"], + payload.get("operators_json"), + ) + ), + "version": anonymize.native_package_version(), + } + ) +) +`; + let loadedAdapters: { native: NativeAdapter; pythonModulePath: string; @@ -811,6 +858,31 @@ describe("native adapter parity", () => { ).toEqual(expectedJson); }); + test("Python package facade loads compressed package bytes", () => { + const 
adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const configBytes = Buffer.from(CONFIG_JSON); + const packageBytes = + adapters.native.prepareStaticSearchCompressedPackageBytes(configBytes); + const expectedJson = JSON.parse( + adapters.native.redactStaticEntitiesJson(CONFIG_JSON, text), + ); + const result = callPythonPackageFacade({ + pythonModulePath: adapters.pythonModulePath, + tempDir: adapters.tempDir, + packageBytes, + text, + operators: null, + compressed: true, + }); + + expect(result.from_bytes).toEqual(expectedJson); + expect(result.from_file).toEqual(expectedJson); + expect(result.version).toBe(packageJsonVersion()); + }); + test("native facade redacts from compressed package bytes", () => { const adapters = getAdapters(); const text = @@ -1649,9 +1721,15 @@ const getAdapters = () => { const tempDir = mkdtempSync(join(tmpdir(), "stella-anonymize-native-")); const napiPath = join(tempDir, "stella_anonymize_napi.node"); - const pythonModulePath = join(tempDir, "stella_anonymize_core_py.so"); + const pythonPackageDir = join(tempDir, "stella_anonymize"); + mkdirSync(pythonPackageDir); + const pythonModulePath = join(pythonPackageDir, "_native.so"); copyFileSync(nativeLibraryPath("stella_anonymize_napi"), napiPath); copyFileSync(nativeLibraryPath("stella_anonymize_core_py"), pythonModulePath); + copyFileSync( + join(PYTHON_SOURCE_DIR, "stella_anonymize", "__init__.py"), + join(pythonPackageDir, "__init__.py"), + ); const native = loadNativeAdapter(napiPath); loadedAdapters = { native, pythonModulePath, tempDir }; @@ -1828,7 +1906,7 @@ import pathlib module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) spec = importlib.util.spec_from_file_location( - "stella_anonymize_core_py", + "_native", module_path, ) module = importlib.util.module_from_spec(spec) @@ -1948,6 +2026,47 
@@ const callPythonPreparedPackageCases = ( return JSON.parse(output); }; +type PythonPackageFacadeOptions = { + pythonModulePath: string; + tempDir: string; + packageBytes: Buffer; + text: string; + operators: Record | null; + compressed: boolean; +}; + +const callPythonPackageFacade = ({ + pythonModulePath, + tempDir, + packageBytes, + text, + operators, + compressed, +}: PythonPackageFacadeOptions): { + from_bytes: StaticRedactionResult; + from_file: StaticRedactionResult; + version: string; +} => { + const payloadPath = join(tempDir, "package-facade-payload.json"); + const packagePath = join(tempDir, "package-facade.bin"); + writeFileSync(packagePath, packageBytes); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + text, + operators_json: operatorConfigJson(operators), + compressed, + }), + ); + const output = runCommand("python3", ["-c", PYTHON_PACKAGE_FACADE_SCRIPT], { + STELLA_ANONYMIZE_PACKAGE: packagePath, + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }); + return JSON.parse(output); +}; + const callPythonDiagnostics = ( pythonModulePath: string, text: string, @@ -1979,7 +2098,7 @@ import pathlib module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) spec = importlib.util.spec_from_file_location( - "stella_anonymize_core_py", + "_native", module_path, ) module = importlib.util.module_from_spec(spec) From f8f52e81cb4b328356421b6f6909d1ff72f14fd5 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 17:07:54 +0200 Subject: [PATCH 098/130] fix: simplify native timing mode --- .../scripts/migration-fixture-perf.mjs | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 76b05d16..41713965 100644 --- 
a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -536,16 +536,12 @@ function describeNativeTimingScenario({ }; } - const mode = usePrebuiltNativePackage - ? "prebuilt-package" - : usePrebuiltNativeConfig - ? "prebuilt-config" - : nativePackagePrepareMs > 0 - ? "build-package-in-process" - : "build-config-in-process"; - return { - mode, + mode: nativeTimingMode({ + usePrebuiltNativePackage, + usePrebuiltNativeConfig, + nativePackagePrepareMs, + }), packageCompressed: nativePackageCompressed, packageBytes: nativePackageBytes, packageReadMs: nativePackageReadMs, @@ -558,6 +554,23 @@ function describeNativeTimingScenario({ }; } +function nativeTimingMode({ + usePrebuiltNativePackage, + usePrebuiltNativeConfig, + nativePackagePrepareMs, +}) { + if (usePrebuiltNativePackage) { + return "prebuilt-package"; + } + if (usePrebuiltNativeConfig) { + return "prebuilt-config"; + } + if (nativePackagePrepareMs > 0) { + return "build-package-in-process"; + } + return "build-config-in-process"; +} + async function prepareNativeStaticSearch({ sourceRoot, variant, From 933c9b1b875515708d4aeabea3e78d1f6b7cc16f Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 17:16:53 +0200 Subject: [PATCH 099/130] chore: pin python type checker --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index a605f1cc..dd99f7a7 100644 --- a/package.json +++ b/package.json @@ -21,7 +21,7 @@ "rust:lint": "cargo ci-clippy", "rust:test": "cargo ci-test", "rust:check": "bun run rust:fmt && bun run rust:lint && bun run rust:test", - "python:typecheck": "uvx ty check --extra-search-path crates/anonymize-py/python crates/anonymize-py/typecheck", + "python:typecheck": "uvx --from ty==0.0.29 ty check --extra-search-path crates/anonymize-py/python crates/anonymize-py/typecheck", "sync:version": "node .github/tools/sync-runtime-version.mjs", "check:version": "node 
.github/tools/sync-runtime-version.mjs --check", "check:bun": "node .github/tools/check-bun-workflows.mjs", From 5b7b19d5358268a70268900686bf92872533424b Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 18:27:21 +0200 Subject: [PATCH 100/130] feat: align native sdk facades --- .../python/stella_anonymize/__init__.py | 177 +++++++++++++- .../python/stella_anonymize/__init__.pyi | 83 ++++++- crates/anonymize-py/typecheck/sdk_usage.py | 13 +- .../__test__/native-adapter-parity.test.ts | 231 ++++++++++++++++++ .../src/__test__/native-node.test.ts | 1 + .../src/__test__/pipeline-config.test.ts | 1 + packages/anonymize/src/index-shared.ts | 11 + packages/anonymize/src/native-node.ts | 73 ++++++ packages/anonymize/src/native.ts | 155 +++++++++++- 9 files changed, 732 insertions(+), 13 deletions(-) diff --git a/crates/anonymize-py/python/stella_anonymize/__init__.py b/crates/anonymize-py/python/stella_anonymize/__init__.py index 2b5a84b9..5e99429a 100644 --- a/crates/anonymize-py/python/stella_anonymize/__init__.py +++ b/crates/anonymize-py/python/stella_anonymize/__init__.py @@ -1,12 +1,14 @@ from __future__ import annotations +import json +from collections.abc import Mapping from functools import lru_cache from os import PathLike from ._native import ( OperatorEntry, PipelineEntity, - PreparedSearch, + PreparedSearch as NativePreparedSearch, RedactionEntry, RedactionResult, StaticRedactionResult, @@ -21,11 +23,15 @@ __all__ = [ "OperatorEntry", + "OperatorConfig", + "PreparedAnonymizer", + "NativePreparedSearch", "PipelineEntity", "PreparedSearch", "RedactionEntry", "RedactionResult", "StaticRedactionResult", + "diagnostics_json", "load_prepared_package", "load_prepared_package_file", "native_package_version", @@ -34,12 +40,126 @@ "prepare_static_search_artifacts_bytes", "prepare_static_search_compressed_package_bytes", "prepare_static_search_package_bytes", + "redact_text_json", "redact_static_entities_diagnostics_json", "redact_static_entities_json", ] 
BytesLike = bytes | bytearray | memoryview PathLikeString = str | PathLike[str] +OperatorConfig = Mapping[str, str] | str | None + + +class PreparedAnonymizer: + def __init__(self, prepared: NativePreparedSearch) -> None: + self._prepared = prepared + + @classmethod + def from_config_json(cls, config_json: str) -> PreparedAnonymizer: + return cls(NativePreparedSearch(config_json)) + + @classmethod + def from_config_json_and_artifact_bytes( + cls, + config_json: str, + artifact_bytes: BytesLike, + ) -> PreparedAnonymizer: + return cls( + NativePreparedSearch.from_config_json_and_artifact_bytes( + config_json, + bytes(artifact_bytes), + ) + ) + + @classmethod + def from_prepared_package_bytes( + cls, + package_bytes: BytesLike, + ) -> PreparedAnonymizer: + return cls( + NativePreparedSearch.from_prepared_package_bytes(bytes(package_bytes)) + ) + + def prepare_diagnostics_json(self) -> str: + return self._prepared.prepare_diagnostics_json() + + def redact_text( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> StaticRedactionResult: + return self._prepared.redact_static_entities( + full_text, + _operator_config_json(operators, redact_string=redact_string), + ) + + def redact_text_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: + return self._prepared.redact_static_entities_json( + full_text, + _operator_config_json(operators, redact_string=redact_string), + ) + + def diagnostics_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: + return self._prepared.redact_static_entities_diagnostics_json( + full_text, + _operator_config_json(operators, redact_string=redact_string), + ) + + def redact_static_entities( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> StaticRedactionResult: + return self.redact_text( + 
full_text, + operators, + redact_string=redact_string, + ) + + def redact_static_entities_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: + return self.redact_text_json( + full_text, + operators, + redact_string=redact_string, + ) + + def redact_static_entities_diagnostics_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: + return self.diagnostics_json( + full_text, + operators, + redact_string=redact_string, + ) + + +PreparedSearch = PreparedAnonymizer def prepare_search_package(config_json: str, *, compressed: bool = True) -> bytes: @@ -48,15 +168,62 @@ def prepare_search_package(config_json: str, *, compressed: bool = True) -> byte return prepare_static_search_package_bytes(config_json) -def load_prepared_package(package_bytes: BytesLike) -> PreparedSearch: +def load_prepared_package(package_bytes: BytesLike) -> PreparedAnonymizer: return _load_prepared_package(bytes(package_bytes)) -def load_prepared_package_file(package_path: PathLikeString) -> PreparedSearch: +def load_prepared_package_file(package_path: PathLikeString) -> PreparedAnonymizer: with open(package_path, "rb") as handle: return load_prepared_package(handle.read()) @lru_cache(maxsize=8) -def _load_prepared_package(package_bytes: bytes) -> PreparedSearch: - return PreparedSearch.from_prepared_package_bytes(package_bytes) +def _load_prepared_package(package_bytes: bytes) -> PreparedAnonymizer: + return PreparedAnonymizer.from_prepared_package_bytes(package_bytes) + + +def redact_text_json( + config_json: str, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, +) -> str: + return PreparedAnonymizer.from_config_json(config_json).redact_text_json( + full_text, + operators, + redact_string=redact_string, + ) + + +def diagnostics_json( + config_json: str, + full_text: str, + operators: OperatorConfig = None, + *, + 
redact_string: str | None = None, +) -> str: + return PreparedAnonymizer.from_config_json(config_json).diagnostics_json( + full_text, + operators, + redact_string=redact_string, + ) + + +def _operator_config_json( + operators: OperatorConfig, + *, + redact_string: str | None, +) -> str | None: + if operators is None and redact_string is None: + return None + if isinstance(operators, str): + if redact_string is not None: + raise ValueError("redact_string cannot be combined with raw JSON") + return operators + payload: dict[str, object] = {} + if operators is not None: + payload["operators"] = dict(operators) + if redact_string is not None: + payload["redactString"] = redact_string + return json.dumps(payload, separators=(",", ":")) diff --git a/crates/anonymize-py/python/stella_anonymize/__init__.pyi b/crates/anonymize-py/python/stella_anonymize/__init__.pyi index 24501056..67cd9d9e 100644 --- a/crates/anonymize-py/python/stella_anonymize/__init__.pyi +++ b/crates/anonymize-py/python/stella_anonymize/__init__.pyi @@ -1,12 +1,13 @@ from __future__ import annotations +from collections.abc import Mapping from os import PathLike from typing import TypeAlias from ._native import ( + PreparedSearch as NativePreparedSearch, OperatorEntry as OperatorEntry, PipelineEntity as PipelineEntity, - PreparedSearch as PreparedSearch, RedactionEntry as RedactionEntry, RedactionResult as RedactionResult, StaticRedactionResult as StaticRedactionResult, @@ -21,13 +22,89 @@ from ._native import ( BytesLike: TypeAlias = bytes | bytearray | memoryview PathLikeString: TypeAlias = str | PathLike[str] +OperatorConfig: TypeAlias = Mapping[str, str] | str | None + +class PreparedAnonymizer: + def __init__(self, prepared: NativePreparedSearch) -> None: ... + @classmethod + def from_config_json(cls, config_json: str) -> PreparedAnonymizer: ... + @classmethod + def from_config_json_and_artifact_bytes( + cls, + config_json: str, + artifact_bytes: BytesLike, + ) -> PreparedAnonymizer: ... 
+ @classmethod + def from_prepared_package_bytes( + cls, + package_bytes: BytesLike, + ) -> PreparedAnonymizer: ... + def prepare_diagnostics_json(self) -> str: ... + def redact_text( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> StaticRedactionResult: ... + def redact_text_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: ... + def diagnostics_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: ... + def redact_static_entities( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> StaticRedactionResult: ... + def redact_static_entities_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: ... + def redact_static_entities_diagnostics_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: ... + +PreparedSearch: TypeAlias = PreparedAnonymizer def prepare_search_package( config_json: str, *, compressed: bool = True ) -> bytes: ... -def load_prepared_package(package_bytes: BytesLike) -> PreparedSearch: ... +def load_prepared_package(package_bytes: BytesLike) -> PreparedAnonymizer: ... def load_prepared_package_file( package_path: PathLikeString, -) -> PreparedSearch: ... +) -> PreparedAnonymizer: ... +def redact_text_json( + config_json: str, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, +) -> str: ... +def diagnostics_json( + config_json: str, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, +) -> str: ... 
__all__: list[str] diff --git a/crates/anonymize-py/typecheck/sdk_usage.py b/crates/anonymize-py/typecheck/sdk_usage.py index 0fc2cfbe..7f966e7e 100644 --- a/crates/anonymize-py/typecheck/sdk_usage.py +++ b/crates/anonymize-py/typecheck/sdk_usage.py @@ -6,15 +6,24 @@ def redact_with_prepared_package(config_json: str, text: str) -> str: package_bytes = anonymize.prepare_search_package(config_json) prepared = anonymize.load_prepared_package(package_bytes) - result = prepared.redact_static_entities(text) + result = prepared.redact_text(text) return result.redaction.redacted_text def redact_with_package_file(package_path: str, text: str) -> int: prepared = anonymize.load_prepared_package_file(package_path) - result = prepared.redact_static_entities(text) + result = prepared.redact_text(text, {"country": "redact"}) return result.redaction.entity_count def runtime_version() -> str: return anonymize.native_package_version() + + +def redact_json(config_json: str, text: str) -> str: + return anonymize.redact_text_json( + config_json, + text, + {"country": "redact"}, + redact_string="***", + ) diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 4e5049e3..fdc1aa10 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -17,7 +17,11 @@ import { assertNativeBindingVersion, createNativeAnonymizerFromPackage, getNativeBindingVersion, + load_prepared_package, + native_package_version, + normalize_for_search, prepareNativeSearchPackage, + prepare_search_package, type NativeAnonymizeBinding, type NativeOperatorConfig, type NativePreparedSearchBinding, @@ -130,6 +134,11 @@ type GeneratedNativeCase = { sensitiveValues: string[]; }; +type SharedSdkParityCase = { + text: string; + operators: NativeOperatorConfig | null; +}; + type ContractFixtureCase = { name: string; text: string; @@ -466,6 +475,92 @@ 
print( ) `; +const PYTHON_SHARED_SDK_PARITY_SCRIPT = ` +import json +import os +import pathlib +import sys + +module_root = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]).parent.parent +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +package_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PACKAGE"]) +sys.path.insert(0, str(module_root)) + +import stella_anonymize as anonymize + +payload = json.loads(payload_path.read_text()) +package_bytes = package_path.read_bytes() +top_level = [ + "prepare_search_package", + "load_prepared_package", + "load_prepared_package_file", + "native_package_version", + "normalize_for_search", + "redact_text_json", + "diagnostics_json", +] +prepared_methods = [ + "redact_text", + "redact_text_json", + "diagnostics_json", + "prepare_diagnostics_json", +] +missing_top_level = [ + name for name in top_level if not callable(getattr(anonymize, name, None)) +] +if missing_top_level: + raise AssertionError(f"missing Python SDK functions: {missing_top_level}") +prepared = anonymize.load_prepared_package(package_bytes) +if prepared is not anonymize.load_prepared_package(package_bytes): + raise AssertionError("facade package cache did not reuse prepared search") +missing_prepared = [ + name for name in prepared_methods if not callable(getattr(prepared, name, None)) +] +if missing_prepared: + raise AssertionError(f"missing Python prepared methods: {missing_prepared}") +from_file = anonymize.load_prepared_package_file(package_path) +if anonymize.prepare_search_package( + payload["config_json"], + compressed=payload["compressed"], +) != package_bytes: + raise AssertionError("facade package bytes differ") + +def redact_with(instance, item): + return json.loads( + instance.redact_text_json( + item["text"], + item.get("operators"), + redact_string=item.get("redact_string"), + ) + ) + +print( + json.dumps( + { + "from_bytes": [ + redact_with(prepared, item) for item in payload["cases"] + ], + "from_file": [ + 
redact_with(from_file, item) for item in payload["cases"] + ], + "top_level": [ + json.loads( + anonymize.redact_text_json( + payload["config_json"], + item["text"], + item.get("operators"), + redact_string=item.get("redact_string"), + ) + ) + for item in payload["cases"] + ], + "normalized": anonymize.normalize_for_search(payload["normalize_text"]), + "version": anonymize.native_package_version(), + } + ) +) +`; + let loadedAdapters: { native: NativeAdapter; pythonModulePath: string; @@ -883,6 +978,84 @@ describe("native adapter parity", () => { expect(result.version).toBe(packageJsonVersion()); }); + test("shared TS and Python SDK facades match Rust core JSON", () => { + const adapters = getAdapters(); + const config = JSON.parse(CONFIG_JSON); + const packageBytes = prepare_search_package({ + binding: adapters.native, + config: CONFIG_JSON, + compressed: true, + }); + const prepared = load_prepared_package({ + binding: adapters.native, + packageBytes, + }); + const cases: SharedSdkParityCase[] = [ + { + text: + "č Reference AB1234 for Acme s.r.o. near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code.", + operators: null, + }, + { + text: + "🙂 Reference CD9876 for Acme s.r.o. 
near Fuzztovn, Turkey, " + + "Prague, matter MAT-456, code Secret Code.", + operators: { + operators: { country: "redact", "matter id": "redact" }, + redactString: "***", + }, + }, + ]; + + expect(native_package_version(adapters.native)).toBe(packageJsonVersion()); + expect( + normalize_for_search({ + binding: adapters.native, + text: "Číslo\u00a0PAS - 1234", + }), + ).toBe(adapters.native.normalizeForSearch("Číslo\u00a0PAS - 1234")); + expect([ + ...prepare_search_package({ binding: adapters.native, config }), + ]).toEqual([...packageBytes]); + + const rustCoreJson = cases.map(({ text, operators }) => + JSON.parse( + adapters.native.redactStaticEntitiesJson( + CONFIG_JSON, + text, + nativeOperatorConfigJson(operators), + ), + ), + ); + const tsSdkJson = cases.map(({ text, operators }) => + JSON.parse(prepared.redact_text_json(text, operators ?? undefined)), + ); + + expect(tsSdkJson).toEqual(rustCoreJson); + const diagnosticsJson = prepared.diagnostics_json(cases[0].text); + if (diagnosticsJson === null) { + throw new Error("missing shared SDK diagnostics"); + } + expect(diagnosticsJson).toContain('"diagnostics"'); + + const python = callPythonSharedSdkParity({ + pythonModulePath: adapters.pythonModulePath, + tempDir: adapters.tempDir, + packageBytes: Buffer.from(packageBytes), + cases, + normalizeText: "Číslo\u00a0PAS - 1234", + }); + + expect(python.from_bytes).toEqual(rustCoreJson); + expect(python.from_file).toEqual(rustCoreJson); + expect(python.top_level).toEqual(rustCoreJson); + expect(python.normalized).toBe( + adapters.native.normalizeForSearch("Číslo\u00a0PAS - 1234"), + ); + expect(python.version).toBe(packageJsonVersion()); + }); + test("native facade redacts from compressed package bytes", () => { const adapters = getAdapters(); const text = @@ -2067,6 +2240,55 @@ const callPythonPackageFacade = ({ return JSON.parse(output); }; +type PythonSharedSdkParityOptions = { + pythonModulePath: string; + tempDir: string; + packageBytes: Buffer; + cases: 
SharedSdkParityCase[]; + normalizeText: string; +}; + +const callPythonSharedSdkParity = ({ + pythonModulePath, + tempDir, + packageBytes, + cases, + normalizeText, +}: PythonSharedSdkParityOptions): { + from_bytes: StaticRedactionResult[]; + from_file: StaticRedactionResult[]; + top_level: StaticRedactionResult[]; + normalized: string; + version: string; +} => { + const payloadPath = join(tempDir, "shared-sdk-payload.json"); + const packagePath = join(tempDir, "shared-sdk-package.bin"); + writeFileSync(packagePath, packageBytes); + writeFileSync( + payloadPath, + JSON.stringify({ + cases: cases.map(({ text, operators }) => ({ + text, + operators: operators?.operators ?? null, + redact_string: operators?.redactString, + })), + compressed: true, + config_json: CONFIG_JSON, + normalize_text: normalizeText, + }), + ); + const output = runCommand( + "python3", + ["-c", PYTHON_SHARED_SDK_PARITY_SCRIPT], + { + STELLA_ANONYMIZE_PACKAGE: packagePath, + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }, + ); + return JSON.parse(output); +}; + const callPythonDiagnostics = ( pythonModulePath: string, text: string, @@ -2224,6 +2446,15 @@ const operatorConfigJson = ( return JSON.stringify({ operators }); }; +const nativeOperatorConfigJson = ( + operators: NativeOperatorConfig | null, +): string | undefined => { + if (operators === null) { + return undefined; + } + return JSON.stringify(operators); +}; + const runCommand = ( command: string, args: string[], diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index a75a076f..9b55680c 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -215,6 +215,7 @@ const fakeNativeBinding = ( : preparedSearch; return { + normalizeForSearch: (text: string) => text, nativePackageVersion: () => version, NativePreparedSearch, prepareStaticSearchPackageBytes: () => new 
Uint8Array(), diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index 69b29129..aaf8f816 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -63,6 +63,7 @@ const createCountingNativeBinding = (version: string) => { let rawPrepare = 0; let fromPackage = 0; const binding = { + normalizeForSearch: (text: string) => text, nativePackageVersion: () => version, NativePreparedSearch: { fromConfigJsonBytes: () => { diff --git a/packages/anonymize/src/index-shared.ts b/packages/anonymize/src/index-shared.ts index f12d97c6..fbe29bee 100644 --- a/packages/anonymize/src/index-shared.ts +++ b/packages/anonymize/src/index-shared.ts @@ -51,26 +51,37 @@ export type { // ── Native Adapter ─────────────────────────────── export { + PreparedSearch, PreparedNativeAnonymizer, assertNativeBindingVersion, createNativeAnonymizerFromConfig, createNativeAnonymizerFromPackage, encodeNativeSearchConfig, + encodeNativeSearchConfigInput, getNativeBindingVersion, + load_prepared_package, + native_package_version, + normalize_for_search, prepareNativeSearchPackage, + prepare_search_package, } from "./native"; export type { NativeAnonymizeBinding, NativeAnonymizerFromConfigOptions, NativeAnonymizerFromPackageOptions, NativeBindingVersionOptions, + NativeNormalizeOptions, NativeOperatorConfig, NativePipelineEntity, NativePipelineFromPackageOptions, NativePreparedSearchBinding, NativeRedactionResult, + NativeSearchPackageInput, NativeSearchPackageOptions, NativeStaticRedactionResult, + PreparedSearch as PreparedSearchInstance, + SharedNativePreparedPackageOptions, + SharedNativeSearchPackageOptions, } from "./native"; export { DEFAULT_NATIVE_PIPELINE_CONFIG } from "./native-default-config"; export { diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts index cc69e991..f7e8d541 100644 --- 
a/packages/anonymize/src/native-node.ts +++ b/packages/anonymize/src/native-node.ts @@ -6,7 +6,13 @@ import { assertNativeBindingVersion, createNativePipelineFromPackage, type NativeAnonymizeBinding, + type NativeNormalizeOptions, + type NativeSearchPackageInput, type PreparedNativePipeline, + load_prepared_package as loadPreparedPackageWithBinding, + native_package_version as nativePackageVersionWithBinding, + normalize_for_search as normalizeForSearchWithBinding, + prepare_search_package as prepareSearchPackageWithBinding, } from "./native"; export * from "./native"; @@ -29,6 +35,14 @@ export type NativePipelinePackageFileOptions = LoadNativeBindingOptions & { packagePath: string; }; +export type NativeSdkOptions = LoadNativeBindingOptions & { + binding?: NativeAnonymizeBinding; +}; + +export type NativeSdkPackageOptions = NativeSdkOptions & { + compressed?: boolean; +}; + export type DefaultNativePipelinePackageOptions = LoadNativeBindingOptions & { binding?: NativeAnonymizeBinding; packagePath?: string; @@ -90,6 +104,45 @@ export const readNativePipelinePackageFile = ( packagePath: string, ): Uint8Array => new Uint8Array(readFileSync(packagePath)); +export const native_package_version = ( + options: NativeSdkOptions = {}, +): string => nativePackageVersionWithBinding(resolveNativeSdkBinding(options)); + +export const normalize_for_search = ( + text: string, + options: NativeSdkOptions = {}, +): string => { + const args: NativeNormalizeOptions = { + binding: resolveNativeSdkBinding(options), + text, + }; + return normalizeForSearchWithBinding(args); +}; + +export const prepare_search_package = ( + config: NativeSearchPackageInput, + { compressed = true, ...options }: NativeSdkPackageOptions = {}, +): Uint8Array => + prepareSearchPackageWithBinding({ + binding: resolveNativeSdkBinding(options), + config, + compressed, + }); + +export const load_prepared_package = ( + packageBytes: Uint8Array, + options: NativeSdkOptions = {}, +) => + 
loadPreparedPackageWithBinding({ + binding: resolveNativeSdkBinding(options), + packageBytes, + }); + +export const load_prepared_package_file = ( + packagePath: string, + options: NativeSdkOptions = {}, +) => load_prepared_package(readNativePipelinePackageFile(packagePath), options); + export const readDefaultNativePipelinePackageFile = (): Uint8Array => { try { return new Uint8Array(readFileSync(DEFAULT_NATIVE_PIPELINE_PACKAGE_URL)); @@ -190,6 +243,23 @@ const createNativePipelineFromResolvedDefaultPackage = ({ }); }; +const resolveNativeSdkBinding = ({ + binding, + expectedVersion, + ...loadOptions +}: NativeSdkOptions): NativeAnonymizeBinding => { + const resolvedBinding = + binding ?? + loadNativeAnonymizeBinding({ + ...loadOptions, + ...(expectedVersion !== undefined ? { expectedVersion } : {}), + }); + if (binding && expectedVersion !== undefined) { + assertNativeBindingVersion({ binding, expectedVersion }); + } + return resolvedBinding; +}; + const defaultPipelineCacheFor = ( binding: NativeAnonymizeBinding, ): Map => { @@ -270,6 +340,9 @@ const isNativeAnonymizeBinding = ( if (typeof candidate["nativePackageVersion"] !== "function") { return false; } + if (typeof candidate["normalizeForSearch"] !== "function") { + return false; + } if (typeof candidate["prepareStaticSearchPackageBytes"] !== "function") { return false; } diff --git a/packages/anonymize/src/native.ts b/packages/anonymize/src/native.ts index d13b1894..1bd1b161 100644 --- a/packages/anonymize/src/native.ts +++ b/packages/anonymize/src/native.ts @@ -38,6 +38,26 @@ type NativeBindingStaticRedactionResult = { redaction: NativeBindingRedactionResult; }; +type CanonicalPipelineEntity = { + start: number; + end: number; + label: string; + text: string; + score: number; + source: string; + source_detail?: string | null; +}; + +type CanonicalStaticRedactionResult = { + resolved_entities: CanonicalPipelineEntity[]; + redaction: { + redacted_text: string; + redaction_map: NativeBindingRedactionEntry[]; 
+ operator_map: NativeBindingOperatorEntry[]; + entity_count: number; + }; +}; + export type NativePreparedSearchBinding = { prepareDiagnosticsJson?: () => string; redactStaticEntities: ( @@ -51,6 +71,7 @@ export type NativePreparedSearchBinding = { }; export type NativeAnonymizeBinding = { + normalizeForSearch: (text: string) => string; nativePackageVersion: () => string; NativePreparedSearch: { fromConfigJsonBytes: ( @@ -99,6 +120,27 @@ export type NativeSearchPackageOptions = { compressed?: boolean; }; +export type NativeSearchPackageInput = + | NativePreparedSearchConfig + | string + | Uint8Array; + +export type SharedNativeSearchPackageOptions = { + binding: NativeAnonymizeBinding; + config: NativeSearchPackageInput; + compressed?: boolean; +}; + +export type SharedNativePreparedPackageOptions = { + binding: NativeAnonymizeBinding; + packageBytes: Uint8Array; +}; + +export type NativeNormalizeOptions = { + binding: NativeAnonymizeBinding; + text: string; +}; + export type NativeAnonymizerFromConfigOptions = { binding: NativeAnonymizeBinding; config: NativePreparedSearchConfig; @@ -140,15 +182,39 @@ export class PreparedNativeAnonymizer { ); } + redact_text( + fullText: string, + operators?: NativeOperatorConfig, + ): NativeStaticRedactionResult { + return this.redactStaticEntities(fullText, operators); + } + + redact_text_json(fullText: string, operators?: NativeOperatorConfig): string { + return JSON.stringify( + toBindingStaticRedactionResult( + this.redactStaticEntities(fullText, operators), + ), + ); + } + redactStaticEntitiesDiagnosticsJson( fullText: string, operators?: NativeOperatorConfig, ): string | null { - const run = this.#prepared.redactStaticEntitiesDiagnosticsJson; - if (!run) { + if (!this.#prepared.redactStaticEntitiesDiagnosticsJson) { return null; } - return run(fullText, toBindingOperatorConfig(operators)); + return this.#prepared.redactStaticEntitiesDiagnosticsJson( + fullText, + toBindingOperatorConfig(operators), + ); + } + + 
diagnostics_json( + fullText: string, + operators?: NativeOperatorConfig, + ): string | null { + return this.redactStaticEntitiesDiagnosticsJson(fullText, operators); } } @@ -170,6 +236,19 @@ export class PreparedNativePipeline { return this.#anonymizer.redactStaticEntities(fullText, operators); } + redact_text( + fullText: string, + operators?: NativeOperatorConfig, + ): NativeStaticRedactionResult { + return this.redactText(fullText, operators); + } + + redact_text_json(fullText: string, operators?: NativeOperatorConfig): string { + return JSON.stringify( + toBindingStaticRedactionResult(this.redactText(fullText, operators)), + ); + } + redactTextDiagnosticsJson( fullText: string, operators?: NativeOperatorConfig, @@ -179,16 +258,42 @@ export class PreparedNativePipeline { operators, ); } + + diagnostics_json( + fullText: string, + operators?: NativeOperatorConfig, + ): string | null { + return this.redactTextDiagnosticsJson(fullText, operators); + } } export const encodeNativeSearchConfig = ( config: NativePreparedSearchConfig, ): Uint8Array => new TextEncoder().encode(JSON.stringify(config)); +export const encodeNativeSearchConfigInput = ( + config: NativeSearchPackageInput, +): Uint8Array => { + if (typeof config === "string") { + return new TextEncoder().encode(config); + } + if (config instanceof Uint8Array) { + return config; + } + return encodeNativeSearchConfig(config); +}; + export const getNativeBindingVersion = ( binding: NativeAnonymizeBinding, ): string => binding.nativePackageVersion(); +export const native_package_version = getNativeBindingVersion; + +export const normalize_for_search = ({ + binding, + text, +}: NativeNormalizeOptions): string => binding.normalizeForSearch(text); + export const assertNativeBindingVersion = ({ binding, expectedVersion, @@ -212,6 +317,17 @@ export const prepareNativeSearchPackage = ({ : binding.prepareStaticSearchPackageBytes(configBytes); }; +export const prepare_search_package = ({ + binding, + config, + compressed 
= true, +}: SharedNativeSearchPackageOptions): Uint8Array => { + const configBytes = encodeNativeSearchConfigInput(config); + return compressed + ? binding.prepareStaticSearchCompressedPackageBytes(configBytes) + : binding.prepareStaticSearchPackageBytes(configBytes); +}; + export const createNativeAnonymizerFromConfig = ({ binding, config, @@ -230,6 +346,12 @@ export const createNativeAnonymizerFromPackage = ({ binding.NativePreparedSearch.fromPreparedPackageBytes(packageBytes), ); +export const load_prepared_package = ({ + binding, + packageBytes, +}: SharedNativePreparedPackageOptions): PreparedNativeAnonymizer => + createNativeAnonymizerFromPackage({ binding, packageBytes }); + export const createNativePipelineFromPackage = ({ binding, packageBytes, @@ -238,6 +360,9 @@ export const createNativePipelineFromPackage = ({ createNativeAnonymizerFromPackage({ binding, packageBytes }), ); +export const PreparedSearch = PreparedNativeAnonymizer; +export type PreparedSearch = PreparedNativeAnonymizer; + const toBindingOperatorConfig = ( config: NativeOperatorConfig | undefined, ): NativeBindingOperatorConfig | undefined => { @@ -261,6 +386,22 @@ const toNativeStaticRedactionResult = ( redaction: toNativeRedactionResult(result.redaction), }); +const toBindingStaticRedactionResult = ( + result: NativeStaticRedactionResult, +): CanonicalStaticRedactionResult => ({ + resolved_entities: result.resolvedEntities.map(toBindingPipelineEntity), + redaction: { + redacted_text: result.redaction.redactedText, + redaction_map: [...result.redaction.redactionMap.entries()].map( + ([placeholder, original]) => ({ placeholder, original }), + ), + operator_map: [...result.redaction.operatorMap.entries()].map( + ([placeholder, operator]) => ({ placeholder, operator }), + ), + entity_count: result.redaction.entityCount, + }, +}); + const toNativePipelineEntity = ( entity: NativeBindingPipelineEntity, ): NativePipelineEntity => ({ @@ -273,6 +414,14 @@ const toNativePipelineEntity = ( 
...(entity.sourceDetail ? { sourceDetail: entity.sourceDetail } : {}), }); +const toBindingPipelineEntity = ({ + sourceDetail, + ...entity +}: NativePipelineEntity): CanonicalPipelineEntity => ({ + ...entity, + source_detail: sourceDetail ?? null, +}); + const toNativeRedactionResult = ( result: NativeBindingRedactionResult, ): NativeRedactionResult => ({ From f36a0d3e5f6b5728da64062dbae0e61cb1086eb6 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 18:42:29 +0200 Subject: [PATCH 101/130] feat: enforce native sdk parity --- crates/anonymize-napi/src/lib.rs | 20 ++++ .../scripts/native-package-ux-perf.mjs | 5 + .../__test__/native-adapter-parity.test.ts | 86 ++++++++++++++--- .../src/__test__/native-node.test.ts | 96 +++++++++++++++++-- packages/anonymize/src/index-shared.ts | 4 + packages/anonymize/src/native-node.ts | 29 ++++++ packages/anonymize/src/native.ts | 57 ++++++++++- 7 files changed, 271 insertions(+), 26 deletions(-) diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index b3083e95..386f0f3b 100644 --- a/crates/anonymize-napi/src/lib.rs +++ b/crates/anonymize-napi/src/lib.rs @@ -621,6 +621,26 @@ impl NativePreparedSearch { .and_then(to_js_static_redaction_result) } + #[napi] + #[allow(clippy::needless_pass_by_value)] + pub fn redact_static_entities_json( + &self, + full_text: String, + operators: Option, + ) -> Result { + let operators = + operator_config_from_binding(operators.map(to_binding_operator_config)) + .map_err(|error| to_napi_contract_error(&error))?; + let result = self + .inner + .redact_static_entities(&full_text, &operators) + .map_err(|error| to_napi_core_error(&error))?; + let result = static_redaction_result_to_utf16_binding(result, &full_text) + .map_err(|error| to_napi_contract_error(&error))?; + + serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) + } + #[napi] #[allow(clippy::needless_pass_by_value)] pub fn redact_static_entities_diagnostics_json( diff --git 
a/packages/anonymize/scripts/native-package-ux-perf.mjs b/packages/anonymize/scripts/native-package-ux-perf.mjs index b6de6784..3382c7be 100644 --- a/packages/anonymize/scripts/native-package-ux-perf.mjs +++ b/packages/anonymize/scripts/native-package-ux-perf.mjs @@ -42,6 +42,7 @@ function runScenario({ name, compressed }) { const load = runMigration({ ANONYMIZE_MIGRATION_NATIVE_PACKAGE_PATH: packagePath, }); + const nativeDiagnostics = load.nativeDiagnostics ?? null; return { name, @@ -55,6 +56,10 @@ function runScenario({ name, compressed }) { firstRunMs: load.timings.coldRunMs, firstTouchMs: load.timings.nativeFirstTouchMs, warmClickMs: load.timings.nativeWarmClickMs, + prepareTopStages: nativeDiagnostics?.prepare?.topStages ?? [], + cachedPrepareTopStages: nativeDiagnostics?.cachedPrepare?.topStages ?? [], + runTopStages: nativeDiagnostics?.run?.topStages ?? [], + runTopFixtures: nativeDiagnostics?.run?.topFixtures ?? [], fixtureTimings: load.fixtureTimings, topColdFixtures: load.fixtureTimings.byFixture .toSorted((left, right) => right.coldMs - left.coldMs) diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index fdc1aa10..a4d926d1 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -16,12 +16,15 @@ import fc from "fast-check"; import { assertNativeBindingVersion, createNativeAnonymizerFromPackage, + diagnostics_json, getNativeBindingVersion, load_prepared_package, native_package_version, normalize_for_search, prepareNativeSearchPackage, prepare_search_package, + PreparedSearch, + redact_text_json, type NativeAnonymizeBinding, type NativeOperatorConfig, type NativePreparedSearchBinding, @@ -139,6 +142,27 @@ type SharedSdkParityCase = { operators: NativeOperatorConfig | null; }; +const SHARED_SDK_CORE_TOP_LEVEL_FUNCTIONS = [ + "prepare_search_package", + "load_prepared_package", + 
"native_package_version", + "normalize_for_search", + "redact_text_json", + "diagnostics_json", +] as const; + +const SHARED_SDK_TOP_LEVEL_FUNCTIONS = [ + ...SHARED_SDK_CORE_TOP_LEVEL_FUNCTIONS, + "load_prepared_package_file", +] as const; + +const SHARED_SDK_PREPARED_METHODS = [ + "redact_text", + "redact_text_json", + "diagnostics_json", + "prepare_diagnostics_json", +] as const; + type ContractFixtureCase = { name: string; text: string; @@ -490,26 +514,15 @@ import stella_anonymize as anonymize payload = json.loads(payload_path.read_text()) package_bytes = package_path.read_bytes() -top_level = [ - "prepare_search_package", - "load_prepared_package", - "load_prepared_package_file", - "native_package_version", - "normalize_for_search", - "redact_text_json", - "diagnostics_json", -] -prepared_methods = [ - "redact_text", - "redact_text_json", - "diagnostics_json", - "prepare_diagnostics_json", -] +top_level = payload["top_level_functions"] +prepared_methods = payload["prepared_methods"] missing_top_level = [ name for name in top_level if not callable(getattr(anonymize, name, None)) ] if missing_top_level: raise AssertionError(f"missing Python SDK functions: {missing_top_level}") +if not callable(getattr(anonymize, "PreparedSearch", None)): + raise AssertionError("missing Python PreparedSearch facade") prepared = anonymize.load_prepared_package(package_bytes) if prepared is not anonymize.load_prepared_package(package_bytes): raise AssertionError("facade package cache did not reuse prepared search") @@ -1008,6 +1021,26 @@ describe("native adapter parity", () => { }, ]; + const tsSdkFunctions: Record< + (typeof SHARED_SDK_CORE_TOP_LEVEL_FUNCTIONS)[number], + unknown + > = { + diagnostics_json, + load_prepared_package, + native_package_version, + normalize_for_search, + prepare_search_package, + redact_text_json, + }; + for (const name of SHARED_SDK_CORE_TOP_LEVEL_FUNCTIONS) { + expect(typeof tsSdkFunctions[name]).toBe("function"); + } + expect(typeof 
PreparedSearch).toBe("function"); + const preparedApi = prepared as unknown as Record; + for (const name of SHARED_SDK_PREPARED_METHODS) { + expect(typeof preparedApi[name]).toBe("function"); + } + expect(native_package_version(adapters.native)).toBe(packageJsonVersion()); expect( normalize_for_search({ @@ -1033,11 +1066,32 @@ describe("native adapter parity", () => { ); expect(tsSdkJson).toEqual(rustCoreJson); + expect( + cases.map(({ text, operators }) => + JSON.parse( + redact_text_json({ + binding: adapters.native, + config: CONFIG_JSON, + fullText: text, + ...(operators !== null ? { operators } : {}), + }), + ), + ), + ).toEqual(rustCoreJson); const diagnosticsJson = prepared.diagnostics_json(cases[0].text); if (diagnosticsJson === null) { throw new Error("missing shared SDK diagnostics"); } expect(diagnosticsJson).toContain('"diagnostics"'); + const topLevelDiagnosticsJson = diagnostics_json({ + binding: adapters.native, + config: CONFIG_JSON, + fullText: cases[0].text, + }); + if (topLevelDiagnosticsJson === null) { + throw new Error("missing top-level shared SDK diagnostics"); + } + expect(topLevelDiagnosticsJson).toContain('"diagnostics"'); const python = callPythonSharedSdkParity({ pythonModulePath: adapters.pythonModulePath, @@ -2275,6 +2329,8 @@ const callPythonSharedSdkParity = ({ compressed: true, config_json: CONFIG_JSON, normalize_text: normalizeText, + prepared_methods: SHARED_SDK_PREPARED_METHODS, + top_level_functions: SHARED_SDK_TOP_LEVEL_FUNCTIONS, }), ); const output = runCommand( diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index 9b55680c..afeae73b 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -7,10 +7,17 @@ import type { NativeAnonymizeBinding } from "../native"; import { createNativePipelineFromDefaultPackage, createNativePipelineFromPackageFile, + diagnostics_json, 
getDefaultNativePipeline, + load_prepared_package, + load_prepared_package_file, loadNativeAnonymizeBinding, + native_package_version, + normalize_for_search, preloadDefaultNativePipeline, + prepare_search_package, readNativePipelinePackageFile, + redact_text_json, } from "../native-node"; describe("native node loader", () => { @@ -192,10 +199,84 @@ describe("native node loader", () => { rmSync(dir, { recursive: true, force: true }); } }); + + test("shared SDK helpers delegate through the native binding", () => { + const capturedBytes: number[][] = []; + const binding = fakeNativeBinding("1.5.0", { + compressedPackageBytes: Uint8Array.of(21, 22, 23), + onPreparedPackageBytes: (bytes) => { + capturedBytes.push([...bytes]); + }, + }); + + expect(native_package_version({ binding })).toBe("1.5.0"); + expect(normalize_for_search("Číslo", { binding })).toBe("Číslo"); + + const packageBytes = prepare_search_package("{}", { binding }); + expect([...packageBytes]).toEqual([21, 22, 23]); + + const prepared = load_prepared_package(packageBytes, { binding }); + expect(capturedBytes).toEqual([[21, 22, 23]]); + expect(prepared.redact_text("x").redaction.redactedText).toBe(""); + const expectedJson = { + redaction: { + entity_count: 0, + operator_map: [], + redacted_text: "", + redaction_map: [], + }, + resolved_entities: [], + }; + expect(JSON.parse(prepared.redact_text_json("x"))).toEqual(expectedJson); + expect( + JSON.parse(redact_text_json("{}", "x", undefined, { binding })), + ).toEqual(expectedJson); + expect( + JSON.parse(diagnostics_json("{}", "x", undefined, { binding }) ?? 
"{}"), + ).toEqual({ + diagnostics: { events: [] }, + result: expectedJson, + }); + + const dir = mkdtempSync(join(tmpdir(), "anonymize-shared-sdk-")); + const packagePath = join(dir, "pipeline.stlanonpkg"); + try { + writeFileSync(packagePath, packageBytes); + const fromFile = load_prepared_package_file(packagePath, { binding }); + expect(fromFile.redact_text("x").redaction.redactedText).toBe(""); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +const emptyStaticRedactionBindingResult = () => ({ + resolvedEntities: [], + redaction: { + redactedText: "", + redactionMap: [], + operatorMap: [], + entityCount: 0, + }, }); +const emptyStaticRedactionDiagnosticJson = (): string => + JSON.stringify({ + diagnostics: { events: [] }, + result: { + redaction: { + entity_count: 0, + operator_map: [], + redacted_text: "", + redaction_map: [], + }, + resolved_entities: [], + }, + }); + type FakeNativeBindingOptions = { preparedSearchAsConstructor?: boolean; + compressedPackageBytes?: Uint8Array; onPreparedPackageBytes?: (bytes: Uint8Array) => void; }; @@ -219,18 +300,13 @@ const fakeNativeBinding = ( nativePackageVersion: () => version, NativePreparedSearch, prepareStaticSearchPackageBytes: () => new Uint8Array(), - prepareStaticSearchCompressedPackageBytes: () => new Uint8Array(), + prepareStaticSearchCompressedPackageBytes: () => + options.compressedPackageBytes ?? 
new Uint8Array(), }; }; const fakePreparedSearch = () => ({ - redactStaticEntities: () => ({ - resolvedEntities: [], - redaction: { - redactedText: "", - redactionMap: [], - operatorMap: [], - entityCount: 0, - }, - }), + prepareDiagnosticsJson: () => JSON.stringify({ events: [] }), + redactStaticEntities: emptyStaticRedactionBindingResult, + redactStaticEntitiesDiagnosticsJson: emptyStaticRedactionDiagnosticJson, }); diff --git a/packages/anonymize/src/index-shared.ts b/packages/anonymize/src/index-shared.ts index fbe29bee..dd5b589e 100644 --- a/packages/anonymize/src/index-shared.ts +++ b/packages/anonymize/src/index-shared.ts @@ -59,11 +59,13 @@ export { encodeNativeSearchConfig, encodeNativeSearchConfigInput, getNativeBindingVersion, + diagnostics_json, load_prepared_package, native_package_version, normalize_for_search, prepareNativeSearchPackage, prepare_search_package, + redact_text_json, } from "./native"; export type { NativeAnonymizeBinding, @@ -81,6 +83,8 @@ export type { NativeStaticRedactionResult, PreparedSearch as PreparedSearchInstance, SharedNativePreparedPackageOptions, + SharedNativeDiagnosticsJsonOptions, + SharedNativeRedactTextJsonOptions, SharedNativeSearchPackageOptions, } from "./native"; export { DEFAULT_NATIVE_PIPELINE_CONFIG } from "./native-default-config"; diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts index f7e8d541..a8c7610f 100644 --- a/packages/anonymize/src/native-node.ts +++ b/packages/anonymize/src/native-node.ts @@ -5,14 +5,17 @@ import process from "node:process"; import { assertNativeBindingVersion, createNativePipelineFromPackage, + type NativeOperatorConfig, type NativeAnonymizeBinding, type NativeNormalizeOptions, type NativeSearchPackageInput, type PreparedNativePipeline, + diagnostics_json as diagnosticsJsonWithBinding, load_prepared_package as loadPreparedPackageWithBinding, native_package_version as nativePackageVersionWithBinding, normalize_for_search as 
normalizeForSearchWithBinding, prepare_search_package as prepareSearchPackageWithBinding, + redact_text_json as redactTextJsonWithBinding, } from "./native"; export * from "./native"; @@ -143,6 +146,32 @@ export const load_prepared_package_file = ( options: NativeSdkOptions = {}, ) => load_prepared_package(readNativePipelinePackageFile(packagePath), options); +export const redact_text_json = ( + config: NativeSearchPackageInput, + fullText: string, + operators?: NativeOperatorConfig, + options: NativeSdkOptions = {}, +): string => + redactTextJsonWithBinding({ + binding: resolveNativeSdkBinding(options), + config, + fullText, + ...(operators !== undefined ? { operators } : {}), + }); + +export const diagnostics_json = ( + config: NativeSearchPackageInput, + fullText: string, + operators?: NativeOperatorConfig, + options: NativeSdkOptions = {}, +): string | null => + diagnosticsJsonWithBinding({ + binding: resolveNativeSdkBinding(options), + config, + fullText, + ...(operators !== undefined ? 
{ operators } : {}), + }); + export const readDefaultNativePipelinePackageFile = (): Uint8Array => { try { return new Uint8Array(readFileSync(DEFAULT_NATIVE_PIPELINE_PACKAGE_URL)); diff --git a/packages/anonymize/src/native.ts b/packages/anonymize/src/native.ts index 1bd1b161..03fd93f1 100644 --- a/packages/anonymize/src/native.ts +++ b/packages/anonymize/src/native.ts @@ -64,6 +64,10 @@ export type NativePreparedSearchBinding = { fullText: string, operators?: NativeBindingOperatorConfig, ) => NativeBindingStaticRedactionResult; + redactStaticEntitiesJson?: ( + fullText: string, + operators?: NativeBindingOperatorConfig, + ) => string; redactStaticEntitiesDiagnosticsJson?: ( fullText: string, operators?: NativeBindingOperatorConfig, @@ -136,6 +140,16 @@ export type SharedNativePreparedPackageOptions = { packageBytes: Uint8Array; }; +export type SharedNativeRedactTextJsonOptions = { + binding: NativeAnonymizeBinding; + config: NativeSearchPackageInput; + fullText: string; + operators?: NativeOperatorConfig; +}; + +export type SharedNativeDiagnosticsJsonOptions = + SharedNativeRedactTextJsonOptions; + export type NativeNormalizeOptions = { binding: NativeAnonymizeBinding; text: string; @@ -170,6 +184,10 @@ export class PreparedNativeAnonymizer { return this.#prepared.prepareDiagnosticsJson?.() ?? 
null; } + prepare_diagnostics_json(): string | null { + return this.prepareDiagnosticsJson(); + } + redactStaticEntities( fullText: string, operators?: NativeOperatorConfig, @@ -190,9 +208,18 @@ export class PreparedNativeAnonymizer { } redact_text_json(fullText: string, operators?: NativeOperatorConfig): string { + const bindingOperators = toBindingOperatorConfig(operators); + if (this.#prepared.redactStaticEntitiesJson) { + return this.#prepared.redactStaticEntitiesJson( + fullText, + bindingOperators, + ); + } return JSON.stringify( toBindingStaticRedactionResult( - this.redactStaticEntities(fullText, operators), + toNativeStaticRedactionResult( + this.#prepared.redactStaticEntities(fullText, bindingOperators), + ), ), ); } @@ -229,6 +256,10 @@ export class PreparedNativePipeline { return this.#anonymizer.prepareDiagnosticsJson(); } + prepare_diagnostics_json(): string | null { + return this.prepareDiagnosticsJson(); + } + redactText( fullText: string, operators?: NativeOperatorConfig, @@ -352,6 +383,30 @@ export const load_prepared_package = ({ }: SharedNativePreparedPackageOptions): PreparedNativeAnonymizer => createNativeAnonymizerFromPackage({ binding, packageBytes }); +export const redact_text_json = ({ + binding, + config, + fullText, + operators, +}: SharedNativeRedactTextJsonOptions): string => + new PreparedNativeAnonymizer( + binding.NativePreparedSearch.fromConfigJsonBytes( + encodeNativeSearchConfigInput(config), + ), + ).redact_text_json(fullText, operators); + +export const diagnostics_json = ({ + binding, + config, + fullText, + operators, +}: SharedNativeDiagnosticsJsonOptions): string | null => + new PreparedNativeAnonymizer( + binding.NativePreparedSearch.fromConfigJsonBytes( + encodeNativeSearchConfigInput(config), + ), + ).diagnostics_json(fullText, operators); + export const createNativePipelineFromPackage = ({ binding, packageBytes, From ca283da4dd06bff0011cceae06c8ad44eb82f72a Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 
2026 19:15:49 +0200 Subject: [PATCH 102/130] fix: align native redaction parity --- crates/anonymize-adapter-contract/src/lib.rs | 3 + .../anonymize-core/data/identifier-cues.txt | 22 --- crates/anonymize-core/src/anchored.rs | 4 + crates/anonymize-core/src/coreference.rs | 16 ++- crates/anonymize-core/src/money.rs | 129 +++++++++++++++++- crates/anonymize-core/src/normalize.rs | 51 +------ crates/anonymize-core/src/prepared.rs | 27 +++- crates/anonymize-core/tests/prepared.rs | 47 +++++-- crates/anonymize-core/tests/redaction.rs | 31 +++-- .../__test__/native-adapter-parity.test.ts | 126 +++++++++++++++++ .../src/__test__/native-node.test.ts | 26 ++++ .../src/__test__/pipeline-config.test.ts | 90 +++++++++++- .../anonymize/src/build-unified-search.ts | 10 +- packages/anonymize/src/native-pipeline.ts | 7 + 14 files changed, 487 insertions(+), 102 deletions(-) delete mode 100644 crates/anonymize-core/data/identifier-cues.txt diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs index 1908a41d..feb5b2b7 100644 --- a/crates/anonymize-adapter-contract/src/lib.rs +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -396,6 +396,8 @@ pub struct BindingCoreferenceData { #[serde(default)] pub legal_form_aliases: Vec, #[serde(default)] + pub organization_suffixes: Vec, + #[serde(default)] pub organization_determiners: Vec, } @@ -1678,6 +1680,7 @@ fn coreference_data_from_binding( .collect(), role_stop_terms: data.role_stop_terms, legal_form_aliases: data.legal_form_aliases, + organization_suffixes: data.organization_suffixes, organization_determiners: data.organization_determiners, } } diff --git a/crates/anonymize-core/data/identifier-cues.txt b/crates/anonymize-core/data/identifier-cues.txt deleted file mode 100644 index 00b7ad4a..00000000 --- a/crates/anonymize-core/data/identifier-cues.txt +++ /dev/null @@ -1,22 +0,0 @@ -CIF -CNI -CNPJ -CPF -DIČ -DIC -DNI -IČO -ICO -ID -KRS -NIE -NIF -NIP -OAB -PESEL -RCS -REGON -RG 
-SIREN -SIRET -TVA diff --git a/crates/anonymize-core/src/anchored.rs b/crates/anonymize-core/src/anchored.rs index c91040f9..b249b499 100644 --- a/crates/anonymize-core/src/anchored.rs +++ b/crates/anonymize-core/src/anchored.rs @@ -98,6 +98,10 @@ impl AnchoredExtractor { } Ok(select_anchored_entities(entities)) } + + pub(crate) const fn rule(&self) -> &R { + &self.rule + } } fn anchor_span(found: &SearchMatch) -> AnchorSpan { diff --git a/crates/anonymize-core/src/coreference.rs b/crates/anonymize-core/src/coreference.rs index 5209a27e..0f4fb6d5 100644 --- a/crates/anonymize-core/src/coreference.rs +++ b/crates/anonymize-core/src/coreference.rs @@ -21,6 +21,8 @@ pub struct CoreferenceData { #[serde(default)] pub legal_form_aliases: Vec, #[serde(default)] + pub organization_suffixes: Vec, + #[serde(default)] pub organization_determiners: Vec, } @@ -60,7 +62,11 @@ impl PreparedCoreferenceData { definition_patterns.push(compile_definition_pattern(pattern)?); } - let mut legal_form_suffixes = data.legal_form_aliases.clone(); + let mut legal_form_suffixes = if data.organization_suffixes.is_empty() { + data.legal_form_aliases.clone() + } else { + data.organization_suffixes.clone() + }; legal_form_suffixes.sort_by_key(|suffix| std::cmp::Reverse(suffix.len())); Ok(Self { @@ -89,7 +95,13 @@ impl PreparedCoreferenceData { )?; if !self.definition_patterns.is_empty() { - let terms = self.extract_defined_terms(full_text, existing_entities)?; + let terms = if results.is_empty() { + self.extract_defined_terms(full_text, existing_entities)? + } else { + let mut definition_entities = existing_entities.to_vec(); + definition_entities.extend(results.iter().cloned()); + self.extract_defined_terms(full_text, &definition_entities)? 
+ }; results.extend(Self::find_alias_spans(full_text, &terms)?); } diff --git a/crates/anonymize-core/src/money.rs b/crates/anonymize-core/src/money.rs index 5278eae7..3b7ba516 100644 --- a/crates/anonymize-core/src/money.rs +++ b/crates/anonymize-core/src/money.rs @@ -3,7 +3,7 @@ use std::collections::BTreeSet; use crate::anchored::{ AnchorSpan, AnchorTerm, AnchoredExtractor, AnchoredRule, }; -use crate::resolution::{DetectionSource, PipelineEntity}; +use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; use crate::types::Result; const MONEY_LABEL: &str = "monetary amount"; @@ -75,6 +75,14 @@ impl PreparedMonetaryData { pub(crate) fn process(&self, full_text: &str) -> Result> { self.extractor.extract(full_text) } + + pub(crate) fn extend_entities( + &self, + full_text: &str, + entities: &[PipelineEntity], + ) -> Vec { + self.extractor.rule().extend_entities(full_text, entities) + } } #[derive(Clone, Copy, Debug, Eq, PartialEq)] @@ -116,10 +124,11 @@ impl MonetaryRule { let symbols = clean_terms(data.currencies.symbols) .into_iter() .collect::>(); - let local_names = clean_terms(data.currencies.local_names) + let mut local_names = clean_terms(data.currencies.local_names) .into_iter() .map(currency_name) .collect::>(); + local_names.sort_by_key(|name| std::cmp::Reverse(name.text.len())); let mut magnitudes = Vec::new(); for entry in data.amount_words.magnitude_suffixes { magnitudes.extend( @@ -239,6 +248,85 @@ impl AnchoredRule for MonetaryRule { } impl MonetaryRule { + fn extend_entities( + &self, + full_text: &str, + entities: &[PipelineEntity], + ) -> Vec { + let mut extended = Vec::with_capacity(entities.len()); + for entity in entities { + extended.push(self.extend_entity(full_text, entity)); + } + extended + } + + fn extend_entity( + &self, + full_text: &str, + entity: &PipelineEntity, + ) -> PipelineEntity { + if entity.label != MONEY_LABEL || caller_owned(entity) { + return entity.clone(); + } + + let mut next = entity.clone(); + let mut 
end = usize::try_from(next.end).unwrap_or(usize::MAX); + if !ends_with_letter(&next.text) + && let Some(currency_end) = self.trailing_currency_end(full_text, end) + { + end = currency_end; + } + end = self.extend_written_amount(full_text, end); + + let Ok(end_u32) = u32::try_from(end) else { + return next; + }; + if end_u32 == next.end { + return next; + } + + let Ok(start) = usize::try_from(next.start) else { + return next; + }; + let Some(text) = str_slice(full_text, start, end) else { + return next; + }; + next.end = end_u32; + text.clone_into(&mut next.text); + next + } + + fn trailing_currency_end(&self, text: &str, index: usize) -> Option { + let start = skip_trailing_currency_gap(text, index, 4); + + for name in &self.local_names { + let end = start.saturating_add(name.text.len()); + let Some(candidate) = str_slice(text, start, end) else { + continue; + }; + let matches = if name.case_insensitive { + candidate.to_lowercase() == name.folded + } else { + candidate == name.text + }; + if matches && right_alnum_boundary(text, end) { + return Some(end); + } + } + + for code in &self.codes { + let end = start.saturating_add(code.len()); + let Some(candidate) = str_slice(text, start, end) else { + continue; + }; + if candidate == code && right_alnum_boundary(text, end) { + return Some(end); + } + } + + None + } + fn leading_amount_span( &self, text: &str, @@ -631,6 +719,23 @@ fn is_identifier_char(ch: char) -> bool { ch == '_' || ch.is_alphanumeric() } +fn right_alnum_boundary(text: &str, index: usize) -> bool { + str_tail(text, index) + .and_then(|value| value.chars().next()) + .is_none_or(|ch| !ch.is_alphanumeric()) +} + +fn ends_with_letter(text: &str) -> bool { + text.chars().next_back().is_some_and(char::is_alphabetic) +} + +const fn caller_owned(entity: &PipelineEntity) -> bool { + matches!( + entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) +} + const fn is_number_separator(ch: char) -> bool { ch == ',' || ch == '.' 
@@ -704,6 +809,26 @@ fn skip_horizontal_ws_limit( index } +fn skip_trailing_currency_gap( + text: &str, + mut index: usize, + max_chars: usize, +) -> usize { + let mut skipped = 0usize; + while skipped < max_chars { + let Some(ch) = str_tail(text, index).and_then(|value| value.chars().next()) + else { + break; + }; + if ch == '\n' || ch == '\t' || !ch.is_whitespace() { + break; + } + index = index.saturating_add(ch.len_utf8()); + skipped = skipped.saturating_add(1); + } + index +} + fn skip_horizontal_ws_backward_limit( text: &str, mut index: usize, diff --git a/crates/anonymize-core/src/normalize.rs b/crates/anonymize-core/src/normalize.rs index 1085c6c3..f6c79c36 100644 --- a/crates/anonymize-core/src/normalize.rs +++ b/crates/anonymize-core/src/normalize.rs @@ -1,6 +1,5 @@ const PHONE_NOISE: [char; 3] = ['(', ')', '-']; const ID_SEPARATORS: [char; 3] = ['-', '/', '.']; -const IDENTIFIER_CUES: &str = include_str!("../data/identifier-cues.txt"); use crate::types::{Error, Result}; @@ -203,49 +202,7 @@ fn strip_id_separators(text: &str) -> String { } fn normalize_identifier_text(text: &str) -> String { - // Strip contextual cues before comparing identifiers. 
- if let Some(after_cue) = strip_leading_identifier_cue(text) - && let Some(identifier) = - find_compact_ascii_identifier(after_cue, true, is_generic_identifier) - { - return identifier; - } - - find_compact_ascii_identifier(text, true, is_generic_identifier) - .unwrap_or_else(|| strip_id_separators(text).to_uppercase()) -} - -fn strip_leading_identifier_cue(text: &str) -> Option<&str> { - let trimmed = text.trim_start(); - let mut cue_end = 0; - - for (index, ch) in trimmed.char_indices() { - if !ch.is_alphabetic() { - break; - } - cue_end = index.saturating_add(ch.len_utf8()); - } - - if cue_end == 0 { - return None; - } - - let cue = trimmed.get(..cue_end)?; - if !is_identifier_cue(cue) { - return None; - } - - let after_cue = trimmed.get(cue_end..)?; - after_cue - .chars() - .next() - .is_some_and(char::is_whitespace) - .then(|| after_cue.trim_start()) -} - -fn is_identifier_cue(cue: &str) -> bool { - let upper = uppercase(cue); - IDENTIFIER_CUES.lines().any(|line| line == upper) + strip_id_separators(text).to_uppercase() } fn is_identifier_label(upper: &str) -> bool { @@ -432,12 +389,6 @@ fn is_identifier_separator(ch: char, allow_whitespace: bool) -> bool { ID_SEPARATORS.contains(&ch) || (allow_whitespace && ch.is_whitespace()) } -fn is_generic_identifier(candidate: &str) -> bool { - (5..=64).contains(&candidate.len()) - && candidate.chars().any(|ch| ch.is_ascii_digit()) - && candidate.chars().all(|ch| ch.is_ascii_alphanumeric()) -} - const fn is_base58_char(ch: char) -> bool { matches!( ch, diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 1832e5b6..030423db 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -79,6 +79,7 @@ pub struct PreparedSearch { name_corpus_data: Option, date_data: Option, monetary_data: Option, + monetary_extraction: bool, } #[derive( @@ -376,6 +377,7 @@ impl PreparedSearch { let allowed_labels = config.allowed_labels.clone(); let threshold = 
config.threshold; let confidence_boost = config.confidence_boost; + let monetary_extraction = should_extract_monetary_data(&config); let regex_groups = split_regex_patterns(config.regex_patterns, &slices)?; let regex_len = regex_groups.regex.len(); let custom_regex_len = config.custom_regex_patterns.len(); @@ -467,6 +469,7 @@ impl PreparedSearch { name_corpus_data: config.name_corpus_data.map(PreparedNames::new), date_data, monetary_data, + monetary_extraction, }) } @@ -735,7 +738,9 @@ impl PreparedSearch { if let Some(data) = &self.date_data { entities.extend(data.process(full_text)?); } - if let Some(data) = &self.monetary_data { + if self.monetary_extraction + && let Some(data) = &self.monetary_data + { entities.extend(data.process(full_text)?); } @@ -917,6 +922,7 @@ impl PreparedSearch { raw_entities.extend(address_context_entities); let merge_start = Instant::now(); let merged = merge_and_dedup(&raw_entities); + let merged = self.extend_monetary_entities(full_text, &merged); if let Some(diagnostics) = &mut diagnostics { diagnostics.record_entities( DiagnosticStage::Merge, @@ -1094,6 +1100,25 @@ impl PreparedSearch { )?; Ok(filter_entities_for_labels(filtered, &self.allowed_labels)) } + + fn extend_monetary_entities( + &self, + full_text: &str, + entities: &[PipelineEntity], + ) -> Vec { + let Some(data) = &self.monetary_data else { + return entities.to_vec(); + }; + data.extend_entities(full_text, entities) + } +} + +fn should_extract_monetary_data(config: &PreparedSearchConfig) -> bool { + config.regex_patterns.is_empty() + || config + .regex_meta + .iter() + .any(|meta| meta.label == "monetary amount") } fn process_signature_entities(full_text: &str) -> TimedEntities { diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index a3b3ed0f..3468c146 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -6,13 +6,14 @@ use stella_anonymize_core::{ AddressContextData, 
AddressSeedData, AmountWordsData, CoreferenceData, CoreferencePatternData, CountryMatchData, CurrencyData, DateData, DenyListFilterData, DenyListMatchData, DetectionSource, DiagnosticEventKind, - DiagnosticStage, Error, FuzzySearchOptions, GazetteerMatchData, HotwordRule, - HotwordRuleData, LegalFormData, LiteralSearchOptions, MagnitudeSuffixData, - MonetaryData, OperatorConfig, PatternSlice, PreparedSearch, - PreparedSearchArtifacts, PreparedSearchConfig, PreparedSearchSlices, - RegexMatchMeta, RegexSearchOptions, SearchOptions, SearchPattern, - SourceDetail, TriggerData, TriggerRule, TriggerStrategy, TriggerValidation, - WrittenAmountPatternData, ZoneData, ZonePatternData, ZoneSigningClauseData, + DiagnosticStage, EntityKind, Error, FuzzySearchOptions, GazetteerMatchData, + HotwordRule, HotwordRuleData, LegalFormData, LiteralSearchOptions, + MagnitudeSuffixData, MonetaryData, OperatorConfig, PatternSlice, + PreparedSearch, PreparedSearchArtifacts, PreparedSearchConfig, + PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchOptions, + SearchPattern, SourceDetail, TriggerData, TriggerRule, TriggerStrategy, + TriggerValidation, WrittenAmountPatternData, ZoneData, ZonePatternData, + ZoneSigningClauseData, }; fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { @@ -144,6 +145,7 @@ fn coreference_data() -> CoreferenceData { }], role_stop_terms: vec![String::from("seller")], legal_form_aliases: vec![String::from("LLC")], + organization_suffixes: vec![String::from("LLC")], organization_determiners: vec![String::from( r"the\s+(?:company|corporation|firm)", )], @@ -188,11 +190,16 @@ fn legal_form_coreference_prepared_search( ..LegalFormData::default() }), coreference_data: Some(CoreferenceData { - legal_form_aliases: suffix_strings, + definition_patterns: vec![CoreferencePatternData { + pattern: String::from(r#"\((?:hereinafter|the)\s+["']([^"']+)["']\)"#), + flags: String::from("gi"), + }], + role_stop_terms: vec![String::from("seller")], + 
legal_form_aliases: suffix_strings.clone(), + organization_suffixes: suffix_strings, organization_determiners: vec![String::from( r"the\s+(?:company|corporation|firm)", )], - ..CoreferenceData::default() }), ..empty_config(PreparedSearchSlices::default()) }) @@ -595,6 +602,28 @@ fn prepared_search_extends_propagated_organization_determiners() { ); } +#[test] +fn prepared_search_uses_propagated_orgs_as_defined_term_sources() { + let prepared = legal_form_coreference_prepared_search(vec!["LLC"]); + let full_text = format!( + "Acme LLC signed. {} Acme (the \"Acme Platform\") paid. Acme Platform renewed.", + "body ".repeat(50), + ); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + let EntityKind::Coreference { source_text } = &entity.kind else { + return false; + }; + entity.source == DetectionSource::Coreference + && entity.text == "Acme Platform" + && source_text == "Acme" + })); +} + #[test] fn prepared_search_does_not_seed_coreference_from_caller_owned_entities() { let mut meta = RegexMatchMeta::new("organization", 1.0); diff --git a/crates/anonymize-core/tests/redaction.rs b/crates/anonymize-core/tests/redaction.rs index bc71d38e..3209a6bf 100644 --- a/crates/anonymize-core/tests/redaction.rs +++ b/crates/anonymize-core/tests/redaction.rs @@ -102,30 +102,39 @@ fn normalized_identifier_values_share_placeholder() { } #[test] -fn contextual_identifier_cues_share_identifier_placeholder() { +fn generic_identifier_cues_keep_distinct_placeholder_keys() { let text = concat!( "CNI: 12AB34567 was present. ", "CNI nº 12AB34567 was repeated. ", - "CNI 12AB34567 was listed." + "CNI 12AB34567 was listed. ", + "12AB34567 was bare." 
+ ); + let bare_start = byte_len( + text + .get(..text.rfind("12AB34567").unwrap_or(0)) + .unwrap_or(""), ); let entities = vec![ entity(text, "national identification number", "CNI: 12AB34567"), entity(text, "national identification number", "CNI nº 12AB34567"), entity(text, "national identification number", "CNI 12AB34567"), + Entity::detected( + bare_start, + bare_start.saturating_add(byte_len("12AB34567")), + "national identification number", + "12AB34567", + ), ]; let result = redact_text(text, &entities, &OperatorConfig::default()).unwrap(); - assert_eq!(result.redaction_map.len(), 1); - assert_eq!( - result.redaction_map[0].placeholder, - "[NATIONAL_IDENTIFICATION_NUMBER_1]" - ); + assert_eq!(result.redaction_map.len(), 4); + assert_eq!(result.redacted_text.matches('[').count(), 4); } #[test] -fn identifier_normalization_stops_before_trailing_prose() { +fn generic_identifier_normalization_keeps_trailing_prose_in_key() { let text = "Reg AB12345 expires. Reg AB12345 repeats."; let second_start = text .rfind("AB12345") @@ -153,11 +162,7 @@ fn identifier_normalization_stops_before_trailing_prose() { let result = redact_text(text, &entities, &OperatorConfig::default()).unwrap(); - assert_eq!(result.redaction_map.len(), 1); - assert_eq!( - result.redaction_map[0].placeholder, - "[REGISTRATION_NUMBER_1]" - ); + assert_eq!(result.redaction_map.len(), 2); } #[test] diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index a4d926d1..015543f6 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -1569,6 +1569,132 @@ describe("native adapter parity", () => { }); }); + test("native pipeline package keeps org propagation suffixes in TS parity", async () => { + const adapters = getAdapters(); + const fullText = "Acme Kft. signed. 
Acme paid."; + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: true, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["organization"], + workspaceId: "native-pipeline-coreference-suffix-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context: createPipelineContext(), + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect( + tsEntities.some( + (entity) => entity.source === "coreference" && entity.text === "Acme", + ), + ).toBe(false); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + + test("native pipeline package matches TS trigger monetary widening", async () => { + const adapters = getAdapters(); + const fullText = + "Smluvní pokuta je sjednána ve výši 50.000,- Kč (slovy: padesát tisíc korun českých)."; + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: true, + enableRegex: false, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: 
false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["monetary amount"], + workspaceId: "native-pipeline-trigger-money-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context: createPipelineContext(), + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + label: "monetary amount", + text: "50.000,- Kč (slovy: padesát tisíc korun českých)", + }), + ]), + ); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + test("native pipeline package matches TS zone score adjustments", async () => { const adapters = getAdapters(); const fullText = ["Parties", "Alice", "Article 1", "Body"].join("\n"); diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index afeae73b..085deab2 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -20,6 +20,16 @@ import { redact_text_json, } from "../native-node"; +const SHARED_NODE_SDK_FUNCTIONS = [ + "diagnostics_json", + "load_prepared_package", + "load_prepared_package_file", 
+ "native_package_version", + "normalize_for_search", + "prepare_search_package", + "redact_text_json", +] as const; + describe("native node loader", () => { test("loads the bundled native loader", () => { const calls: string[] = []; @@ -201,6 +211,22 @@ describe("native node loader", () => { }); test("shared SDK helpers delegate through the native binding", () => { + const sharedSdkFunctions: Record< + (typeof SHARED_NODE_SDK_FUNCTIONS)[number], + unknown + > = { + diagnostics_json, + load_prepared_package, + load_prepared_package_file, + native_package_version, + normalize_for_search, + prepare_search_package, + redact_text_json, + }; + for (const name of SHARED_NODE_SDK_FUNCTIONS) { + expect(typeof sharedSdkFunctions[name]).toBe("function"); + } + const capturedBytes: number[][] = []; const binding = fakeNativeBinding("1.5.0", { compressedPackageBytes: Uint8Array.of(21, 22, 23), diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index aaf8f816..28416f14 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -298,6 +298,15 @@ describe("pipeline config semantics", () => { expect( search.nativeStaticConfig.coreference_data?.legal_form_aliases, ).toContain("LLC"); + expect( + search.nativeStaticConfig.coreference_data?.legal_form_aliases, + ).toContain("Kft."); + expect( + search.nativeStaticConfig.coreference_data?.organization_suffixes, + ).toContain("LLC"); + expect( + search.nativeStaticConfig.coreference_data?.organization_suffixes, + ).not.toContain("Kft."); expect( search.nativeStaticConfig.coreference_data?.organization_determiners, ).toContain("the\\s+(?:company|corporation|firm)"); @@ -379,7 +388,7 @@ describe("pipeline config semantics", () => { ).toEqual([]); }); - test("native config keeps trigger currency terms separate from monetary detection", async () => { + test("native trigger config carries 
currency terms and monetary extension data", async () => { const search = await buildUnifiedSearch( { ...BASE_CONFIG, @@ -395,7 +404,7 @@ describe("pipeline config semantics", () => { search.nativeStaticConfig.trigger_data?.sentence_terminal_currency_terms .length, ).toBeGreaterThan(0); - expect(search.nativeStaticConfig.monetary_data).toBeUndefined(); + expect(search.nativeStaticConfig.monetary_data).toBeDefined(); }); test("native date data gates year words on trigger phrases", async () => { @@ -788,6 +797,83 @@ describe("pipeline config semantics", () => { expect(counts().compressedPrepare).toBe(3); }); + test("native pipeline package cache retries after failed build", async () => { + let attempts = 0; + const binding = { + normalizeForSearch: (text: string) => text, + nativePackageVersion: () => "native-cache-retry", + NativePreparedSearch: { + fromConfigJsonBytes: () => { + throw new Error( + "native package cache retry should use package bytes", + ); + }, + fromPreparedPackageBytes: () => ({ + prepareDiagnosticsJson: () => JSON.stringify({ events: [] }), + redactStaticEntities: (fullText: string) => ({ + resolvedEntities: [], + redaction: { + redactedText: fullText, + redactionMap: [], + operatorMap: [], + entityCount: 0, + }, + }), + }), + }, + prepareStaticSearchPackageBytes: () => new Uint8Array([9]), + prepareStaticSearchCompressedPackageBytes: () => { + attempts += 1; + if (attempts === 1) { + throw new Error("build failed"); + } + return new Uint8Array([attempts]); + }, + } satisfies NativeAnonymizeBinding; + const context = createPipelineContext(); + const config = { + ...BASE_CONFIG, + enableCountries: false, + labels: ["person"], + }; + + try { + await prepareNativePipelinePackage({ binding, config, context }); + throw new Error("expected first native package build to fail"); + } catch (error) { + expect(error).toBeInstanceOf(Error); + const message = error instanceof Error ? 
error.message : ""; + expect(message).toBe("build failed"); + } + + const retry = await prepareNativePipelinePackage({ + binding, + config, + context, + }); + + expect([...retry]).toEqual([2]); + expect(attempts).toBe(2); + }); + + test("native trigger configs carry monetary extension data", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableTriggerPhrases: true, + labels: ["monetary amount"], + }, + [], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.monetary_data).toBeDefined(); + expect( + search.nativeStaticConfig.monetary_data?.amount_words + .written_amount_patterns.length, + ).toBeGreaterThan(0); + }); + test("enableLegalForms flag gates legal-form detection", async () => { const withFlag = await detect("Acme s.r.o.", { enableLegalForms: true, diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index f06804e6..303cf871 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -38,6 +38,7 @@ import type { DenyListData, DenyListFilterData } from "./detectors/deny-list"; import type { PipelineContext } from "./context"; import { defaultContext } from "./context"; import { POST_NOMINALS } from "./config/titles"; +import { LEGAL_SUFFIXES } from "./config/legal-forms"; import { loadLanguageConfigs } from "./util/lang-loader"; import { @@ -249,6 +250,7 @@ export type NativeCoreferenceData = { definition_patterns: NativeCoreferencePatternData[]; role_stop_terms: string[]; legal_form_aliases: string[]; + organization_suffixes: string[]; organization_determiners: string[]; }; export type NativeNameCorpusData = { @@ -715,6 +717,9 @@ const buildUnifiedSearchSources = async ( legalFormsEnabled || config.enableTriggerPhrases || config.enableCoreference ? [...getKnownLegalSuffixes()] : []; + const nativeOrganizationSuffixes = config.enableCoreference + ? 
[...LEGAL_SUFFIXES] + : []; const nativeLegalFormData = nativeLegalFormSuffixes.length > 0 ? { @@ -810,7 +815,8 @@ const buildUnifiedSearchSources = async ( year_words_by_language: config.enableTriggerPhrases === true ? (yearWordData ?? {}) : {}, }; - const nativeMonetaryData = regexMonetaryEnabled ? monetaryData : null; + const nativeMonetaryData = + config.enableTriggerPhrases || regexMonetaryEnabled ? monetaryData : null; const nativeSentenceTerminalCurrencyTerms = sentenceTerminalCurrencyTerms(monetaryData); const nativeNameCorpusData = buildNativeNameCorpusData(config, ctx); @@ -968,6 +974,7 @@ const buildUnifiedSearchSources = async ( : { ...coreferenceData, legal_form_aliases: nativeLegalFormSuffixes, + organization_suffixes: nativeOrganizationSuffixes, }, nativeNameCorpusData, nativeSigningPatterns, @@ -1751,6 +1758,7 @@ const buildNativeCoreferenceData = async (): Promise => { definition_patterns: definitionPatterns, role_stop_terms: roleData.roles, legal_form_aliases: [], + organization_suffixes: [], organization_determiners: Object.entries(determinerData) .flatMap(([language, values]) => { if (language === "_comment" || !Array.isArray(values)) { diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts index 630b6bc1..aea10c46 100644 --- a/packages/anonymize/src/native-pipeline.ts +++ b/packages/anonymize/src/native-pipeline.ts @@ -210,6 +210,13 @@ const getCachedNativePipelinePackage = async ({ if (sharedCache.get(key) === promise) { sharedCache.delete(key); } + if ( + ctx.nativePipelinePackageKey === key && + ctx.nativePipelinePackagePromise === promise + ) { + ctx.nativePipelinePackage = null; + ctx.nativePipelinePackagePromise = null; + } throw error; } if (sharedCache.get(key) === promise) { From 2b7eb715ceee7eee60e19198596eed54d5be76db Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 19:32:46 +0200 Subject: [PATCH 103/130] test: enforce native sdk parity --- 
.../examples/native_adapter_parity.rs | 59 +++++++++++++ packages/anonymize/package.json | 1 + .../scripts/migration-fixture-perf.mjs | 47 +++++++++- .../__test__/native-adapter-parity.test.ts | 86 +++++++++++-------- .../src/__test__/native-node.test.ts | 15 +--- packages/anonymize/src/native-sdk-contract.ts | 20 +++++ turbo.json | 1 + 7 files changed, 181 insertions(+), 48 deletions(-) create mode 100644 crates/anonymize-adapter-contract/examples/native_adapter_parity.rs create mode 100644 packages/anonymize/src/native-sdk-contract.ts diff --git a/crates/anonymize-adapter-contract/examples/native_adapter_parity.rs b/crates/anonymize-adapter-contract/examples/native_adapter_parity.rs new file mode 100644 index 00000000..0959a767 --- /dev/null +++ b/crates/anonymize-adapter-contract/examples/native_adapter_parity.rs @@ -0,0 +1,59 @@ +#![allow(clippy::print_stdout)] + +use std::{env, fs, io::Write}; + +use serde::Deserialize; +use stella_anonymize_adapter_contract::{ + BindingOperatorConfig, BindingPreparedSearchConfig, + BindingStaticRedactionResult, operator_config_from_binding, + prepared_search_config_from_binding, + static_redaction_result_to_utf16_binding, +}; +use stella_anonymize_core::PreparedSearch; + +#[derive(Deserialize)] +struct Payload { + config_json: String, + cases: Vec, +} + +#[derive(Deserialize)] +struct Case { + text: String, + operators_json: Option, +} + +fn main() -> Result<(), Box> { + let payload_path = env::var("STELLA_ANONYMIZE_PARITY_PAYLOAD")?; + let payload = fs::read_to_string(payload_path)?; + let payload = serde_json::from_str::(&payload)?; + let config = + serde_json::from_str::(&payload.config_json)?; + let prepared = + PreparedSearch::new(prepared_search_config_from_binding(config)?)?; + let results = payload + .cases + .iter() + .map(|case| run_case(&prepared, case)) + .collect::, _>>()?; + + let mut stdout = std::io::stdout().lock(); + writeln!(stdout, "{}", serde_json::to_string(&results)?)?; + Ok(()) +} + +fn run_case( + 
prepared: &PreparedSearch, + case: &Case, +) -> Result> { + let operators = case + .operators_json + .as_deref() + .map(serde_json::from_str::) + .transpose()?; + let operators = operator_config_from_binding(operators)?; + let result = prepared.redact_static_entities(&case.text, &operators)?; + Ok(static_redaction_result_to_utf16_binding( + result, &case.text, + )?) +} diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index 08c508b9..e75c2202 100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -30,6 +30,7 @@ "stella-anonymize-build-native-package": "./scripts/build-native-pipeline-package.mjs" }, "files": [ + "ATTRIBUTION.md", "dist", "index.cjs", "*.node", diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 41713965..47cf30fa 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -8,6 +8,7 @@ import { readFileSync, readdirSync, rmSync, + symlinkSync, writeFileSync, } from "node:fs"; import { tmpdir } from "node:os"; @@ -180,7 +181,7 @@ async function runCoordinator() { if (baseline !== null) { const comparison = compareSnapshots(baseline, candidate); - console.log(JSON.stringify(comparison)); + console.log(JSON.stringify(comparisonForLog(comparison))); const acceptedByPolicy = ALLOW_ACCEPTED_MISMATCHES && comparison.acceptedEqual; if (!comparison.equal && !acceptedByPolicy && FAIL_ON_MISMATCH !== "0") { @@ -873,6 +874,40 @@ function compareSnapshots(baseline, candidate) { }; } +function comparisonForLog(comparison) { + return { + ...comparison, + mismatches: comparison.mismatches.map(mismatchForLog), + }; +} + +function mismatchForLog(mismatch) { + return { + ...mismatch, + firstEntityDiff: entityDiffForLog(mismatch.firstEntityDiff), + firstByteEntityDiff: entityDiffForLog(mismatch.firstByteEntityDiff), + }; +} + +function entityDiffForLog(diff) { + if 
(diff === null || diff === undefined) { + return diff ?? null; + } + return { + ...diff, + baseline: entityForLog(diff.baseline), + candidate: entityForLog(diff.candidate), + }; +} + +function entityForLog(entity) { + if (entity === null || entity === undefined) { + return entity ?? null; + } + const { text: _text, ...safeEntity } = entity; + return safeEntity; +} + function mismatchSummary(mismatches) { const byCategory = {}; let materialMismatchCount = 0; @@ -1635,9 +1670,19 @@ function materializeGitRef(ref, tempRoot) { throw new Error(`tar extraction failed: ${extract.stderr.toString()}`); } + linkWorkspaceNodeModules(outputDir); return outputDir; } +function linkWorkspaceNodeModules(outputDir) { + const source = join(ROOT_DIR, "node_modules"); + const target = join(outputDir, "node_modules"); + if (!existsSync(source) || existsSync(target)) { + return; + } + symlinkSync(source, target, "dir"); +} + function createNativeStaticRunner(nativeStaticConfig) { if (!nativeStaticConfig) { throw new Error("Native static runtime requires nativeStaticConfig"); diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 015543f6..c5642247 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -36,6 +36,11 @@ import type { PipelineConfig, RedactionResult, } from "../types"; +import { + SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS, + SHARED_NATIVE_SDK_PREPARED_METHODS, + SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS, +} from "../native-sdk-contract"; import { createPipelineContext, createNativePipelineFromPackage, @@ -142,27 +147,6 @@ type SharedSdkParityCase = { operators: NativeOperatorConfig | null; }; -const SHARED_SDK_CORE_TOP_LEVEL_FUNCTIONS = [ - "prepare_search_package", - "load_prepared_package", - "native_package_version", - "normalize_for_search", - "redact_text_json", - "diagnostics_json", -] as 
const; - -const SHARED_SDK_TOP_LEVEL_FUNCTIONS = [ - ...SHARED_SDK_CORE_TOP_LEVEL_FUNCTIONS, - "load_prepared_package_file", -] as const; - -const SHARED_SDK_PREPARED_METHODS = [ - "redact_text", - "redact_text_json", - "diagnostics_json", - "prepare_diagnostics_json", -] as const; - type ContractFixtureCase = { name: string; text: string; @@ -521,8 +505,15 @@ missing_top_level = [ ] if missing_top_level: raise AssertionError(f"missing Python SDK functions: {missing_top_level}") +missing_public_names = [ + name for name in top_level if name not in anonymize.__all__ +] +if missing_public_names: + raise AssertionError(f"missing Python SDK public names: {missing_public_names}") if not callable(getattr(anonymize, "PreparedSearch", None)): raise AssertionError("missing Python PreparedSearch facade") +if "PreparedSearch" not in anonymize.__all__: + raise AssertionError("missing Python PreparedSearch public name") prepared = anonymize.load_prepared_package(package_bytes) if prepared is not anonymize.load_prepared_package(package_bytes): raise AssertionError("facade package cache did not reuse prepared search") @@ -1022,7 +1013,7 @@ describe("native adapter parity", () => { ]; const tsSdkFunctions: Record< - (typeof SHARED_SDK_CORE_TOP_LEVEL_FUNCTIONS)[number], + (typeof SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS)[number], unknown > = { diagnostics_json, @@ -1032,12 +1023,12 @@ describe("native adapter parity", () => { prepare_search_package, redact_text_json, }; - for (const name of SHARED_SDK_CORE_TOP_LEVEL_FUNCTIONS) { + for (const name of SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS) { expect(typeof tsSdkFunctions[name]).toBe("function"); } expect(typeof PreparedSearch).toBe("function"); const preparedApi = prepared as unknown as Record; - for (const name of SHARED_SDK_PREPARED_METHODS) { + for (const name of SHARED_NATIVE_SDK_PREPARED_METHODS) { expect(typeof preparedApi[name]).toBe("function"); } @@ -1052,15 +1043,7 @@ describe("native adapter parity", () => { 
...prepare_search_package({ binding: adapters.native, config }), ]).toEqual([...packageBytes]); - const rustCoreJson = cases.map(({ text, operators }) => - JSON.parse( - adapters.native.redactStaticEntitiesJson( - CONFIG_JSON, - text, - nativeOperatorConfigJson(operators), - ), - ), - ); + const rustCoreJson = callRustCoreSharedSdkParity(adapters.tempDir, cases); const tsSdkJson = cases.map(({ text, operators }) => JSON.parse(prepared.redact_text_json(text, operators ?? undefined)), ); @@ -2420,6 +2403,39 @@ const callPythonPackageFacade = ({ return JSON.parse(output); }; +const callRustCoreSharedSdkParity = ( + tempDir: string, + cases: SharedSdkParityCase[], +): StaticRedactionResult[] => { + const payloadPath = join(tempDir, "rust-core-shared-sdk-payload.json"); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + cases: cases.map(({ text, operators }) => ({ + text, + operators_json: nativeOperatorConfigJson(operators), + })), + }), + ); + const output = runCommand( + "cargo", + [ + "run", + "-p", + "stella-anonymize-adapter-contract", + "--example", + "native_adapter_parity", + "--locked", + "--quiet", + ], + { + STELLA_ANONYMIZE_PARITY_PAYLOAD: payloadPath, + }, + ); + return JSON.parse(output); +}; + type PythonSharedSdkParityOptions = { pythonModulePath: string; tempDir: string; @@ -2455,8 +2471,8 @@ const callPythonSharedSdkParity = ({ compressed: true, config_json: CONFIG_JSON, normalize_text: normalizeText, - prepared_methods: SHARED_SDK_PREPARED_METHODS, - top_level_functions: SHARED_SDK_TOP_LEVEL_FUNCTIONS, + prepared_methods: SHARED_NATIVE_SDK_PREPARED_METHODS, + top_level_functions: SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS, }), ); const output = runCommand( diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index 085deab2..18acc99c 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ 
-19,16 +19,7 @@ import { readNativePipelinePackageFile, redact_text_json, } from "../native-node"; - -const SHARED_NODE_SDK_FUNCTIONS = [ - "diagnostics_json", - "load_prepared_package", - "load_prepared_package_file", - "native_package_version", - "normalize_for_search", - "prepare_search_package", - "redact_text_json", -] as const; +import { SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS } from "../native-sdk-contract"; describe("native node loader", () => { test("loads the bundled native loader", () => { @@ -212,7 +203,7 @@ describe("native node loader", () => { test("shared SDK helpers delegate through the native binding", () => { const sharedSdkFunctions: Record< - (typeof SHARED_NODE_SDK_FUNCTIONS)[number], + (typeof SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS)[number], unknown > = { diagnostics_json, @@ -223,7 +214,7 @@ describe("native node loader", () => { prepare_search_package, redact_text_json, }; - for (const name of SHARED_NODE_SDK_FUNCTIONS) { + for (const name of SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS) { expect(typeof sharedSdkFunctions[name]).toBe("function"); } diff --git a/packages/anonymize/src/native-sdk-contract.ts b/packages/anonymize/src/native-sdk-contract.ts new file mode 100644 index 00000000..7d6677e1 --- /dev/null +++ b/packages/anonymize/src/native-sdk-contract.ts @@ -0,0 +1,20 @@ +export const SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS = [ + "prepare_search_package", + "load_prepared_package", + "native_package_version", + "normalize_for_search", + "redact_text_json", + "diagnostics_json", +] as const; + +export const SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS = [ + ...SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS, + "load_prepared_package_file", +] as const; + +export const SHARED_NATIVE_SDK_PREPARED_METHODS = [ + "redact_text", + "redact_text_json", + "diagnostics_json", + "prepare_diagnostics_json", +] as const; diff --git a/turbo.json b/turbo.json index 880f04d0..0809af65 100644 --- a/turbo.json +++ b/turbo.json @@ -3,6 +3,7 @@ "globalDependencies": 
[".oxfmtrc.json"], "tasks": { "build": { + "dependsOn": ["^build"], "outputs": [ "dist/**", "wasm/dist/**", From b417d940d184e5ec424e027b338a068af4503fb4 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 20:48:32 +0200 Subject: [PATCH 104/130] feat: align native sdk surface --- README.md | 20 ++-- crates/anonymize-py/README.md | 45 ++++++++ crates/anonymize-py/pyproject.toml | 4 +- .../python/stella_anonymize/__init__.py | 15 +++ .../python/stella_anonymize/__init__.pyi | 7 ++ crates/anonymize-py/typecheck/sdk_usage.py | 10 ++ packages/anonymize/README.md | 99 +++++++++-------- .../__test__/native-adapter-parity.test.ts | 102 ++++++++++++++++-- .../src/__test__/native-node.test.ts | 8 ++ packages/anonymize/src/native-node.ts | 15 +++ packages/anonymize/src/native-sdk-contract.ts | 6 ++ packages/anonymize/src/native.ts | 16 +++ 12 files changed, 281 insertions(+), 66 deletions(-) create mode 100644 crates/anonymize-py/README.md diff --git a/README.md b/README.md index 6cdb1783..0f2626fb 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,22 @@

- Stella anonymize + stella anonymize

# anonymize -Monorepo for the Stella anonymization stack. +Monorepo for the stella anonymization stack. It contains the runtime package, the published data package, and the browser/WASM entrypoint used by downstream apps. ## Packages -| Package | Purpose | -| ---------------------- | -------------------------------------------------------------- | -| `@stll/anonymize` | Native runtime for multi-layer PII detection and anonymization | -| `@stll/anonymize-data` | Published deny-list dictionaries and trigger/config data | -| `@stll/anonymize-wasm` | Browser/WASM build of the runtime | -| `@stll/anonymize-cli` | Command-line anonymization (`anonymize` binary) | +| Package | Purpose | +| ----------------------- | -------------------------------------------------------------- | +| `@stll/anonymize` | Native runtime for multi-layer PII detection and anonymization | +| `@stll/anonymize-data` | Published deny-list dictionaries and trigger/config data | +| `@stll/anonymize-wasm` | Browser/WASM build of the runtime | +| `@stll/anonymize-cli` | Command-line anonymization (`anonymize` binary) | +| `stella-anonymize-core` | Python bindings for the Rust anonymization core | ## Install @@ -25,6 +26,8 @@ bun add @stll/anonymize bun add @stll/anonymize-data # Browser / Vite usage bun add @stll/anonymize-wasm +# Python SDK +uv add stella-anonymize-core ``` Or anonymize from the terminal without installing: @@ -70,3 +73,4 @@ bun run hooks:install - [`packages/anonymize`](packages/anonymize) - [`packages/data`](packages/data) - [`packages/anonymize/wasm`](packages/anonymize/wasm) +- [`crates/anonymize-py`](crates/anonymize-py) diff --git a/crates/anonymize-py/README.md b/crates/anonymize-py/README.md new file mode 100644 index 00000000..9bb9ffd6 --- /dev/null +++ b/crates/anonymize-py/README.md @@ -0,0 +1,45 @@ +# stella-anonymize-core + +Python bindings for the Stella anonymization Rust core. 
+ +## Install + +```bash +uv add stella-anonymize-core +``` + +## Usage + +Prepare or load the anonymizer once, then reuse it for documents. + +```py +import stella_anonymize as anonymize + +package_bytes = anonymize.prepare_search_package(config_json) +prepared = anonymize.load_prepared_package(package_bytes) +result = prepared.redact_text(text, redact_string="***") + +print(result.redaction.redacted_text) +``` + +For prepared package files: + +```py +import stella_anonymize as anonymize + +prepared = anonymize.load_prepared_package_file("anonymize.stlanonpkg") +result_json = prepared.redact_text_json(text) +``` + +Top-level `redact_text()` and `redact_text_json()` are available for one-off calls, but they prepare from config on each invocation. Use `load_prepared_package()` or `load_prepared_package_file()` for repeated document processing. + +## API + +- `prepare_search_package(config_json, compressed=True) -> bytes` +- `load_prepared_package(package_bytes) -> PreparedAnonymizer` +- `load_prepared_package_file(package_path) -> PreparedAnonymizer` +- `PreparedAnonymizer.redact_text(text, operators=None, redact_string=None)` +- `PreparedAnonymizer.redact_text_json(text, operators=None, redact_string=None)` +- `PreparedAnonymizer.diagnostics_json(text, operators=None, redact_string=None)` + +`PreparedSearch` is an alias for `PreparedAnonymizer`. 
diff --git a/crates/anonymize-py/pyproject.toml b/crates/anonymize-py/pyproject.toml index 9357fcb6..9ac6bc2f 100644 --- a/crates/anonymize-py/pyproject.toml +++ b/crates/anonymize-py/pyproject.toml @@ -5,8 +5,8 @@ build-backend = "maturin" [project] name = "stella-anonymize-core" dynamic = ["version"] -description = "Python bindings for Stella anonymization core" -readme = "../../README.md" +description = "Python bindings for stella anonymization core" +readme = "README.md" requires-python = ">=3.11" license = "MIT" classifiers = [ diff --git a/crates/anonymize-py/python/stella_anonymize/__init__.py b/crates/anonymize-py/python/stella_anonymize/__init__.py index 5e99429a..c8d8e2ed 100644 --- a/crates/anonymize-py/python/stella_anonymize/__init__.py +++ b/crates/anonymize-py/python/stella_anonymize/__init__.py @@ -40,6 +40,7 @@ "prepare_static_search_artifacts_bytes", "prepare_static_search_compressed_package_bytes", "prepare_static_search_package_bytes", + "redact_text", "redact_text_json", "redact_static_entities_diagnostics_json", "redact_static_entities_json", @@ -182,6 +183,20 @@ def _load_prepared_package(package_bytes: bytes) -> PreparedAnonymizer: return PreparedAnonymizer.from_prepared_package_bytes(package_bytes) +def redact_text( + config_json: str, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, +) -> StaticRedactionResult: + return PreparedAnonymizer.from_config_json(config_json).redact_text( + full_text, + operators, + redact_string=redact_string, + ) + + def redact_text_json( config_json: str, full_text: str, diff --git a/crates/anonymize-py/python/stella_anonymize/__init__.pyi b/crates/anonymize-py/python/stella_anonymize/__init__.pyi index 67cd9d9e..b0efc661 100644 --- a/crates/anonymize-py/python/stella_anonymize/__init__.pyi +++ b/crates/anonymize-py/python/stella_anonymize/__init__.pyi @@ -92,6 +92,13 @@ def load_prepared_package(package_bytes: BytesLike) -> PreparedAnonymizer: ... 
def load_prepared_package_file( package_path: PathLikeString, ) -> PreparedAnonymizer: ... +def redact_text( + config_json: str, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, +) -> StaticRedactionResult: ... def redact_text_json( config_json: str, full_text: str, diff --git a/crates/anonymize-py/typecheck/sdk_usage.py b/crates/anonymize-py/typecheck/sdk_usage.py index 7f966e7e..7fbf05f5 100644 --- a/crates/anonymize-py/typecheck/sdk_usage.py +++ b/crates/anonymize-py/typecheck/sdk_usage.py @@ -27,3 +27,13 @@ def redact_json(config_json: str, text: str) -> str: {"country": "redact"}, redact_string="***", ) + + +def redact_object(config_json: str, text: str) -> int: + result = anonymize.redact_text( + config_json, + text, + {"country": "redact"}, + redact_string="***", + ) + return result.redaction.entity_count diff --git a/packages/anonymize/README.md b/packages/anonymize/README.md index 3651db4b..ecd7fe50 100644 --- a/packages/anonymize/README.md +++ b/packages/anonymize/README.md @@ -1,5 +1,5 @@

- Stella anonymize + stella anonymize

# @stll/anonymize @@ -18,39 +18,51 @@ bun add @stll/anonymize-data For browser targets, install `@stll/anonymize-wasm` instead. It exposes the same runtime API through WebAssembly and is the supported entrypoint for Vite-based bundles. -## Usage +## Usage: Node.js native SDK ```ts -import { runPipeline } from "@stll/anonymize"; +import { getDefaultNativePipeline } from "@stll/anonymize/native-node"; -const entities = await runPipeline({ - fullText: text, - config: { - labels: [ - "person", - "organization", - "address", - "date", - "iban", - "phone number", - ], - threshold: 0.5, - enableRegex: true, - enableTriggerPhrases: true, - enableLegalForms: true, - enableNameCorpus: true, - enableDenyList: false, - enableGazetteer: false, - enableNer: false, - enableConfidenceBoost: true, - enableCoreference: true, - workspaceId: "default", - }, - gazetteerEntries: [], -}); +const anonymizer = getDefaultNativePipeline(); +const result = anonymizer.redact_text(text); + +console.log(result.redaction.redactedText); +``` + +Call `getDefaultNativePipeline()` once during service startup and reuse the returned anonymizer. The package ships with a prepared native package, so the normal request path avoids rebuilding search automata. + +For build-time generated packages or caller-owned data, prepare the package before runtime and load the bytes in the process that handles documents. + +```bash +bunx stella-anonymize-build-native-package \ + --config ./anonymize-native-config.mjs \ + --out ./dist/anonymize.stlanonpkg +``` + +```ts +import { load_prepared_package_file } from "@stll/anonymize/native-node"; + +const anonymizer = load_prepared_package_file("./dist/anonymize.stlanonpkg"); +const result = anonymizer.redact_text(text, { redactString: "***" }); ``` -## Caller-owned deny lists and regexes +The config module may export a `PipelineConfig` directly or `{ config, gazetteerEntries }`. 
Include `@stll/anonymize-data` dictionaries there if your runtime config uses the deny-list or name-corpus layers; keep the corresponding layers enabled for caller-owned `customDenyList`, `customRegexes`, and gazetteers. Those inputs are part of the prepared package and should be regenerated when they change. + +## Python SDK + +```py +import stella_anonymize as anonymize + +package_bytes = anonymize.prepare_search_package(config_json) +prepared = anonymize.load_prepared_package(package_bytes) +result = prepared.redact_text(text, redact_string="***") + +print(result.redaction.redacted_text) +``` + +The Python SDK uses the same Rust core and prepared-package contract as the Node SDK. Prefer `load_prepared_package()` or `load_prepared_package_file()` for repeated calls; top-level `redact_text()` and `redact_text_json()` prepare from config on each call. + +## Caller-Owned Deny Lists and Regexes Use `customDenyList` for exact terms and variants that you control. These are matched by the deny-list layer, so keep `enableDenyList: true`. @@ -92,33 +104,20 @@ const entities = await runPipeline({ }); ``` -## Native prepared packages - -For Node.js deployments that need low click-time latency, prepare the native pipeline package during your build and load the bytes at runtime. +## TypeScript Pipeline Compatibility -```bash -bunx stella-anonymize-build-native-package \ - --config ./anonymize-native-config.mjs \ - --out ./dist/anonymize.stlanonpkg -``` +The async TypeScript pipeline remains available for compatibility and for browser/WASM builds. 
```ts -import { - createNativePipelineFromPackageFile, - loadNativeAnonymizeBinding, -} from "@stll/anonymize/native-node"; - -const native = loadNativeAnonymizeBinding(); -const pipeline = createNativePipelineFromPackageFile({ - binding: native, - packagePath: "./dist/anonymize.stlanonpkg", -}); +import { runPipeline } from "@stll/anonymize"; -const result = pipeline.redactText(text); +const entities = await runPipeline({ + fullText: text, + config, + gazetteerEntries: [], +}); ``` -The config module may export a `PipelineConfig` directly or `{ config, gazetteerEntries }`. Include `@stll/anonymize-data` dictionaries there if your runtime config uses the deny-list or name-corpus layers; keep the corresponding layers enabled for caller-owned `customDenyList`, `customRegexes`, and gazetteers. Those inputs are part of the prepared package and should be regenerated when they change. - ## Browser setup If you use Vite with the WASM build, exclude the bundle from dependency pre-bundling: diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index c5642247..e12fbebe 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -23,7 +23,9 @@ import { normalize_for_search, prepareNativeSearchPackage, prepare_search_package, + PreparedAnonymizer, PreparedSearch, + redact_text, redact_text_json, type NativeAnonymizeBinding, type NativeOperatorConfig, @@ -37,6 +39,7 @@ import type { RedactionResult, } from "../types"; import { + SHARED_NATIVE_SDK_CLASS_NAMES, SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS, SHARED_NATIVE_SDK_PREPARED_METHODS, SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS, @@ -112,6 +115,13 @@ type StaticRedactionResult = { }; }; +type OffsetFreeStaticRedactionResult = { + resolved_entities: Array< + Omit + >; + redaction: StaticRedactionResult["redaction"]; +}; + type StaticRedactionDiagnosticResult = { 
result: StaticRedactionResult; diagnostics: { @@ -500,20 +510,22 @@ payload = json.loads(payload_path.read_text()) package_bytes = package_path.read_bytes() top_level = payload["top_level_functions"] prepared_methods = payload["prepared_methods"] +class_names = payload["class_names"] missing_top_level = [ name for name in top_level if not callable(getattr(anonymize, name, None)) ] if missing_top_level: raise AssertionError(f"missing Python SDK functions: {missing_top_level}") missing_public_names = [ - name for name in top_level if name not in anonymize.__all__ + name for name in [*top_level, *class_names] if name not in anonymize.__all__ ] if missing_public_names: raise AssertionError(f"missing Python SDK public names: {missing_public_names}") -if not callable(getattr(anonymize, "PreparedSearch", None)): - raise AssertionError("missing Python PreparedSearch facade") -if "PreparedSearch" not in anonymize.__all__: - raise AssertionError("missing Python PreparedSearch public name") +missing_classes = [ + name for name in class_names if not callable(getattr(anonymize, name, None)) +] +if missing_classes: + raise AssertionError(f"missing Python SDK classes: {missing_classes}") prepared = anonymize.load_prepared_package(package_bytes) if prepared is not anonymize.load_prepared_package(package_bytes): raise AssertionError("facade package cache did not reuse prepared search") @@ -538,6 +550,44 @@ def redact_with(instance, item): ) ) +def redact_object_with_top_level(item): + result = anonymize.redact_text( + payload["config_json"], + item["text"], + item.get("operators"), + redact_string=item.get("redact_string"), + ) + return { + "resolved_entities": [ + { + "label": entity.label, + "text": entity.text, + "score": entity.score, + "source": entity.source, + "source_detail": entity.source_detail, + } + for entity in result.resolved_entities + ], + "redaction": { + "redacted_text": result.redaction.redacted_text, + "redaction_map": [ + { + "placeholder": entry.placeholder, 
+ "original": entry.original, + } + for entry in result.redaction.redaction_map + ], + "operator_map": [ + { + "placeholder": entry.placeholder, + "operator": entry.operator, + } + for entry in result.redaction.operator_map + ], + "entity_count": result.redaction.entity_count, + }, + } + print( json.dumps( { @@ -558,6 +608,9 @@ print( ) for item in payload["cases"] ], + "top_level_object": [ + redact_object_with_top_level(item) for item in payload["cases"] + ], "normalized": anonymize.normalize_for_search(payload["normalize_text"]), "version": anonymize.native_package_version(), } @@ -1021,12 +1074,22 @@ describe("native adapter parity", () => { native_package_version, normalize_for_search, prepare_search_package, + redact_text, redact_text_json, }; for (const name of SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS) { expect(typeof tsSdkFunctions[name]).toBe("function"); } - expect(typeof PreparedSearch).toBe("function"); + const tsSdkClasses: Record< + (typeof SHARED_NATIVE_SDK_CLASS_NAMES)[number], + unknown + > = { + PreparedAnonymizer, + PreparedSearch, + }; + for (const name of SHARED_NATIVE_SDK_CLASS_NAMES) { + expect(typeof tsSdkClasses[name]).toBe("function"); + } const preparedApi = prepared as unknown as Record; for (const name of SHARED_NATIVE_SDK_PREPARED_METHODS) { expect(typeof preparedApi[name]).toBe("function"); @@ -1049,6 +1112,18 @@ describe("native adapter parity", () => { ); expect(tsSdkJson).toEqual(rustCoreJson); + expect( + cases.map(({ text, operators }) => + toBindingStaticResult( + redact_text({ + binding: adapters.native, + config: CONFIG_JSON, + fullText: text, + ...(operators !== null ? 
{ operators } : {}), + }), + ), + ), + ).toEqual(rustCoreJson); expect( cases.map(({ text, operators }) => JSON.parse( @@ -1087,6 +1162,9 @@ describe("native adapter parity", () => { expect(python.from_bytes).toEqual(rustCoreJson); expect(python.from_file).toEqual(rustCoreJson); expect(python.top_level).toEqual(rustCoreJson); + expect(python.top_level_object).toEqual( + rustCoreJson.map(withoutEntityOffsets), + ); expect(python.normalized).toBe( adapters.native.normalizeForSearch("Číslo\u00a0PAS - 1234"), ); @@ -2454,6 +2532,7 @@ const callPythonSharedSdkParity = ({ from_bytes: StaticRedactionResult[]; from_file: StaticRedactionResult[]; top_level: StaticRedactionResult[]; + top_level_object: OffsetFreeStaticRedactionResult[]; normalized: string; version: string; } => { @@ -2468,6 +2547,7 @@ const callPythonSharedSdkParity = ({ operators: operators?.operators ?? null, redact_string: operators?.redactString, })), + class_names: SHARED_NATIVE_SDK_CLASS_NAMES, compressed: true, config_json: CONFIG_JSON, normalize_text: normalizeText, @@ -2604,6 +2684,16 @@ const toBindingStaticResult = ( }, }); +const withoutEntityOffsets = ({ + resolved_entities, + redaction, +}: StaticRedactionResult): OffsetFreeStaticRedactionResult => ({ + resolved_entities: resolved_entities.map( + ({ start: _start, end: _end, ...entity }) => entity, + ), + redaction, +}); + const toBindingPipelineEntity = ({ sourceDetail, ...entity diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index 18acc99c..455ac13b 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -17,6 +17,7 @@ import { preloadDefaultNativePipeline, prepare_search_package, readNativePipelinePackageFile, + redact_text, redact_text_json, } from "../native-node"; import { SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS } from "../native-sdk-contract"; @@ -212,6 +213,7 @@ describe("native node loader", () => 
{ native_package_version, normalize_for_search, prepare_search_package, + redact_text, redact_text_json, }; for (const name of SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS) { @@ -235,6 +237,12 @@ describe("native node loader", () => { const prepared = load_prepared_package(packageBytes, { binding }); expect(capturedBytes).toEqual([[21, 22, 23]]); expect(prepared.redact_text("x").redaction.redactedText).toBe(""); + expect(redact_text("{}", "x", undefined, { binding }).redaction).toEqual({ + entityCount: 0, + operatorMap: new Map(), + redactedText: "", + redactionMap: new Map(), + }); const expectedJson = { redaction: { entity_count: 0, diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts index a8c7610f..4f07507c 100644 --- a/packages/anonymize/src/native-node.ts +++ b/packages/anonymize/src/native-node.ts @@ -10,11 +10,13 @@ import { type NativeNormalizeOptions, type NativeSearchPackageInput, type PreparedNativePipeline, + type NativeStaticRedactionResult, diagnostics_json as diagnosticsJsonWithBinding, load_prepared_package as loadPreparedPackageWithBinding, native_package_version as nativePackageVersionWithBinding, normalize_for_search as normalizeForSearchWithBinding, prepare_search_package as prepareSearchPackageWithBinding, + redact_text as redactTextWithBinding, redact_text_json as redactTextJsonWithBinding, } from "./native"; @@ -146,6 +148,19 @@ export const load_prepared_package_file = ( options: NativeSdkOptions = {}, ) => load_prepared_package(readNativePipelinePackageFile(packagePath), options); +export const redact_text = ( + config: NativeSearchPackageInput, + fullText: string, + operators?: NativeOperatorConfig, + options: NativeSdkOptions = {}, +): NativeStaticRedactionResult => + redactTextWithBinding({ + binding: resolveNativeSdkBinding(options), + config, + fullText, + ...(operators !== undefined ? 
{ operators } : {}), + }); + export const redact_text_json = ( config: NativeSearchPackageInput, fullText: string, diff --git a/packages/anonymize/src/native-sdk-contract.ts b/packages/anonymize/src/native-sdk-contract.ts index 7d6677e1..35dd7e71 100644 --- a/packages/anonymize/src/native-sdk-contract.ts +++ b/packages/anonymize/src/native-sdk-contract.ts @@ -3,6 +3,7 @@ export const SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS = [ "load_prepared_package", "native_package_version", "normalize_for_search", + "redact_text", "redact_text_json", "diagnostics_json", ] as const; @@ -18,3 +19,8 @@ export const SHARED_NATIVE_SDK_PREPARED_METHODS = [ "diagnostics_json", "prepare_diagnostics_json", ] as const; + +export const SHARED_NATIVE_SDK_CLASS_NAMES = [ + "PreparedAnonymizer", + "PreparedSearch", +] as const; diff --git a/packages/anonymize/src/native.ts b/packages/anonymize/src/native.ts index 03fd93f1..a11f6b82 100644 --- a/packages/anonymize/src/native.ts +++ b/packages/anonymize/src/native.ts @@ -147,6 +147,8 @@ export type SharedNativeRedactTextJsonOptions = { operators?: NativeOperatorConfig; }; +export type SharedNativeRedactTextOptions = SharedNativeRedactTextJsonOptions; + export type SharedNativeDiagnosticsJsonOptions = SharedNativeRedactTextJsonOptions; @@ -395,6 +397,18 @@ export const redact_text_json = ({ ), ).redact_text_json(fullText, operators); +export const redact_text = ({ + binding, + config, + fullText, + operators, +}: SharedNativeRedactTextOptions): NativeStaticRedactionResult => + new PreparedNativeAnonymizer( + binding.NativePreparedSearch.fromConfigJsonBytes( + encodeNativeSearchConfigInput(config), + ), + ).redact_text(fullText, operators); + export const diagnostics_json = ({ binding, config, @@ -417,6 +431,8 @@ export const createNativePipelineFromPackage = ({ export const PreparedSearch = PreparedNativeAnonymizer; export type PreparedSearch = PreparedNativeAnonymizer; +export const PreparedAnonymizer = PreparedNativeAnonymizer; +export type 
PreparedAnonymizer = PreparedNativeAnonymizer; const toBindingOperatorConfig = ( config: NativeOperatorConfig | undefined, From 5ffdd07d88930f9de0e4b07d9dc531348e71bedd Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 20:48:41 +0200 Subject: [PATCH 105/130] chore: enforce brand casing --- .github/tools/check-brand-case.mjs | 75 ++++++++++++++++++++ .gitleaks.toml | 2 +- bunfig.toml | 2 +- crates/anonymize-adapter-contract/Cargo.toml | 2 +- crates/anonymize-napi/Cargo.toml | 2 +- crates/anonymize-py/Cargo.toml | 2 +- package.json | 3 +- packages/anonymize/wasm/LICENSE | 3 +- packages/data/README.md | 2 +- 9 files changed, 84 insertions(+), 9 deletions(-) create mode 100644 .github/tools/check-brand-case.mjs diff --git a/.github/tools/check-brand-case.mjs b/.github/tools/check-brand-case.mjs new file mode 100644 index 00000000..1072d17c --- /dev/null +++ b/.github/tools/check-brand-case.mjs @@ -0,0 +1,75 @@ +import { execFileSync } from "node:child_process"; +import { readFileSync } from "node:fs"; +import process from "node:process"; + +const DISALLOWED = ["S", "tella"].join(""); +const EXPECTED = DISALLOWED.toLowerCase(); +const IGNORED_PATHS = new Set(["AGENTS.md", "CLAUDE.md", "GEMINI.md"]); +const IGNORED_PREFIXES = [ + ".ai/", + ".agents/", + ".claude/", + ".github/assets/", + "packages/data/dictionaries/", + "packages/*/dist/", + "packages/anonymize/wasm/dist/", + "target/", +]; + +const trackedFiles = execFileSync("git", ["ls-files", "-z"], { + encoding: "utf8", +}) + .split("\0") + .filter(Boolean) + .filter((file) => !isIgnored(file)); + +let hasFailure = false; + +for (const file of trackedFiles) { + const content = readFileSync(file); + if (content.includes(0)) { + continue; + } + + const text = content.toString("utf8"); + let index = text.indexOf(DISALLOWED); + while (index !== -1) { + const { line, column } = lineColumnFor(text, index); + console.error( + `${file}:${line}:${column} uses disallowed brand casing; use "${EXPECTED}"`, + ); + 
hasFailure = true; + index = text.indexOf(DISALLOWED, index + DISALLOWED.length); + } +} + +if (hasFailure) { + process.exit(1); +} + +function isIgnored(file) { + if (IGNORED_PATHS.has(file)) { + return true; + } + return IGNORED_PREFIXES.some((pattern) => { + if (!pattern.includes("*")) { + return file.startsWith(pattern); + } + const [prefix, suffix] = pattern.split("*"); + return file.startsWith(prefix) && file.includes(suffix); + }); +} + +function lineColumnFor(text, index) { + let line = 1; + let column = 1; + for (let i = 0; i < index; i += 1) { + if (text.charCodeAt(i) === 10) { + line += 1; + column = 1; + continue; + } + column += 1; + } + return { line, column }; +} diff --git a/.gitleaks.toml b/.gitleaks.toml index bfb512b3..4e6c099c 100644 --- a/.gitleaks.toml +++ b/.gitleaks.toml @@ -1,4 +1,4 @@ -title = "Stella Gitleaks Configuration" +title = "stella Gitleaks Configuration" [extend] useDefault = true diff --git a/bunfig.toml b/bunfig.toml index d9275b36..85c08c2d 100644 --- a/bunfig.toml +++ b/bunfig.toml @@ -3,7 +3,7 @@ linker = "hoisted" # 5-day quarantine: reject package versions published # less than 5 days ago to mitigate supply chain attacks. minimumReleaseAge = 432_000 -# First-party Stella packages ship in coordinated release +# First-party stella packages ship in coordinated release # waves, so allow fresh internal versions immediately. 
minimumReleaseAgeExcludes = [ "@stll/typescript-config", diff --git a/crates/anonymize-adapter-contract/Cargo.toml b/crates/anonymize-adapter-contract/Cargo.toml index 804516be..9a1728e8 100644 --- a/crates/anonymize-adapter-contract/Cargo.toml +++ b/crates/anonymize-adapter-contract/Cargo.toml @@ -2,7 +2,7 @@ name = "stella-anonymize-adapter-contract" version.workspace = true edition.workspace = true -description = "Shared adapter contract for Stella anonymization bindings" +description = "Shared adapter contract for stella anonymization bindings" license.workspace = true publish.workspace = true repository.workspace = true diff --git a/crates/anonymize-napi/Cargo.toml b/crates/anonymize-napi/Cargo.toml index 27d5cd8c..0e757372 100644 --- a/crates/anonymize-napi/Cargo.toml +++ b/crates/anonymize-napi/Cargo.toml @@ -2,7 +2,7 @@ name = "stella-anonymize-napi" version.workspace = true edition.workspace = true -description = "Native bindings for Stella anonymization core" +description = "Native bindings for stella anonymization core" license.workspace = true publish.workspace = true repository.workspace = true diff --git a/crates/anonymize-py/Cargo.toml b/crates/anonymize-py/Cargo.toml index ece0b065..8ad28284 100644 --- a/crates/anonymize-py/Cargo.toml +++ b/crates/anonymize-py/Cargo.toml @@ -2,7 +2,7 @@ name = "stella-anonymize-py" version.workspace = true edition.workspace = true -description = "Python bindings for Stella anonymization core" +description = "Python bindings for stella anonymization core" license.workspace = true publish.workspace = true repository.workspace = true diff --git a/package.json b/package.json index dd99f7a7..5c15a7d6 100644 --- a/package.json +++ b/package.json @@ -11,7 +11,7 @@ "scripts": { "build": "turbo run build", "typecheck": "turbo run typecheck", - "lint": "turbo run lint && bun run lint:oxlint", + "lint": "turbo run lint && bun run lint:oxlint && bun run check:brand-case", "lint:oxlint": "bun --bun oxlint -c oxlint.config.ts 
--deny-warnings --type-aware .", "lint:fix": "bun --bun oxlint -c oxlint.config.ts --type-aware --fix .", "format": "turbo run format && oxfmt . \"!packages/**\" \"!.ai/**\" \"!.agents/**\" \"!.claude/**\" \"!AGENTS.md\" \"!CLAUDE.md\" \"!GEMINI.md\"", @@ -25,6 +25,7 @@ "sync:version": "node .github/tools/sync-runtime-version.mjs", "check:version": "node .github/tools/sync-runtime-version.mjs --check", "check:bun": "node .github/tools/check-bun-workflows.mjs", + "check:brand-case": "node .github/tools/check-brand-case.mjs", "secrets:check:staged": "bash scripts/check-staged-secrets.sh", "hooks:install": "lefthook install", "hooks:uninstall": "lefthook uninstall", diff --git a/packages/anonymize/wasm/LICENSE b/packages/anonymize/wasm/LICENSE index 503b2e8a..9dac24d2 100644 --- a/packages/anonymize/wasm/LICENSE +++ b/packages/anonymize/wasm/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2026 Stella +Copyright (c) 2026 stella Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated @@ -25,4 +25,3 @@ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - diff --git a/packages/data/README.md b/packages/data/README.md index 6a7da55d..6a6ecc75 100644 --- a/packages/data/README.md +++ b/packages/data/README.md @@ -1,5 +1,5 @@

- Stella anonymize + stella anonymize

# @stll/anonymize-data From f085e7ae71490c7fbd190fb9dca8ada611fff583 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 21:10:13 +0200 Subject: [PATCH 106/130] chore: report native sdk perf --- crates/anonymize-py/README.md | 2 +- .../anonymize/scripts/native-adapter-perf.mjs | 196 +++++++++++++++++- 2 files changed, 195 insertions(+), 3 deletions(-) diff --git a/crates/anonymize-py/README.md b/crates/anonymize-py/README.md index 9bb9ffd6..b327bd45 100644 --- a/crates/anonymize-py/README.md +++ b/crates/anonymize-py/README.md @@ -1,6 +1,6 @@ # stella-anonymize-core -Python bindings for the Stella anonymization Rust core. +Python bindings for the stella anonymization Rust core. ## Install diff --git a/packages/anonymize/scripts/native-adapter-perf.mjs b/packages/anonymize/scripts/native-adapter-perf.mjs index d5b10fe1..8317814b 100644 --- a/packages/anonymize/scripts/native-adapter-perf.mjs +++ b/packages/anonymize/scripts/native-adapter-perf.mjs @@ -3,9 +3,18 @@ import { copyFileSync, mkdirSync, mkdtempSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { createRequire } from "node:module"; +import { + load_prepared_package, + prepare_search_package, + redact_text as redactTextWithSdk, +} from "../src/native.ts"; const ROOT_DIR = join(import.meta.dir, "..", "..", ".."); const ITERATIONS = Number(process.env.ANONYMIZE_NATIVE_PERF_ITERATIONS ?? 100); +const TOP_LEVEL_ITERATIONS = Number( + process.env.ANONYMIZE_NATIVE_PERF_TOP_LEVEL_ITERATIONS ?? 
+ Math.min(ITERATIONS, 10), +); const configJson = JSON.stringify({ regex_patterns: [{ kind: "regex", pattern: "\\b[A-Z]{2}\\d{4}\\b" }], @@ -111,6 +120,54 @@ elapsed_ms = (time.perf_counter_ns() - start) / 1_000_000 print(json.dumps({"prepareMs": prepare_ms, "runMs": elapsed_ms})) `; +const pythonSdkScript = ` +import json +import os +import pathlib +import sys +import time + +module_root = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]).parent.parent +sys.path.insert(0, str(module_root)) + +import stella_anonymize as anonymize + +payload = json.loads(os.environ["STELLA_ANONYMIZE_PERF_PAYLOAD"]) +package_start = time.perf_counter_ns() +package_bytes = anonymize.prepare_search_package(payload["config_json"]) +package_prepare_ms = (time.perf_counter_ns() - package_start) / 1_000_000 +load_start = time.perf_counter_ns() +prepared = anonymize.load_prepared_package(package_bytes) +load_ms = (time.perf_counter_ns() - load_start) / 1_000_000 +start = time.perf_counter_ns() +for _ in range(payload["iterations"]): + for item in payload["cases"]: + prepared.redact_text( + item["text"], + item.get("operators"), + ) +run_ms = (time.perf_counter_ns() - start) / 1_000_000 +one_shot_start = time.perf_counter_ns() +for _ in range(payload["top_level_iterations"]): + for item in payload["cases"]: + anonymize.redact_text( + payload["config_json"], + item["text"], + item.get("operators"), + ) +one_shot_ms = (time.perf_counter_ns() - one_shot_start) / 1_000_000 +print( + json.dumps( + { + "packagePrepareMs": package_prepare_ms, + "loadMs": load_ms, + "runMs": run_ms, + "oneShotMs": one_shot_ms, + } + ) +) +`; + runCommand("cargo", [ "build", "-p", @@ -128,14 +185,60 @@ mkdirSync(pythonPackageDir); const pythonModulePath = join(pythonPackageDir, "_native.so"); copyFileSync(nativeLibraryPath("stella_anonymize_napi"), napiPath); copyFileSync(nativeLibraryPath("stella_anonymize_core_py"), pythonModulePath); +copyFileSync( + join( + ROOT_DIR, + "crates", + "anonymize-py", + 
"python", + "stella_anonymize", + "__init__.py", + ), + join(pythonPackageDir, "__init__.py"), +); +copyFileSync( + join( + ROOT_DIR, + "crates", + "anonymize-py", + "python", + "stella_anonymize", + "__init__.pyi", + ), + join(pythonPackageDir, "__init__.pyi"), +); +copyFileSync( + join( + ROOT_DIR, + "crates", + "anonymize-py", + "python", + "stella_anonymize", + "_native.pyi", + ), + join(pythonPackageDir, "_native.pyi"), +); +copyFileSync( + join( + ROOT_DIR, + "crates", + "anonymize-py", + "python", + "stella_anonymize", + "py.typed", + ), + join(pythonPackageDir, "py.typed"), +); const native = createRequire(import.meta.url)(napiPath); const cases = buildCases(); const payload = { config_json: configJson, iterations: ITERATIONS, - cases: cases.map(({ text, operatorsJson }) => ({ + top_level_iterations: TOP_LEVEL_ITERATIONS, + cases: cases.map(({ text, operatorsConfig, operatorsJson }) => ({ text, + operators: operatorsConfig?.operators ?? null, operators_json: operatorsJson, })), }; @@ -180,6 +283,58 @@ printSummary( ITERATIONS, ); +const packageStart = Bun.nanoseconds(); +const packageBytes = prepare_search_package({ + binding: native, + config: configJson, +}); +const packagePrepareMs = elapsedMs(packageStart); +const loadStart = Bun.nanoseconds(); +const preparedSdk = load_prepared_package({ + binding: native, + packageBytes, +}); +const loadMs = elapsedMs(loadStart); +const sdkRunStart = Bun.nanoseconds(); +for (let iteration = 0; iteration < ITERATIONS; iteration += 1) { + for (const item of cases) { + preparedSdk.redact_text(item.text, item.operatorsConfig); + } +} +const sdkRunMs = elapsedMs(sdkRunStart); +printSummary( + "ts-sdk-prepared-package", + { + prepareMs: packagePrepareMs + loadMs, + packagePrepareMs, + loadMs, + runMs: sdkRunMs, + }, + cases.length, + ITERATIONS, +); + +const topLevelRunStart = Bun.nanoseconds(); +for (let iteration = 0; iteration < TOP_LEVEL_ITERATIONS; iteration += 1) { + for (const item of cases) { + redactTextWithSdk({ + 
binding: native, + config: configJson, + fullText: item.text, + ...(item.operatorsConfig !== undefined + ? { operators: item.operatorsConfig } + : {}), + }); + } +} +const topLevelRunMs = elapsedMs(topLevelRunStart); +printSummary( + "ts-sdk-one-shot", + { prepareMs: 0, runMs: topLevelRunMs }, + cases.length, + TOP_LEVEL_ITERATIONS, +); + const pyOutput = runCommand("python3", ["-c", pythonScript], { STELLA_ANONYMIZE_PERF_PAYLOAD: JSON.stringify(payload), STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, @@ -187,6 +342,29 @@ const pyOutput = runCommand("python3", ["-c", pythonScript], { const pySummary = JSON.parse(pyOutput); printSummary("python-pyo3", pySummary, cases.length, ITERATIONS); +const pySdkOutput = runCommand("python3", ["-c", pythonSdkScript], { + STELLA_ANONYMIZE_PERF_PAYLOAD: JSON.stringify(payload), + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, +}); +const pySdkSummary = JSON.parse(pySdkOutput); +printSummary( + "python-sdk-prepared-package", + { + prepareMs: pySdkSummary.packagePrepareMs + pySdkSummary.loadMs, + packagePrepareMs: pySdkSummary.packagePrepareMs, + loadMs: pySdkSummary.loadMs, + runMs: pySdkSummary.runMs, + }, + cases.length, + ITERATIONS, +); +printSummary( + "python-sdk-one-shot", + { prepareMs: 0, runMs: pySdkSummary.oneShotMs }, + cases.length, + TOP_LEVEL_ITERATIONS, +); + function buildCases() { const places = ["Fuzztovn", "Fuzztawn", "Fuzztowm"]; const operators = [ @@ -201,11 +379,14 @@ function buildCases() { const registration = `AB${String(index).padStart(4, "0")}`; const matter = `MAT-${String(index % 1_000).padStart(3, "0")}`; const place = places[index % places.length]; + const operatorsJson = operators[index % operators.length]; fixtureCases.push({ text: `Reference ${registration} for Acme s.r.o. near ` + `${place}, Turkey, Prague, matter ${matter}, code Secret Code.`, - operatorsJson: operators[index % operators.length], + operatorsConfig: + operatorsJson === undefined ? 
undefined : JSON.parse(operatorsJson), + operatorsJson, }); } @@ -237,6 +418,7 @@ function printSummary(adapter, summary, fixtureCount, iterations) { runMs: roundMs(runMs), totalMs: roundMs(prepareMs + runMs), avgCallMs: roundMs(runMs / calls), + ...extraSummaryFields(summary), }), ); } @@ -249,6 +431,16 @@ function roundMs(ms) { return Math.round(ms * 1_000) / 1_000; } +function extraSummaryFields(summary) { + const fields = {}; + for (const key of ["packagePrepareMs", "loadMs"]) { + if (summary[key] !== undefined) { + fields[key] = roundMs(Number(summary[key])); + } + } + return fields; +} + function runCommand(command, args, env = {}) { const result = spawnSync(command, args, { cwd: ROOT_DIR, From 258563135df4accc46540cfbe7d329d337d43da9 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 21:14:31 +0200 Subject: [PATCH 107/130] chore: verify python wheel packaging --- .github/tools/check-python-wheel.mjs | 82 ++++++++++++++++++++++++++++ .github/workflows/ci.yml | 3 + .github/workflows/release.yml | 5 ++ package.json | 1 + 4 files changed, 91 insertions(+) create mode 100644 .github/tools/check-python-wheel.mjs diff --git a/.github/tools/check-python-wheel.mjs b/.github/tools/check-python-wheel.mjs new file mode 100644 index 00000000..2aff9e15 --- /dev/null +++ b/.github/tools/check-python-wheel.mjs @@ -0,0 +1,82 @@ +import { execFileSync } from "node:child_process"; +import { mkdtempSync, readdirSync, rmSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import process from "node:process"; + +const outDir = mkdtempSync(join(tmpdir(), "stella-anonymize-wheel-")); +const profile = process.env.ANONYMIZE_PYTHON_WHEEL_PROFILE ?? 
"ci"; + +try { + execFileSync( + "uvx", + [ + "--from", + "maturin>=1.14,<2", + "maturin", + "build", + "--manifest-path", + "crates/anonymize-py/Cargo.toml", + "--locked", + "--profile", + profile, + "--out", + outDir, + ], + { stdio: "inherit" }, + ); + + const wheel = readdirSync(outDir).find((file) => file.endsWith(".whl")); + if (wheel === undefined) { + throw new Error("maturin did not emit a wheel"); + } + + const wheelPath = join(outDir, wheel); + const files = new Set(JSON.parse(readWheelFiles(wheelPath))); + const required = [ + "stella_anonymize/__init__.py", + "stella_anonymize/__init__.pyi", + "stella_anonymize/_native.pyi", + "stella_anonymize/py.typed", + ]; + const missing = required.filter((file) => !files.has(file)); + if (missing.length > 0) { + throw new Error(`wheel is missing files: ${missing.join(", ")}`); + } + if (![...files].some(isNativeExtension)) { + throw new Error("wheel is missing the native _native extension"); + } + + console.log( + JSON.stringify({ + event: "python-wheel-check", + wheel, + profile, + }), + ); +} finally { + rmSync(outDir, { force: true, recursive: true }); +} + +function readWheelFiles(wheelPath) { + return execFileSync( + "python3", + [ + "-c", + [ + "import json, sys, zipfile", + "with zipfile.ZipFile(sys.argv[1]) as wheel:", + " print(json.dumps(wheel.namelist()))", + ].join("\n"), + wheelPath, + ], + { encoding: "utf8" }, + ); +} + +function isNativeExtension(file) { + return ( + file.startsWith("stella_anonymize/_native.") && + [".so", ".pyd", ".dll", ".dylib"].some((suffix) => file.endsWith(suffix)) + ); +} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index da6ca6c8..5f2ec21c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -83,6 +83,9 @@ jobs: - name: Python typecheck run: bun run python:typecheck + - name: Python wheel + run: bun run python:wheel + - name: Test run: bun run test diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 
ec032b4d..0ee9ece9 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -63,6 +63,11 @@ jobs: - name: Python typecheck run: bun run python:typecheck + - name: Python wheel + run: bun run python:wheel + env: + ANONYMIZE_PYTHON_WHEEL_PROFILE: release + - name: Test run: bun run test diff --git a/package.json b/package.json index 5c15a7d6..4eb1335a 100644 --- a/package.json +++ b/package.json @@ -22,6 +22,7 @@ "rust:test": "cargo ci-test", "rust:check": "bun run rust:fmt && bun run rust:lint && bun run rust:test", "python:typecheck": "uvx --from ty==0.0.29 ty check --extra-search-path crates/anonymize-py/python crates/anonymize-py/typecheck", + "python:wheel": "node .github/tools/check-python-wheel.mjs", "sync:version": "node .github/tools/sync-runtime-version.mjs", "check:version": "node .github/tools/sync-runtime-version.mjs --check", "check:bun": "node .github/tools/check-bun-workflows.mjs", From 7a894660ffee55f3dee4bc21b778879892cd9b67 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 21:20:30 +0200 Subject: [PATCH 108/130] test: enforce native adapter perf parity --- .../anonymize/scripts/native-adapter-perf.mjs | 272 +++++++++++++++++- 1 file changed, 270 insertions(+), 2 deletions(-) diff --git a/packages/anonymize/scripts/native-adapter-perf.mjs b/packages/anonymize/scripts/native-adapter-perf.mjs index 8317814b..a6281103 100644 --- a/packages/anonymize/scripts/native-adapter-perf.mjs +++ b/packages/anonymize/scripts/native-adapter-perf.mjs @@ -1,5 +1,6 @@ import { spawnSync } from "node:child_process"; -import { copyFileSync, mkdirSync, mkdtempSync } from "node:fs"; +import { createHash } from "node:crypto"; +import { copyFileSync, mkdirSync, mkdtempSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { createRequire } from "node:module"; @@ -117,7 +118,16 @@ for _ in range(payload["iterations"]): item.get("operators_json"), ) elapsed_ms = 
(time.perf_counter_ns() - start) / 1_000_000 -print(json.dumps({"prepareMs": prepare_ms, "runMs": elapsed_ms})) +case_results = [ + json.loads( + prepared.redact_static_entities_json( + item["text"], + item.get("operators_json"), + ) + ) + for item in payload["cases"] +] +print(json.dumps({"prepareMs": prepare_ms, "runMs": elapsed_ms, "caseResults": case_results})) `; const pythonSdkScript = ` @@ -139,6 +149,42 @@ package_prepare_ms = (time.perf_counter_ns() - package_start) / 1_000_000 load_start = time.perf_counter_ns() prepared = anonymize.load_prepared_package(package_bytes) load_ms = (time.perf_counter_ns() - load_start) / 1_000_000 +def entity_to_dict(entity): + return { + "start": entity.start, + "end": entity.end, + "label": entity.label, + "text": entity.text, + "score": entity.score, + "source": entity.source, + "source_detail": entity.source_detail, + } + +def redaction_entry_to_dict(entry): + return {"placeholder": entry.placeholder, "original": entry.original} + +def operator_entry_to_dict(entry): + return {"placeholder": entry.placeholder, "operator": entry.operator} + +def result_to_dict(result): + return { + "resolved_entities": [ + entity_to_dict(entity) + for entity in result.resolved_entities + ], + "redaction": { + "redacted_text": result.redaction.redacted_text, + "redaction_map": [ + redaction_entry_to_dict(entry) + for entry in result.redaction.redaction_map + ], + "operator_map": [ + operator_entry_to_dict(entry) + for entry in result.redaction.operator_map + ], + "entity_count": result.redaction.entity_count, + }, + } start = time.perf_counter_ns() for _ in range(payload["iterations"]): for item in payload["cases"]: @@ -156,6 +202,25 @@ for _ in range(payload["top_level_iterations"]): item.get("operators"), ) one_shot_ms = (time.perf_counter_ns() - one_shot_start) / 1_000_000 +package_case_results = [ + result_to_dict( + prepared.redact_text( + item["text"], + item.get("operators"), + ) + ) + for item in payload["cases"] +] 
+one_shot_case_results = [ + result_to_dict( + anonymize.redact_text( + payload["config_json"], + item["text"], + item.get("operators"), + ) + ) + for item in payload["cases"] +] print( json.dumps( { @@ -163,6 +228,8 @@ print( "loadMs": load_ms, "runMs": run_ms, "oneShotMs": one_shot_ms, + "packageCaseResults": package_case_results, + "oneShotCaseResults": one_shot_case_results, } ) ) @@ -242,6 +309,7 @@ const payload = { operators_json: operatorsJson, })), }; +const rustCoreResults = callRustCoreResults(payload, tempDir); const rustOutput = runCommand( "cargo", @@ -276,6 +344,20 @@ for (let iteration = 0; iteration < ITERATIONS; iteration += 1) { } } const tsRunMs = elapsedMs(tsStart); +assertAdapterResults( + "ts-napi", + cases.map((item) => + canonicalResult( + prepared.redactStaticEntities( + item.text, + item.operatorsJson === undefined + ? undefined + : JSON.parse(item.operatorsJson), + ), + ), + ), + rustCoreResults, +); printSummary( "ts-napi", { prepareMs: tsPrepareMs, runMs: tsRunMs }, @@ -302,6 +384,13 @@ for (let iteration = 0; iteration < ITERATIONS; iteration += 1) { } } const sdkRunMs = elapsedMs(sdkRunStart); +assertAdapterResults( + "ts-sdk-prepared-package", + cases.map((item) => + canonicalResult(preparedSdk.redact_text(item.text, item.operatorsConfig)), + ), + rustCoreResults, +); printSummary( "ts-sdk-prepared-package", { @@ -328,6 +417,22 @@ for (let iteration = 0; iteration < TOP_LEVEL_ITERATIONS; iteration += 1) { } } const topLevelRunMs = elapsedMs(topLevelRunStart); +assertAdapterResults( + "ts-sdk-one-shot", + cases.map((item) => + canonicalResult( + redactTextWithSdk({ + binding: native, + config: configJson, + fullText: item.text, + ...(item.operatorsConfig !== undefined + ? 
{ operators: item.operatorsConfig } + : {}), + }), + ), + ), + rustCoreResults, +); printSummary( "ts-sdk-one-shot", { prepareMs: 0, runMs: topLevelRunMs }, @@ -340,6 +445,7 @@ const pyOutput = runCommand("python3", ["-c", pythonScript], { STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, }); const pySummary = JSON.parse(pyOutput); +assertAdapterResults("python-pyo3", pySummary.caseResults, rustCoreResults); printSummary("python-pyo3", pySummary, cases.length, ITERATIONS); const pySdkOutput = runCommand("python3", ["-c", pythonSdkScript], { @@ -347,6 +453,11 @@ const pySdkOutput = runCommand("python3", ["-c", pythonSdkScript], { STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, }); const pySdkSummary = JSON.parse(pySdkOutput); +assertAdapterResults( + "python-sdk-prepared-package", + pySdkSummary.packageCaseResults, + rustCoreResults, +); printSummary( "python-sdk-prepared-package", { @@ -358,6 +469,11 @@ printSummary( cases.length, ITERATIONS, ); +assertAdapterResults( + "python-sdk-one-shot", + pySdkSummary.oneShotCaseResults, + rustCoreResults, +); printSummary( "python-sdk-one-shot", { prepareMs: 0, runMs: pySdkSummary.oneShotMs }, @@ -393,6 +509,158 @@ function buildCases() { return fixtureCases; } +function callRustCoreResults(perfPayload, tempDirectory) { + const parityPayloadPath = join(tempDirectory, "native-adapter-parity.json"); + writeFileSync( + parityPayloadPath, + JSON.stringify({ + config_json: perfPayload.config_json, + cases: perfPayload.cases.map(({ text, operators_json }) => ({ + text, + operators_json, + })), + }), + ); + const output = runCommand( + "cargo", + [ + "run", + "-p", + "stella-anonymize-adapter-contract", + "--example", + "native_adapter_parity", + "--release", + "--locked", + "--quiet", + ], + { + STELLA_ANONYMIZE_PARITY_PAYLOAD: parityPayloadPath, + }, + ); + return JSON.parse(output).map(canonicalResult); +} + +function assertAdapterResults(adapter, actualResults, expectedResults) { + if (actualResults.length !== expectedResults.length) 
{ + throw new Error( + `${adapter} returned ${actualResults.length} parity results, expected ${expectedResults.length}`, + ); + } + + for (let index = 0; index < expectedResults.length; index += 1) { + const actual = canonicalResult(actualResults[index]); + const expected = expectedResults[index]; + const actualSignature = resultSignature(actual); + const expectedSignature = resultSignature(expected); + if (actualSignature === expectedSignature) { + continue; + } + throw new Error( + [ + `${adapter} parity mismatch at case ${index}`, + `expected=${hashSignature(expectedSignature)}`, + `actual=${hashSignature(actualSignature)}`, + ].join(" "), + ); + } +} + +function canonicalResult(result) { + const redaction = result.redaction; + return { + resolved_entities: readArray( + result, + "resolvedEntities", + "resolved_entities", + ).map(canonicalEntity), + redaction: { + redacted_text: readValue(redaction, "redactedText", "redacted_text"), + redaction_map: canonicalRedactionEntries( + readValue(redaction, "redactionMap", "redaction_map"), + ), + operator_map: canonicalOperatorEntries( + readValue(redaction, "operatorMap", "operator_map"), + ), + entity_count: readValue(redaction, "entityCount", "entity_count"), + }, + }; +} + +function canonicalEntity(entity) { + return { + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: entity.source, + source_detail: + readOptionalValue(entity, "sourceDetail", "source_detail") ?? 
null, + }; +} + +function canonicalRedactionEntries(entries) { + if (entries instanceof Map) { + return [...entries.entries()].map(([placeholder, original]) => ({ + placeholder, + original, + })); + } + return entries.map(({ placeholder, original }) => ({ + placeholder, + original, + })); +} + +function canonicalOperatorEntries(entries) { + if (entries instanceof Map) { + return [...entries.entries()].map(([placeholder, operator]) => ({ + placeholder, + operator, + })); + } + return entries.map(({ placeholder, operator }) => ({ + placeholder, + operator, + })); +} + +function readArray(value, camelKey, snakeKey) { + const result = readValue(value, camelKey, snakeKey); + if (!Array.isArray(result)) { + throw new TypeError(`Expected array field ${camelKey}/${snakeKey}`); + } + return result; +} + +function readValue(value, camelKey, snakeKey) { + if (Object.hasOwn(value, camelKey)) { + return value[camelKey]; + } + if (Object.hasOwn(value, snakeKey)) { + return value[snakeKey]; + } + throw new TypeError(`Missing field ${camelKey}/${snakeKey}`); +} + +function readOptionalValue(value, camelKey, snakeKey) { + if (Object.hasOwn(value, camelKey)) { + return value[camelKey]; + } + if (Object.hasOwn(value, snakeKey)) { + return value[snakeKey]; + } + return undefined; +} + +function resultSignature(result) { + return JSON.stringify(result); +} + +function hashSignature(signature) { + return createHash("sha256").update(signature).digest("hex").slice(0, 16); +} + function nativeLibraryPath(name) { if (process.platform === "darwin") { return join(ROOT_DIR, "target", "release", `lib${name}.dylib`); From cce3ff0e428dfc20fc954e448c109e3e42ed3cf6 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 21:23:50 +0200 Subject: [PATCH 109/130] test: smoke python wheel install --- .github/tools/check-python-wheel.mjs | 50 ++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/.github/tools/check-python-wheel.mjs b/.github/tools/check-python-wheel.mjs 
index 2aff9e15..75b40ef7 100644 --- a/.github/tools/check-python-wheel.mjs +++ b/.github/tools/check-python-wheel.mjs @@ -46,6 +46,7 @@ try { if (![...files].some(isNativeExtension)) { throw new Error("wheel is missing the native _native extension"); } + smokeInstalledWheel(wheelPath); console.log( JSON.stringify({ @@ -74,6 +75,55 @@ function readWheelFiles(wheelPath) { ); } +function smokeInstalledWheel(wheelPath) { + execFileSync( + "uv", + [ + "run", + "--isolated", + "--no-project", + "--python", + "3.11", + "--with", + wheelPath, + "python", + "-c", + [ + "import json", + "import stella_anonymize as anonymize", + "required = [", + " 'PreparedAnonymizer',", + " 'PreparedSearch',", + " 'load_prepared_package',", + " 'prepare_search_package',", + " 'redact_text',", + "]", + "missing = [name for name in required if not hasattr(anonymize, name)]", + "if missing:", + " raise SystemExit(f'missing exports: {missing}')", + "config_json = json.dumps({", + " 'regex_patterns': [{'kind': 'regex', 'pattern': r'\\b[A-Z]{2}\\d{4}\\b'}],", + " 'slices': {'regex': {'start': 0, 'end': 1}},", + " 'regex_meta': [{'label': 'registration number', 'score': 1.0}],", + "})", + "package_bytes = anonymize.prepare_search_package(config_json)", + "prepared = anonymize.load_prepared_package(package_bytes)", + "result = prepared.redact_text('Reference AB1234')", + "if result.redaction.entity_count != 1:", + " raise SystemExit(f'unexpected entity count: {result.redaction.entity_count}')", + "if result.redaction.redacted_text == 'Reference AB1234':", + " raise SystemExit('redaction did not change text')", + "print(json.dumps({", + " 'event': 'python-wheel-import-smoke',", + " 'version': anonymize.native_package_version(),", + " 'entity_count': result.redaction.entity_count,", + "}))", + ].join("\n"), + ], + { stdio: "inherit" }, + ); +} + function isNativeExtension(file) { return ( file.startsWith("stella_anonymize/_native.") && From 4ace954b9bfb9fdae33dc3d7798d29c4ba81d7c0 Mon Sep 17 00:00:00 
2001 From: jan-kubica Date: Fri, 26 Jun 2026 21:49:36 +0200 Subject: [PATCH 110/130] perf: narrow native match work --- crates/anonymize-core/src/address_context.rs | 364 +++++++++++++++---- crates/anonymize-core/src/byte_offsets.rs | 23 ++ crates/anonymize-core/src/processors.rs | 155 ++++---- crates/anonymize-core/tests/prepared.rs | 33 ++ 4 files changed, 417 insertions(+), 158 deletions(-) diff --git a/crates/anonymize-core/src/address_context.rs b/crates/anonymize-core/src/address_context.rs index a35c1a65..2fa8412e 100644 --- a/crates/anonymize-core/src/address_context.rs +++ b/crates/anonymize-core/src/address_context.rs @@ -42,6 +42,12 @@ struct WordBefore { has_dot: bool, } +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct ScanRange { + start: usize, + end: usize, +} + impl PreparedAddressContextData { pub(crate) fn new(data: AddressContextData) -> Result { Ok(Self { @@ -93,53 +99,74 @@ impl PreparedAddressContextData { .collect::>(); let header_end = header_end(full_text); let offsets = ByteOffsets::new(full_text); + let scan_ranges = address_context_scan_ranges( + full_text, + &offsets, + header_end, + &address_entities, + )?; - for found in self.slash_house_number.find_iter(full_text) { - let num_start = usize_to_u32("address_context.num_start", found.start())?; - let num_end = usize_to_u32("address_context.num_end", found.end())?; - if covered_by(existing_entities, num_start, num_end) { - continue; - } - - let in_header = num_start < header_end; - let near_address = address_entities.iter().any(|entity| { - within_context_window(&offsets, entity, num_start, num_end) - }); - if !in_header && !near_address { - continue; - } - - let Some(scan_start) = skip_whitespace_back(full_text, found.start()) - else { - continue; - }; - let Some((street_start, has_temporal_prep)) = - self.scan_street_start(full_text, scan_start)? 
- else { + for range in scan_ranges { + let Some(segment) = full_text.get(range.start..range.end) else { continue; }; - let street_start_u32 = - usize_to_u32("address_context.street_start", street_start)?; - if has_temporal_prep { - continue; - } - if covered_by(existing_entities, street_start_u32, num_end) { - continue; - } - - let street_text = text_slice(full_text, street_start_u32, num_end)?; - if street_text.len() < 4 { - continue; + for found in self.slash_house_number.find_iter(segment) { + let num_start_byte = range.start.saturating_add(found.start()); + let num_end_byte = range.start.saturating_add(found.end()); + if !self.full_slash_house_match_is_identical( + full_text, + num_start_byte, + num_end_byte, + ) { + continue; + } + let num_start = + usize_to_u32("address_context.num_start", num_start_byte)?; + let num_end = usize_to_u32("address_context.num_end", num_end_byte)?; + if covered_by(existing_entities, num_start, num_end) { + continue; + } + + let in_header = num_start < header_end; + let near_address = address_entities.iter().any(|entity| { + within_context_window(&offsets, entity, num_start, num_end) + }); + if !in_header && !near_address { + continue; + } + + let Some(scan_start) = skip_whitespace_back(full_text, num_start_byte) + else { + continue; + }; + let Some((street_start, has_temporal_prep)) = + self.scan_street_start(full_text, scan_start)? 
+ else { + continue; + }; + let street_start_u32 = + usize_to_u32("address_context.street_start", street_start)?; + if has_temporal_prep { + continue; + } + if covered_by(existing_entities, street_start_u32, num_end) { + continue; + } + + let street_text = text_slice(full_text, street_start_u32, num_end)?; + if street_text.len() < 4 { + continue; + } + let score = address_context_score(full_text, street_start, in_header); + results.push(address_context_entity( + street_start_u32, + num_end, + "address", + street_text, + score, + DetectionSource::Regex, + )); } - let score = address_context_score(full_text, street_start, in_header); - results.push(address_context_entity( - street_start_u32, - num_end, - "address", - street_text, - score, - DetectionSource::Regex, - )); } self.detect_bare_house_numbers( @@ -150,6 +177,18 @@ impl PreparedAddressContextData { Ok(results) } + fn full_slash_house_match_is_identical( + &self, + full_text: &str, + start: usize, + end: usize, + ) -> bool { + self + .slash_house_number + .find_at(full_text, start) + .is_some_and(|found| found.start() == start && found.end() == end) + } + fn scan_street_start( &self, full_text: &str, @@ -212,45 +251,71 @@ impl PreparedAddressContextData { existing_entities: &[PipelineEntity], results: &mut Vec, ) -> Result<()> { - for captures in self.bare_house_number.captures_iter(full_text) { - let Some(captured) = captures.name("value") else { + let offsets = ByteOffsets::new(full_text); + let ranges = + bare_house_scan_ranges(full_text, &offsets, existing_entities, results)?; + for range in ranges { + let Some(segment) = full_text.get(range.start..range.end) else { continue; }; - let start = usize_to_u32("address_context.bare_start", captured.start())?; - let end = usize_to_u32("address_context.bare_end", captured.end())?; - if !near_confirmed_address_same_line( - full_text, - existing_entities, - results, - start, - end, - )? 
{ - continue; - } - - let word = captured - .as_str() - .split_whitespace() - .next() - .unwrap_or("") - .to_lowercase(); - if self.bare_house_stopwords.contains(&word) { - continue; + for captures in self.bare_house_number.captures_iter(segment) { + let Some(full_match) = captures.get(0) else { + continue; + }; + let match_start = range.start.saturating_add(full_match.start()); + let match_end = range.start.saturating_add(full_match.end()); + if !self.full_bare_house_match_is_identical( + full_text, + match_start, + match_end, + ) { + continue; + } + let Some(captured) = captures.name("value") else { + continue; + }; + let start = usize_to_u32( + "address_context.bare_start", + range.start.saturating_add(captured.start()), + )?; + let end = usize_to_u32( + "address_context.bare_end", + range.start.saturating_add(captured.end()), + )?; + if !near_confirmed_address_same_line( + full_text, + existing_entities, + results, + start, + end, + )? { + continue; + } + + let word = captured + .as_str() + .split_whitespace() + .next() + .unwrap_or("") + .to_lowercase(); + if self.bare_house_stopwords.contains(&word) { + continue; + } + if overlaps_any(existing_entities, start, end) + || overlaps_any(results, start, end) + { + continue; + } + + results.push(address_context_entity( + start, + end, + "address", + captured.as_str(), + 0.75, + DetectionSource::Regex, + )); } - if overlaps_any(existing_entities, start, end) - || overlaps_any(results, start, end) - { - continue; - } - - results.push(address_context_entity( - start, - end, - "address", - captured.as_str(), - 0.75, - DetectionSource::Regex, - )); } Ok(()) } @@ -262,6 +327,12 @@ impl PreparedAddressContextData { ) -> Result> { let header_end = header_end(full_text); let offsets = ByteOffsets::new(full_text); + let header_scan_end = header_scan_end(full_text, &offsets, header_end)?; + let header = + full_text.get(..header_scan_end).ok_or(Error::InvalidSpan { + start: 0, + end: 
u32::try_from(header_scan_end).unwrap_or(u32::MAX), + })?; let context_entities = existing_entities .iter() .filter(|entity| { @@ -270,7 +341,7 @@ impl PreparedAddressContextData { .collect::>(); let mut results = Vec::new(); - for captures in self.orphan_street_line.captures_iter(full_text) { + for captures in self.orphan_street_line.captures_iter(header) { let Some(captured) = captures.name("value") else { continue; }; @@ -298,6 +369,18 @@ impl PreparedAddressContextData { } Ok(results) } + + fn full_bare_house_match_is_identical( + &self, + full_text: &str, + start: usize, + end: usize, + ) -> bool { + self + .bare_house_number + .find_at(full_text, start) + .is_some_and(|found| found.start() == start && found.end() == end) + } } fn lowercased_set(values: Vec) -> BTreeSet { @@ -307,6 +390,127 @@ fn lowercased_set(values: Vec) -> BTreeSet { .collect() } +fn address_context_scan_ranges( + full_text: &str, + offsets: &ByteOffsets<'_>, + header_end: u32, + address_entities: &[&PipelineEntity], +) -> Result> { + let mut ranges = Vec::new(); + let header_end = offsets.validate_offset(header_end)?; + if header_end > 0 { + ranges.push(ScanRange { + start: 0, + end: header_end, + }); + } + + for entity in address_entities { + let start = + offsets.offset_before_utf16_units(entity.start, STREET_CONTEXT_WINDOW)?; + let end = + offsets.offset_after_utf16_units(entity.end, STREET_CONTEXT_WINDOW)?; + push_scan_range(full_text, &mut ranges, start, end)?; + } + + Ok(merge_scan_ranges(ranges)) +} + +fn bare_house_scan_ranges( + full_text: &str, + offsets: &ByteOffsets<'_>, + existing_entities: &[PipelineEntity], + new_entities: &[PipelineEntity], +) -> Result> { + let mut ranges = Vec::new(); + for entity in existing_entities.iter().chain(new_entities.iter()) { + if entity.label != "address" || is_caller_owned_entity(entity) { + continue; + } + let start = offsets + .offset_before_utf16_units(entity.start, BARE_HOUSE_CONTEXT_WINDOW)?; + let end = offsets + 
.offset_after_utf16_units(entity.end, BARE_HOUSE_CONTEXT_WINDOW)?; + ranges.push(line_expanded_scan_range(full_text, offsets, start, end)?); + } + Ok(merge_scan_ranges(ranges)) +} + +fn push_scan_range( + full_text: &str, + ranges: &mut Vec, + start: u32, + end: u32, +) -> Result<()> { + if start >= end { + return Ok(()); + } + let start = usize::try_from(start) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: start })?; + let end = usize::try_from(end) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: end })?; + if start > full_text.len() || end > full_text.len() { + return Err(Error::ByteOffsetOutOfBounds { offset: u32::MAX }); + } + ranges.push(ScanRange { start, end }); + Ok(()) +} + +fn merge_scan_ranges(mut ranges: Vec) -> Vec { + ranges.sort_by_key(|range| (range.start, range.end)); + let mut merged = Vec::::new(); + for range in ranges { + let Some(last) = merged.last_mut() else { + merged.push(range); + continue; + }; + if range.start <= last.end { + last.end = last.end.max(range.end); + continue; + } + merged.push(range); + } + merged +} + +fn line_expanded_scan_range( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, + end: u32, +) -> Result { + let start = offsets.validate_offset(start)?; + let end = offsets.validate_offset(end)?; + let line_start = full_text + .get(..start) + .and_then(|prefix| prefix.rfind('\n').map(|index| index.saturating_add(1))) + .unwrap_or(0); + let line_end = full_text + .get(end..) 
+ .and_then(|suffix| suffix.find('\n').map(|index| end.saturating_add(index))) + .unwrap_or(full_text.len()); + Ok(ScanRange { + start: line_start, + end: line_end, + }) +} + +fn header_scan_end( + full_text: &str, + offsets: &ByteOffsets<'_>, + header_end: u32, +) -> Result { + let header_end = offsets.validate_offset(header_end)?; + let tail = full_text.get(header_end..).ok_or(Error::InvalidSpan { + start: u32::try_from(header_end).unwrap_or(u32::MAX), + end: offsets.len()?, + })?; + let Some(relative_newline) = tail.find('\n') else { + return Ok(full_text.len()); + }; + Ok(header_end.saturating_add(relative_newline)) +} + fn compile_regex(field: &'static str, pattern: &str) -> Result { Regex::new(pattern).map_err(|error| Error::InvalidStaticData { field, diff --git a/crates/anonymize-core/src/byte_offsets.rs b/crates/anonymize-core/src/byte_offsets.rs index 51c0af67..492c040e 100644 --- a/crates/anonymize-core/src/byte_offsets.rs +++ b/crates/anonymize-core/src/byte_offsets.rs @@ -100,4 +100,27 @@ impl<'a> ByteOffsets<'a> { } self.len() } + + pub(crate) fn offset_before_utf16_units( + &self, + end: u32, + max_units: u32, + ) -> Result { + let end_byte = self.validate_offset(end)?; + let prefix = self + .text + .get(..end_byte) + .ok_or(Error::InvalidSpan { start: 0, end })?; + let mut units = 0_u32; + for (byte, ch) in prefix.char_indices().rev() { + let width = u32::try_from(ch.len_utf16()).unwrap_or(u32::MAX); + if units.saturating_add(width) > max_units { + let offset = byte.saturating_add(ch.len_utf8()); + return u32::try_from(offset) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }); + } + units = units.saturating_add(width); + } + Ok(0) + } } diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index 213640b1..c2b3c5af 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -277,11 +277,12 @@ pub struct SigningPlaceGuardData { #[derive(Clone, Debug, Eq, 
PartialEq)] struct RawDenyListMatch { + pattern: usize, start: u32, end: u32, labels: Vec, custom_labels: Vec, - sources: Vec, + has_person_name_source: bool, text: String, } @@ -339,56 +340,57 @@ pub fn process_deny_list_matches( data: &DenyListMatchData, ) -> Result> { let offsets = ByteOffsets::new(full_text); - let mut matches_by_pattern = + let mut matches = collect_deny_list_matches(matches, slice, full_text, data, &offsets)?; - suppress_shorter_curated_contained_matches(&mut matches_by_pattern); + suppress_shorter_curated_contained_matches(&mut matches); let mut results = Vec::new(); let mut name_hits = Vec::new(); - for pattern_matches in matches_by_pattern.values() { - for found in pattern_matches { - for label in &found.custom_labels { - let mut entity = PipelineEntity::detected( - found.start, - found.end, - label.clone(), - found.text.clone(), - DENY_LIST_SCORE, - DetectionSource::DenyList, - ); - entity.source_detail = Some(SourceDetail::CustomDenyList); - results.push(entity); - } + for found in &matches { + for label in &found.custom_labels { + let mut entity = PipelineEntity::detected( + found.start, + found.end, + label.clone(), + found.text.clone(), + DENY_LIST_SCORE, + DetectionSource::DenyList, + ); + entity.source_detail = Some(SourceDetail::CustomDenyList); + results.push(entity); } + } - for found in pattern_matches { - if found.labels.iter().any(|label| label == PERSON_LABEL) - && !filter_contains( - data - .filters - .as_ref() - .map(|filters| &filters.person_stopwords), - &found.text.to_lowercase(), - ) - { - name_hits.push(found.clone()); - } + for found in &matches { + if found.labels.is_empty() { + continue; + } + if found.labels.iter().any(|label| label == PERSON_LABEL) + && !filter_contains( + data + .filters + .as_ref() + .map(|filters| &filters.person_stopwords), + &found.text.to_lowercase(), + ) + { + name_hits.push(found.clone()); + } - let suppress_address = should_suppress_address(full_text, data, found)?; - for label in 
found.labels.iter().filter(|label| *label != PERSON_LABEL) { - if label == ADDRESS_LABEL && suppress_address { - continue; - } - results.push(PipelineEntity::detected( - found.start, - found.end, - label.clone(), - found.text.clone(), - DENY_LIST_SCORE, - DetectionSource::DenyList, - )); + let suppress_address = should_suppress_address(full_text, data, found)?; + for label in found.labels.iter().filter(|label| *label != PERSON_LABEL) { + if label == ADDRESS_LABEL && suppress_address { + continue; } + results.push(PipelineEntity::detected( + found.start, + found.end, + label.clone(), + found.text.clone(), + DENY_LIST_SCORE, + DetectionSource::DenyList, + )); } } @@ -410,16 +412,14 @@ pub fn process_deny_list_matches( } fn suppress_shorter_curated_contained_matches( - matches_by_pattern: &mut BTreeMap>, + matches: &mut [RawDenyListMatch], ) { let mut ranges = Vec::<(u32, u32)>::new(); - for matches in matches_by_pattern.values() { - for found in matches { - if found.labels.is_empty() { - continue; - } - ranges.push((found.start, found.end)); + for found in matches.iter() { + if found.labels.is_empty() { + continue; } + ranges.push((found.start, found.end)); } ranges.sort_by(|left, right| { @@ -448,18 +448,13 @@ fn suppress_shorter_curated_contained_matches( return; } - for matches in matches_by_pattern.values_mut() { - for found in matches.iter_mut() { - if found.labels.is_empty() { - continue; - } - if suppress.contains(&(found.start, found.end)) { - found.labels.clear(); - } + for found in matches.iter_mut() { + if found.labels.is_empty() { + continue; + } + if suppress.contains(&(found.start, found.end)) { + found.labels.clear(); } - matches.retain(|found| { - !found.labels.is_empty() || !found.custom_labels.is_empty() - }); } } @@ -469,8 +464,8 @@ fn collect_deny_list_matches( full_text: &str, data: &DenyListMatchData, offsets: &ByteOffsets<'_>, -) -> Result>> { - let mut matches_by_pattern = BTreeMap::>::new(); +) -> Result> { + let mut results = Vec::new(); for 
found in matches { let Some(local_index) = slice.local_index(found.pattern()) else { @@ -533,20 +528,27 @@ fn collect_deny_list_matches( continue; } - matches_by_pattern - .entry(local_index) - .or_default() - .push(RawDenyListMatch { - start: found.start(), - end: found.end(), - labels: curated_labels, - custom_labels, - sources: sources.to_strings(), - text: match_text, - }); + results.push(RawDenyListMatch { + pattern: local_index, + start: found.start(), + end: found.end(), + labels: curated_labels, + custom_labels, + has_person_name_source: sources + .iter() + .any(|source| source == FIRST_NAME_SOURCE || source == SURNAME_SOURCE), + text: match_text, + }); } - Ok(matches_by_pattern) + results.sort_by(|left, right| { + left + .pattern + .cmp(&right.pattern) + .then_with(|| left.start.cmp(&right.start)) + .then_with(|| left.end.cmp(&right.end)) + }); + Ok(results) } struct CuratedDenyListMatch<'a> { @@ -954,10 +956,7 @@ fn has_curated_source(sources: StringGroup<'_>) -> bool { } fn has_person_name_source(found: &RawDenyListMatch) -> bool { - found - .sources - .iter() - .any(|source| source == FIRST_NAME_SOURCE || source == SURNAME_SOURCE) + found.has_person_name_source } fn filter_contains(set: Option<&BTreeSet>, value: &str) -> bool { diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs index 3468c146..a36a5003 100644 --- a/crates/anonymize-core/tests/prepared.rs +++ b/crates/anonymize-core/tests/prepared.rs @@ -456,6 +456,39 @@ fn prepared_search_measures_slash_address_context_in_text_offsets() { ); } +#[test] +fn prepared_search_finds_slash_address_context_after_long_multibyte_prefix() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bPraha 10\b"))], + regex_meta: vec![RegexMatchMeta::new("address", 1.0)], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + 
allowed_labels: vec![String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + let full_text = format!( + "{}\nPraha 10 {} Vinohradská 2512/2a.", + "č".repeat(4_000), + "á".repeat(145) + ); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.text == "Vinohradská 2512/2a") + ); +} + #[test] fn prepared_search_ignores_caller_owned_addresses_for_bare_house_context() { let mut meta = RegexMatchMeta::new("address", 1.0); From 92da7bbccdc38fd0302b2b49ad2dd036c52f3982 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 21:56:08 +0200 Subject: [PATCH 111/130] feat: scope native package builds --- .../scripts/build-native-pipeline-package.mjs | 66 ++++++++++++++++--- 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/packages/anonymize/scripts/build-native-pipeline-package.mjs b/packages/anonymize/scripts/build-native-pipeline-package.mjs index bc1ca0b3..3d6cc185 100755 --- a/packages/anonymize/scripts/build-native-pipeline-package.mjs +++ b/packages/anonymize/scripts/build-native-pipeline-package.mjs @@ -64,6 +64,16 @@ function parseArgs(values) { result.defaultDictionaries = true; break; } + case "--language": { + result.language = requiredValue(values, index, value); + index += 1; + break; + } + case "--languages": { + result.languages = requiredValue(values, index, value); + index += 1; + break; + } case "--help": { printHelp(); process.exit(0); @@ -85,15 +95,19 @@ function requiredValue(values, index, option) { async function loadPackageInput(options) { const input = await loadBasePackageInput(options); - if (!options.defaultDictionaries || input.config.dictionaries !== undefined) { - return input; - } + const withDictionaries = + !options.defaultDictionaries || input.config.dictionaries !== undefined + ? 
input + : { + ...input, + config: { + ...input.config, + dictionaries: await loadDefaultDictionaries(), + }, + }; return { - ...input, - config: { - ...input.config, - dictionaries: await loadDefaultDictionaries(), - }, + ...withDictionaries, + config: applyCliLanguageScope(withDictionaries.config, options), }; } @@ -132,6 +146,40 @@ function defaultNativePipelineConfig() { }; } +function applyCliLanguageScope(config, options) { + if (options.language !== undefined && options.languages !== undefined) { + throw new Error("Use either --language or --languages, not both"); + } + if (options.language !== undefined) { + const language = normalizeLanguageOption(options.language, "--language"); + return { ...config, language, languages: undefined }; + } + if (options.languages === undefined) { + return config; + } + const languages = normalizeLanguageList(options.languages); + return { ...config, language: undefined, languages }; +} + +function normalizeLanguageOption(value, option) { + const language = value.trim().toLowerCase(); + if (language.length === 0) { + throw new Error(`${option} requires a non-empty language code`); + } + return language; +} + +function normalizeLanguageList(value) { + const languages = value + .split(",") + .map((entry) => normalizeLanguageOption(entry, "--languages")) + .filter((entry, index, entries) => entries.indexOf(entry) === index); + if (languages.length === 0) { + throw new Error("--languages requires at least one language code"); + } + return languages; +} + async function loadDefaultDictionaries() { let loaded; try { @@ -163,6 +211,8 @@ Options: --out Output package path. Defaults to native-pipeline.stlanonpkg. --config ESM module exporting a PipelineConfig or { config, gazetteerEntries }. --export Export name to read from the config module. Defaults to default. + --language Build a package scoped to one content language. + --languages Build a package scoped to comma-separated content languages. 
--default-dictionaries Load @stll/anonymize-data into configs that do not provide dictionaries. --raw Write an uncompressed package. `); From 599a62a2697ec3ad3d15c5a3b3c2a24e4268db17 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 26 Jun 2026 22:15:18 +0200 Subject: [PATCH 112/130] chore: consume merged text search artifacts --- Cargo.lock | 2 +- crates/anonymize-core/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8d1f4569..41fad21f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "stella-text-search-core" version = "1.0.6" -source = "git+https://github.com/stella/text-search?rev=8b0e074ea2d4fdb7d21ad02d36f949dbf1e23c77#8b0e074ea2d4fdb7d21ad02d36f949dbf1e23c77" +source = "git+https://github.com/stella/text-search?rev=0f35d481d990b720ccbce3594d38b7438846efc2#0f35d481d990b720ccbce3594d38b7438846efc2" dependencies = [ "stella-aho-corasick-core", "stella-fuzzy-search-core", diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index 61a9e6a3..b5a48387 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -14,7 +14,7 @@ fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "2f3c3f107e3976ac059cc438d77916a592595d59" } -stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "8b0e074ea2d4fdb7d21ad02d36f949dbf1e23c77" } +stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "0f35d481d990b720ccbce3594d38b7438846efc2" } [dev-dependencies] proptest = "1" From d7150ec2eee16a8d99eb0cdcd2d683330315e2f2 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 01:52:15 +0200 Subject: [PATCH 113/130] docs: adopt Rust agent conventions --- .ai/manifest.json | 1 + .ai/shared | 2 +- AGENTS.md | 84 
+++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) diff --git a/.ai/manifest.json b/.ai/manifest.json index 6a192fbb..16cf5ed8 100644 --- a/.ai/manifest.json +++ b/.ai/manifest.json @@ -6,6 +6,7 @@ "stella-public-repo", "engineering", "typescript", + "rust", "testing", "linting" ], diff --git a/.ai/shared b/.ai/shared index 2519c1c1..69216a30 160000 --- a/.ai/shared +++ b/.ai/shared @@ -1 +1 @@ -Subproject commit 2519c1c1bc7fd3ec09a846624968ed496e6dd79f +Subproject commit 69216a3067cb1639281a8988b2e33aaab0f3bac0 diff --git a/AGENTS.md b/AGENTS.md index 27edb423..1a382724 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -51,6 +51,10 @@ details unless they are already public in the repository. - If TypeScript can make a class of bug structurally impossible (branded types, discriminated unions, exhaustive checks), prefer that over runtime validation or manual discipline +- Avoid boolean fields for states that may grow. Use a named discriminator or + domain type for values that answer "which kind/status/mode/type?" rather than + a permanent yes/no question; a two-value union, enum, or equivalent domain type + now is usually cheaper than migrating an `isX` flag later. - Conventional Commits: `feat:`, `chore:`, `fix:`, `docs:` - Rebase feature branches onto main (linear history) - Fail fast: validate at boundaries, return/throw early @@ -126,6 +130,86 @@ details unless they are already public in the repository. `getX()` getter so it runs at first use, not at import time. This prevents TDZ errors from non-deterministic module evaluation order. +## Coding Conventions + +### Rust + +- Use Rust 2024 for new crates. Pin the toolchain in `rust-toolchain.toml` and + keep `rustfmt` and `clippy` installed. +- In workspaces, put shared lint policy in `[workspace.lints]`; member crates + should opt in with `[lints] workspace = true`. 
+- Treat + `cargo clippy --workspace --all-targets --all-features -- -D warnings` as the + baseline quality gate unless a repo documents a narrower command. +- Use Dylint for shared stella-specific Rust lints that Clippy cannot express. + Run `cargo dylint --workspace --all` after Clippy when the repo has + `dylint.toml`. +- Prefer fixing custom lint rules at the shared source over broad local + suppressions when a rule is wrong across repos. +- Forbid unsafe code by default. If `unsafe` is truly required, keep it in a + tiny module with a `SAFETY:` comment explaining the invariant the caller and + callee rely on. +- Do not use `unwrap()`, `expect()`, `panic!()`, `todo!()`, or + `unimplemented!()` in production code. Return typed errors or make the + impossible state unrepresentable. +- Avoid unchecked indexing and string slicing. Prefer iterator methods, + `.get()`, typed span helpers, and APIs that preserve UTF-8 boundary safety. +- Avoid `as` casts. Prefer `TryFrom`, `From`, explicit checked conversion + helpers, or domain newtypes. +- Prefer narrow domain types over primitive strings/numbers for IDs, byte + offsets, language codes, entity labels, versions, and artifact formats. +- Keep struct fields private unless direct construction is part of the public + contract. Use smart constructors for values that must satisfy invariants. +- Use enums for real closed domain states. Do not create enums just to simulate + named arguments or boolean options. +- For functions, use positional parameters for one or two obvious arguments. + Use a named `SomethingOptions`, `SomethingArgs`, or `SomethingParams` struct + for 3+ arguments or same-type arguments that are easy to swap. +- Use `bon` builders for public APIs, constructors, or setup functions with + many optional/boolean parameters where named callsites improve readability. Do + not use it to hide unclear domain modeling. +- Prefer `Result` with a concrete error enum for library code. 
Use + `thiserror` for typed errors; use `miette` only where human-facing diagnostics + are valuable. +- Add `#[must_use]` to builders, config transforms, computed results, and APIs + where ignoring the return value is likely a bug. +- Keep comments concise. Comment invariants, non-obvious algorithms, generated + data contracts, and safety boundaries; do not narrate straightforward code. +- Keep data out of code. Domain dictionaries, language rules, fixtures, and + generated artifacts should live in reproducible data files or build outputs, + organized by language/concept where relevant. +- Public docs, logs, diagnostics, and comments should write `stella` lowercase. + +### Rust Module Side Effects + +- Avoid expensive module-level initialization. Prefer explicit prepare/build + steps, lazy singletons, or build-time generated artifacts. +- Do not do filesystem, network, environment, or global logger setup from + library imports. Applications and CLIs own process-level side effects. +- Keep binding crates thin. Business logic belongs in the Rust core crate; + TypeScript, Python, WASM, and NAPI layers should translate types and call the + same core logic. +- Keep generated artifacts versioned and validated at load time. Reject stale, + mismatched, or oversized artifacts with typed errors. + +### Rust Testing + +- Use `cargo nextest run --workspace --all-features` when available; otherwise + use the repo's documented `cargo test` command. +- Add property tests with `proptest` for parsers, span math, redaction, + normalization, serialization, and logic where examples do not cover the input + space. +- Add fuzz targets with `cargo-fuzz` for byte/string parsers, document readers, + search primitives, artifact decoders, and boundary-sensitive code. +- Use fixture parity tests when replacing an implementation in another language. + The Rust core, TypeScript binding, and Python binding should produce the same + structured result from the same fixtures. 
+- Benchmark behavior that is part of the product. Track cold start, warm run, + artifact load, preparation, and execution separately. +- Do not snapshot sensitive raw text unless the fixture is intentionally public + and minimal. Prefer normalized summaries, counts, spans, labels, and redacted + output. + ## Testing Only test what can actually go wrong: bugs the type system, framework, or linter would From f080fed8e5a53310d887793736a2935df256aa9a Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 01:52:20 +0200 Subject: [PATCH 114/130] chore: wire Rust Dylint checks --- .cargo/config.toml | 1 + .github/workflows/ci.yml | 5 +++++ dylint.toml | 4 ++++ package.json | 3 ++- 4 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 dylint.toml diff --git a/.cargo/config.toml b/.cargo/config.toml index 20687cf9..a9e50b0e 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -5,4 +5,5 @@ protocol = "sparse" [alias] ci-fmt = "fmt --all -- --check" ci-clippy = "clippy --workspace --all-targets --all-features --locked -- -D warnings" +ci-dylint = "dylint --workspace --all" ci-test = "test --workspace --all-features --locked" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5f2ec21c..c08fc4a0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,7 +49,12 @@ jobs: - name: Setup Rust run: | rustup toolchain install 1.96.0 --profile minimal --component rustfmt,clippy + rustup toolchain install nightly-2026-04-16 --profile minimal \ + --component rustc-dev \ + --component llvm-tools-preview rustup default 1.96.0 + cargo install cargo-dylint --version 6.0.1 --locked + cargo install dylint-link --version 6.0.1 --locked - name: Rust checks run: bun run rust:check diff --git a/dylint.toml b/dylint.toml new file mode 100644 index 00000000..93aae24c --- /dev/null +++ b/dylint.toml @@ -0,0 +1,4 @@ +[workspace.metadata.dylint] +libraries = [ + { git = "https://github.com/stella/tooling", rev = 
"8b76dd794ca7bd7ac56ffb1179c30ccb898f960b", pattern = "rust-lints/*" }, +] diff --git a/package.json b/package.json index 4eb1335a..15869e9d 100644 --- a/package.json +++ b/package.json @@ -19,8 +19,9 @@ "test": "turbo run test", "rust:fmt": "cargo ci-fmt", "rust:lint": "cargo ci-clippy", + "rust:dylint": "cargo ci-dylint", "rust:test": "cargo ci-test", - "rust:check": "bun run rust:fmt && bun run rust:lint && bun run rust:test", + "rust:check": "bun run rust:fmt && bun run rust:lint && bun run rust:dylint && bun run rust:test", "python:typecheck": "uvx --from ty==0.0.29 ty check --extra-search-path crates/anonymize-py/python crates/anonymize-py/typecheck", "python:wheel": "node .github/tools/check-python-wheel.mjs", "sync:version": "node .github/tools/sync-runtime-version.mjs", From 734ed7fc05b03ce7321b04b671ab43a90960b46b Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 01:52:26 +0200 Subject: [PATCH 115/130] chore: add Rust config builders --- Cargo.lock | 88 +++++++++++++++++++++++++ crates/anonymize-core/Cargo.toml | 1 + crates/anonymize-core/src/search.rs | 22 ++++++- crates/anonymize-core/src/types.rs | 4 +- crates/anonymize-core/tests/builders.rs | 39 +++++++++++ 5 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 crates/anonymize-core/tests/builders.rs diff --git a/Cargo.lock b/Cargo.lock index 41fad21f..b3639b9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -93,6 +93,31 @@ dependencies = [ "hybrid-array", ] +[[package]] +name = "bon" +version = "3.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a602c73c7b0148ec6d12af6fd5cc7a46e2eacc8878271a999abac56eed12f561" +dependencies = [ + "bon-macros", + "rustversion", +] + +[[package]] +name = "bon-macros" +version = "3.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dee98b0db6a962de883bf5d20362dee4d7ca0d12fe39a7c6c73c844e1cd7c1f" +dependencies = [ + "darling", + "ident_case", + "prettyplease", + "proc-macro2", 
+ "quote", + "rustversion", + "syn", +] + [[package]] name = "cc" version = "1.2.65" @@ -162,6 +187,40 @@ version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99251f238b74cd219a86fe6ea9328308ebb223fcbb5b8eb5aa400b847a41dded" +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core", + "quote", + "syn", +] + [[package]] name = "digest" version = "0.11.3" @@ -327,6 +386,12 @@ dependencies = [ "typenum", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "itoa" version = "1.0.18" @@ -488,6 +553,16 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -680,6 +755,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + [[package]] name = "rusty-fork" version = "0.3.1" @@ -806,6 +887,7 @@ dependencies = [ name = "stella-anonymize-core" version = "1.5.0" dependencies = [ + "bon", "fancy-regex", "proptest", "regex", @@ -879,6 +961,12 @@ dependencies = [ "stella-regex-set-core", ] +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "syn" version = "2.0.118" diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index b5a48387..a68bee4b 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -10,6 +10,7 @@ keywords = ["anonymization", "pii", "redaction", "text"] categories = ["text-processing"] [dependencies] +bon = "3.9.3" fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs index 94b269d7..5e737f73 100644 --- a/crates/anonymize-core/src/search.rs +++ b/crates/anonymize-core/src/search.rs @@ -29,6 +29,7 @@ pub enum SearchPattern { } #[derive( + bon::Builder, Clone, Copy, Debug, @@ -39,12 +40,16 @@ pub enum SearchPattern { serde::Serialize, )] pub struct SearchOptions { + #[builder(default)] pub literal: LiteralSearchOptions, + #[builder(default)] pub regex: RegexSearchOptions, + #[builder(default)] pub fuzzy: FuzzySearchOptions, } #[derive( + bon::Builder, Clone, Copy, Debug, @@ -57,11 +62,14 @@ pub struct SearchOptions { serde::Serialize, )] pub struct LiteralSearchOptions { + #[builder(default)] pub case_insensitive: bool, + #[builder(default)] pub whole_words: bool, } #[derive( + bon::Builder, Clone, Copy, Debug, @@ -72,16 +80,28 @@ pub struct LiteralSearchOptions { serde::Serialize, )] pub struct RegexSearchOptions { + #[builder(default)] pub whole_words: bool, + 
#[builder(default)] pub overlap_all: bool, } #[derive( - Clone, Copy, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize, + bon::Builder, + Clone, + Copy, + Debug, + Eq, + PartialEq, + serde::Deserialize, + serde::Serialize, )] pub struct FuzzySearchOptions { + #[builder(default)] pub case_insensitive: bool, + #[builder(default = true)] pub whole_words: bool, + #[builder(default)] pub normalize_diacritics: bool, } diff --git a/crates/anonymize-core/src/types.rs b/crates/anonymize-core/src/types.rs index 08fb087c..2e97e4af 100644 --- a/crates/anonymize-core/src/types.rs +++ b/crates/anonymize-core/src/types.rs @@ -185,9 +185,11 @@ pub enum OperatorType { Redact, } -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(bon::Builder, Clone, Debug, Eq, PartialEq)] pub struct OperatorConfig { + #[builder(default)] pub operators: BTreeMap, + #[builder(default = String::from("[REDACTED]"))] pub redact_string: String, } diff --git a/crates/anonymize-core/tests/builders.rs b/crates/anonymize-core/tests/builders.rs new file mode 100644 index 00000000..601ea8f2 --- /dev/null +++ b/crates/anonymize-core/tests/builders.rs @@ -0,0 +1,39 @@ +use stella_anonymize_core::{ + FuzzySearchOptions, LiteralSearchOptions, OperatorConfig, RegexSearchOptions, + SearchOptions, +}; + +#[test] +fn search_options_builder_preserves_defaults() { + let options = SearchOptions::builder() + .literal( + LiteralSearchOptions::builder() + .case_insensitive(true) + .build(), + ) + .build(); + + assert!(options.literal.case_insensitive); + assert!(!options.literal.whole_words); + assert_eq!(options.regex, RegexSearchOptions::default()); + assert_eq!(options.fuzzy, FuzzySearchOptions::default()); +} + +#[test] +fn fuzzy_options_builder_preserves_whole_word_default() { + let options = FuzzySearchOptions::builder() + .normalize_diacritics(true) + .build(); + + assert!(!options.case_insensitive); + assert!(options.whole_words); + assert!(options.normalize_diacritics); +} + +#[test] +fn 
operator_config_builder_preserves_redaction_default() { + let config = OperatorConfig::builder().build(); + + assert!(config.operators.is_empty()); + assert_eq!(config.redact_string, "[REDACTED]"); +} From 5e6a100eaf4e5eba6168f8e1f2591445e4435056 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 01:52:30 +0200 Subject: [PATCH 116/130] chore: satisfy Rust lint gate --- crates/anonymize-core/src/address_context.rs | 10 ++++++---- crates/anonymize-core/src/processors.rs | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/crates/anonymize-core/src/address_context.rs b/crates/anonymize-core/src/address_context.rs index 2fa8412e..b707e5a0 100644 --- a/crates/anonymize-core/src/address_context.rs +++ b/crates/anonymize-core/src/address_context.rs @@ -329,10 +329,12 @@ impl PreparedAddressContextData { let offsets = ByteOffsets::new(full_text); let header_scan_end = header_scan_end(full_text, &offsets, header_end)?; let header = - full_text.get(..header_scan_end).ok_or(Error::InvalidSpan { - start: 0, - end: u32::try_from(header_scan_end).unwrap_or(u32::MAX), - })?; + full_text + .get(..header_scan_end) + .ok_or_else(|| Error::InvalidSpan { + start: 0, + end: u32::try_from(header_scan_end).unwrap_or(u32::MAX), + })?; let context_entities = existing_entities .iter() .filter(|entity| { diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs index c2b3c5af..dd644ce6 100644 --- a/crates/anonymize-core/src/processors.rs +++ b/crates/anonymize-core/src/processors.rs @@ -955,7 +955,7 @@ fn has_curated_source(sources: StringGroup<'_>) -> bool { .any(|source| source != CUSTOM_DENY_LIST_SOURCE) } -fn has_person_name_source(found: &RawDenyListMatch) -> bool { +const fn has_person_name_source(found: &RawDenyListMatch) -> bool { found.has_person_name_source } From 960a8eb437d641bb2393362134ea944ea847e255 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 01:53:40 +0200 Subject: [PATCH 117/130] docs: 
refresh Rust agent conventions --- .ai/shared | 2 +- AGENTS.md | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.ai/shared b/.ai/shared index 69216a30..eebb10aa 160000 --- a/.ai/shared +++ b/.ai/shared @@ -1 +1 @@ -Subproject commit 69216a3067cb1639281a8988b2e33aaab0f3bac0 +Subproject commit eebb10aa9beb793e8098fc333018fd715a7af475 diff --git a/AGENTS.md b/AGENTS.md index 1a382724..4d70dbb2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -160,8 +160,10 @@ details unless they are already public in the repository. offsets, language codes, entity labels, versions, and artifact formats. - Keep struct fields private unless direct construction is part of the public contract. Use smart constructors for values that must satisfy invariants. -- Use enums for real closed domain states. Do not create enums just to simulate - named arguments or boolean options. +- Use enums for real closed domain states and boolean-blind choices where + variants carry domain meaning. For callsite ergonomics alone, prefer an + options struct or `bon` builder over an enum that only simulates named + arguments. - For functions, use positional parameters for one or two obvious arguments. Use a named `SomethingOptions`, `SomethingArgs`, or `SomethingParams` struct for 3+ arguments or same-type arguments that are easy to swap. 
From fecf43c3634aa24e77829e5284b6347fa14dbf86 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 07:15:05 +0200 Subject: [PATCH 118/130] chore: consume merged Rust lint tooling --- dylint.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dylint.toml b/dylint.toml index 93aae24c..0966fea2 100644 --- a/dylint.toml +++ b/dylint.toml @@ -1,4 +1,4 @@ [workspace.metadata.dylint] libraries = [ - { git = "https://github.com/stella/tooling", rev = "8b76dd794ca7bd7ac56ffb1179c30ccb898f960b", pattern = "rust-lints/*" }, + { git = "https://github.com/stella/tooling", rev = "fe7012b863ad2fcdd788a3fab759b31181bbf9c8", pattern = "rust-lints/*" }, ] From cbb1e3b34f1bd4060f2b2452e0b3f4d35221b527 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 07:15:30 +0200 Subject: [PATCH 119/130] test: name native fixture improvements --- .../scripts/migration-fixture-perf.mjs | 52 +++--- .../__test__/native-adapter-parity.test.ts | 157 ++++++++++++++++++ 2 files changed, 190 insertions(+), 19 deletions(-) diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 47cf30fa..41b2f4df 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -74,7 +74,7 @@ const WRITE_NATIVE_PACKAGE_PATH = const USER_DATA_SCENARIO = process.env.ANONYMIZE_MIGRATION_USER_DATA_SCENARIO?.trim() ?? 
"none"; -const ACCEPTED_NATIVE_STATIC_DELTAS = new Map( +const INTENTIONAL_NATIVE_STATIC_IMPROVEMENTS = new Map( [ { fixture: "cs/asset-transfer-court-declensions.txt", @@ -182,9 +182,13 @@ async function runCoordinator() { if (baseline !== null) { const comparison = compareSnapshots(baseline, candidate); console.log(JSON.stringify(comparisonForLog(comparison))); - const acceptedByPolicy = - ALLOW_ACCEPTED_MISMATCHES && comparison.acceptedEqual; - if (!comparison.equal && !acceptedByPolicy && FAIL_ON_MISMATCH !== "0") { + const intentionalByPolicy = + ALLOW_ACCEPTED_MISMATCHES && comparison.intentionalEqual; + if ( + !comparison.equal && + !intentionalByPolicy && + FAIL_ON_MISMATCH !== "0" + ) { throw new Error( `Fixture parity failed for ${comparison.mismatches.length} fixture(s)`, ); @@ -860,8 +864,11 @@ function compareSnapshots(baseline, candidate) { baseline: baseline.variant, candidate: candidate.variant, equal: mismatches.length === 0, + intentionalEqual: mismatches.every( + (mismatch) => mismatch.intentionalImprovementReason !== null, + ), acceptedEqual: mismatches.every( - (mismatch) => mismatch.acceptedReason !== null, + (mismatch) => mismatch.intentionalImprovementReason !== null, ), mismatchSummary: mismatchSummary(mismatches), fixtureCount: fixtureNames.size, @@ -913,7 +920,7 @@ function mismatchSummary(mismatches) { let materialMismatchCount = 0; let redactionMismatchCount = 0; let sourceOnlyMismatchCount = 0; - let acceptedMismatchCount = 0; + let intentionalImprovementCount = 0; let unexplainedMismatchCount = 0; let unexplainedMaterialMismatchCount = 0; let unexplainedRedactionMismatchCount = 0; @@ -921,21 +928,21 @@ function mismatchSummary(mismatches) { for (const mismatch of mismatches) { const category = mismatch.category ?? mismatch.kind; byCategory[category] = (byCategory[category] ?? 
0) + 1; - const accepted = mismatch.acceptedReason !== null; - if (accepted) { - acceptedMismatchCount += 1; + const intentional = mismatch.intentionalImprovementReason !== null; + if (intentional) { + intentionalImprovementCount += 1; } else { unexplainedMismatchCount += 1; } if (mismatch.sourceAgnosticEqual !== true) { materialMismatchCount += 1; - if (!accepted) { + if (!intentional) { unexplainedMaterialMismatchCount += 1; } } if (mismatch.redactedTextEqual === false) { redactionMismatchCount += 1; - if (!accepted) { + if (!intentional) { unexplainedRedactionMismatchCount += 1; } } @@ -954,7 +961,8 @@ function mismatchSummary(mismatches) { materialMismatchCount, redactionMismatchCount, sourceOnlyMismatchCount, - acceptedMismatchCount, + intentionalImprovementCount, + acceptedMismatchCount: intentionalImprovementCount, unexplainedMismatchCount, unexplainedMaterialMismatchCount, unexplainedRedactionMismatchCount, @@ -1044,23 +1052,29 @@ function describeMismatch(fixture, expected, actual) { }; return { ...mismatch, - acceptedReason: acceptedMismatchReason(mismatch), + intentionalImprovementReason: intentionalImprovementReason(mismatch), + acceptedReason: intentionalImprovementReason(mismatch), }; } -function acceptedMismatchReason(mismatch) { +function intentionalImprovementReason(mismatch) { if (mismatch.sourceAgnosticEqual === true) { return "source-only"; } - const accepted = ACCEPTED_NATIVE_STATIC_DELTAS.get(mismatch.fixture); - if (accepted === undefined) { + const improvement = INTENTIONAL_NATIVE_STATIC_IMPROVEMENTS.get( + mismatch.fixture, + ); + if (improvement === undefined) { return null; } if ( - entitySummariesEqual(mismatch.candidateExtra, accepted.candidateExtra) && - entitySummariesEqual(mismatch.candidateMissing, accepted.candidateMissing) + entitySummariesEqual(mismatch.candidateExtra, improvement.candidateExtra) && + entitySummariesEqual( + mismatch.candidateMissing, + improvement.candidateMissing, + ) ) { - return accepted.reason; + return 
improvement.reason; } return null; } diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index e12fbebe..9cf654a6 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -162,6 +162,19 @@ type ContractFixtureCase = { text: string; }; +type ExpectedNativeFixtureEntity = { + label: string; + source?: string; + text: string; +}; + +type NativeFixtureImprovementCase = { + language: (typeof CONTRACT_FIXTURE_LANGUAGES)[number]; + fixture: string; + includes?: ExpectedNativeFixtureEntity[]; + excludes?: ExpectedNativeFixtureEntity[]; +}; + type PythonNativeOffsetSlice = { start: number; end: number; @@ -182,6 +195,61 @@ const CONTRACT_FIXTURES_DIR = join( "contracts", ); const CONTRACT_FIXTURE_LANGUAGES = ["cs", "de", "en"] as const; +const NATIVE_FIXTURE_IMPROVEMENTS: NativeFixtureImprovementCase[] = [ + { + language: "cs", + fixture: "asset-transfer-court-declensions.txt", + includes: [ + { + label: "address", + source: "regex", + text: "Václavské náměstí 9, 110 00 Praha 1", + }, + ], + }, + { + language: "cs", + fixture: "nakit-legal-services-framework.txt", + excludes: [{ label: "person", text: "Objednatele" }], + }, + { + language: "cs", + fixture: "vinci-donation-agreement.txt", + includes: [ + { + label: "organization", + source: "deny-list", + text: "České vysoké učení technické v Praze", + }, + { + label: "organization", + source: "coreference", + text: "VINCI Construction CS", + }, + ], + }, + { + language: "en", + fixture: "software-license-agreement.txt", + includes: [ + { + label: "address", + source: "regex", + text: "200 West Street, New York, NY 10282", + }, + { + label: "address", + source: "regex", + text: "1209 Orange Street, Wilmington, DE 19801", + }, + { + label: "phone number", + source: "regex", + text: "(212) 555-0142", + }, + ], + }, +]; const CONFIG_JSON = 
JSON.stringify({ regex_patterns: [{ kind: "regex", pattern: "\\b[A-Z]{2}\\d{4}\\b" }], custom_regex_patterns: [{ kind: "regex", pattern: "\\bMAT-\\d{3}\\b" }], @@ -2052,6 +2120,70 @@ describe("native adapter parity", () => { } }); + test("native fixture improvements are explicit", async () => { + const adapters = getAdapters(); + const languages = [ + ...new Set(NATIVE_FIXTURE_IMPROVEMENTS.map(({ language }) => language)), + ]; + + for (const language of languages) { + const fixtures = new Map( + loadContractFixtureCases(language).map(({ name, text }) => [ + name, + text, + ]), + ); + const scopedConfig = applyPipelineLanguageScope({ + ...contractTestConfig(`native-fixture-improvements-${language}`), + language, + }); + const dictionaryScope: Parameters[0] = {}; + if (scopedConfig.denyListCountries !== undefined) { + dictionaryScope.denyListCountries = scopedConfig.denyListCountries; + } + if (scopedConfig.nameCorpusLanguages !== undefined) { + dictionaryScope.nameCorpusLanguages = scopedConfig.nameCorpusLanguages; + } + const dictionaries = await loadTestDictionaries(dictionaryScope); + const search = await preparePipelineSearch({ + config: { + ...scopedConfig, + dictionaries, + }, + context: createPipelineContext(), + }); + const packageBytes = prepareNativeSearchPackage({ + binding: adapters.native, + config: search.nativeStaticConfig, + compressed: true, + }); + const anonymizer = createNativeAnonymizerFromPackage({ + binding: adapters.native, + packageBytes, + }); + + for (const improvement of NATIVE_FIXTURE_IMPROVEMENTS.filter( + (item) => item.language === language, + )) { + const text = fixtures.get(improvement.fixture); + expect(text).toBeDefined(); + if (text === undefined) { + continue; + } + + const result = toBindingStaticResult( + anonymizer.redactStaticEntities(text), + ); + for (const entity of improvement.includes ?? []) { + expectNativeFixtureEntity(result, entity); + } + for (const entity of improvement.excludes ?? 
[]) { + expectNativeFixtureEntityAbsent(result, entity); + } + } + } + }); + test("JSON operator config accepts camel-case redactString", () => { const adapters = getAdapters(); const text = @@ -2713,6 +2845,31 @@ const loadContractFixtureCases = ( text: readFileSync(join(CONTRACT_FIXTURES_DIR, language, name), "utf8"), })); +const findNativeFixtureEntity = ( + result: StaticRedactionResult, + expected: ExpectedNativeFixtureEntity, +) => + result.resolved_entities.find( + (entity) => + entity.label === expected.label && + entity.text === expected.text && + (expected.source === undefined || entity.source === expected.source), + ); + +const expectNativeFixtureEntity = ( + result: StaticRedactionResult, + expected: ExpectedNativeFixtureEntity, +) => { + expect(findNativeFixtureEntity(result, expected)).toMatchObject(expected); +}; + +const expectNativeFixtureEntityAbsent = ( + result: StaticRedactionResult, + expected: ExpectedNativeFixtureEntity, +) => { + expect(findNativeFixtureEntity(result, expected)).toBeUndefined(); +}; + const packageJsonVersion = (): string => { const packageJson = JSON.parse( readFileSync(join(ROOT_DIR, "packages", "anonymize", "package.json"), { From 2d94722860ab2fa3b0371b9599cda6b8c8baaa1b Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 07:30:55 +0200 Subject: [PATCH 120/130] perf: cache Python config facade --- .../anonymize-py/python/stella_anonymize/__init__.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/crates/anonymize-py/python/stella_anonymize/__init__.py b/crates/anonymize-py/python/stella_anonymize/__init__.py index c8d8e2ed..93587b68 100644 --- a/crates/anonymize-py/python/stella_anonymize/__init__.py +++ b/crates/anonymize-py/python/stella_anonymize/__init__.py @@ -183,6 +183,11 @@ def _load_prepared_package(package_bytes: bytes) -> PreparedAnonymizer: return PreparedAnonymizer.from_prepared_package_bytes(package_bytes) +@lru_cache(maxsize=8) +def 
_prepare_from_config_json(config_json: str) -> PreparedAnonymizer: + return PreparedAnonymizer.from_config_json(config_json) + + def redact_text( config_json: str, full_text: str, @@ -190,7 +195,7 @@ def redact_text( *, redact_string: str | None = None, ) -> StaticRedactionResult: - return PreparedAnonymizer.from_config_json(config_json).redact_text( + return _prepare_from_config_json(config_json).redact_text( full_text, operators, redact_string=redact_string, @@ -204,7 +209,7 @@ def redact_text_json( *, redact_string: str | None = None, ) -> str: - return PreparedAnonymizer.from_config_json(config_json).redact_text_json( + return _prepare_from_config_json(config_json).redact_text_json( full_text, operators, redact_string=redact_string, @@ -218,7 +223,7 @@ def diagnostics_json( *, redact_string: str | None = None, ) -> str: - return PreparedAnonymizer.from_config_json(config_json).diagnostics_json( + return _prepare_from_config_json(config_json).diagnostics_json( full_text, operators, redact_string=redact_string, From 24de82650c378c9821455e9c3dc427527255410f Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 07:31:21 +0200 Subject: [PATCH 121/130] chore: wire local prepared regex stack --- Cargo.lock | 2 -- crates/anonymize-core/Cargo.toml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b3639b9c..20bc2d30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -933,7 +933,6 @@ dependencies = [ [[package]] name = "stella-regex-set-core" version = "1.0.5" -source = "git+https://github.com/stella/regex-set?rev=8b80241a5a54cef8fdc6b6b34119981db0c6f597#8b80241a5a54cef8fdc6b6b34119981db0c6f597" dependencies = [ "fancy-regex", "regex", @@ -954,7 +953,6 @@ dependencies = [ [[package]] name = "stella-text-search-core" version = "1.0.6" -source = "git+https://github.com/stella/text-search?rev=0f35d481d990b720ccbce3594d38b7438846efc2#0f35d481d990b720ccbce3594d38b7438846efc2" dependencies = [ "stella-aho-corasick-core", 
"stella-fuzzy-search-core", diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index a68bee4b..d4c1b361 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -15,7 +15,7 @@ fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "2f3c3f107e3976ac059cc438d77916a592595d59" } -stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "0f35d481d990b720ccbce3594d38b7438846efc2" } +stella-text-search-core = { version = "1.0.6", path = "/private/tmp/stll-text-search-regex-prepared/crates/core" } [dev-dependencies] proptest = "1" From d51b119a00d07f350f61a51c15feb4c0b723deee Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 07:52:22 +0200 Subject: [PATCH 122/130] perf: preload default native pipeline --- .../scripts/migration-fixture-perf.mjs | 2 + .../scripts/native-package-ux-perf.mjs | 3 + .../src/__test__/native-node.test.ts | 54 +++++++ packages/anonymize/src/native-node.ts | 142 +++++++++++++----- 4 files changed, 166 insertions(+), 35 deletions(-) diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 41b2f4df..6a32a1e8 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -554,6 +554,8 @@ function describeNativeTimingScenario({ firstPrepareMs: nativePrepareMs, cachedPrepareMs: nativeCachedPrepareAvgMs, firstRunMs: coldRunMs, + setupBeforeClickMs: roundMs(nativePackageReadMs + nativePrepareMs), + preloadedClickMs: coldRunMs, firstTouchMs: roundMs(nativePackageReadMs + nativePrepareMs + coldRunMs), warmClickMs: warmAvgMs, }; diff --git a/packages/anonymize/scripts/native-package-ux-perf.mjs b/packages/anonymize/scripts/native-package-ux-perf.mjs index 3382c7be..278521c1 100644 --- 
a/packages/anonymize/scripts/native-package-ux-perf.mjs +++ b/packages/anonymize/scripts/native-package-ux-perf.mjs @@ -52,8 +52,11 @@ function runScenario({ name, compressed }) { offlinePackageBuildMs: build.timings.nativePackagePrepareMs, firstPackageReadMs: load.timings.nativePackageReadMs, firstPrepareMs: load.timings.nativePrepareMs, + setupBeforeClickMs: + load.timings.nativePackageReadMs + load.timings.nativePrepareMs, cachedPrepareMs: load.timings.nativeCachedPrepareAvgMs, firstRunMs: load.timings.coldRunMs, + preloadedClickMs: load.timings.coldRunMs, firstTouchMs: load.timings.nativeFirstTouchMs, warmClickMs: load.timings.nativeWarmClickMs, prepareTopStages: nativeDiagnostics?.prepare?.topStages ?? [], diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index 455ac13b..4cadba55 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -15,8 +15,10 @@ import { native_package_version, normalize_for_search, preloadDefaultNativePipeline, + preloadDefaultNativePipelineAsync, prepare_search_package, readNativePipelinePackageFile, + readNativePipelinePackageFileAsync, redact_text, redact_text_json, } from "../native-node"; @@ -116,6 +118,20 @@ describe("native node loader", () => { } }); + test("loads native pipeline package bytes from a file asynchronously", async () => { + const dir = mkdtempSync(join(tmpdir(), "anonymize-native-package-")); + const packagePath = join(dir, "pipeline.stlanonpkg"); + try { + writeFileSync(packagePath, Uint8Array.of(4, 3, 2, 1)); + + expect([ + ...(await readNativePipelinePackageFileAsync(packagePath)), + ]).toEqual([4, 3, 2, 1]); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + test("creates a native pipeline from a package file", () => { const dir = mkdtempSync(join(tmpdir(), "anonymize-native-pipeline-")); const packagePath = join(dir, "pipeline.stlanonpkg"); @@ -202,6 
+218,44 @@ describe("native node loader", () => { } }); + test("preloads the default native pipeline asynchronously", async () => { + const dir = mkdtempSync(join(tmpdir(), "anonymize-default-async-cache-")); + const packagePath = join(dir, "native-pipeline.stlanonpkg"); + const capturedBytes: number[][] = []; + try { + writeFileSync(packagePath, Uint8Array.of(16, 17, 18)); + const binding = fakeNativeBinding("1.5.0", { + onPreparedPackageBytes: (bytes) => { + capturedBytes.push([...bytes]); + }, + }); + + const [first, second] = await Promise.all([ + preloadDefaultNativePipelineAsync({ + binding, + packagePath, + expectedVersion: "1.5.0", + }), + preloadDefaultNativePipelineAsync({ + binding, + packagePath, + expectedVersion: "1.5.0", + }), + ]); + const syncCached = getDefaultNativePipeline({ + binding, + packagePath, + expectedVersion: "1.5.0", + }); + + expect(second).toBe(first); + expect(syncCached).toBe(first); + expect(capturedBytes).toEqual([[16, 17, 18]]); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + test("shared SDK helpers delegate through the native binding", () => { const sharedSdkFunctions: Record< (typeof SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS)[number], diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts index 4f07507c..2b414cf9 100644 --- a/packages/anonymize/src/native-node.ts +++ b/packages/anonymize/src/native-node.ts @@ -1,5 +1,6 @@ import { createRequire } from "node:module"; import { readFileSync } from "node:fs"; +import { readFile } from "node:fs/promises"; import process from "node:process"; import { @@ -69,6 +70,10 @@ const defaultNativePipelineCache = new WeakMap< NativeAnonymizeBinding, Map >(); +const defaultNativePipelineInflightCache = new WeakMap< + NativeAnonymizeBinding, + Map> +>(); export { DEFAULT_NATIVE_PIPELINE_CONFIG } from "./native-default-config"; @@ -109,6 +114,10 @@ export const readNativePipelinePackageFile = ( packagePath: string, ): Uint8Array => new 
Uint8Array(readFileSync(packagePath)); +export const readNativePipelinePackageFileAsync = async ( + packagePath: string, +): Promise => new Uint8Array(await readFile(packagePath)); + export const native_package_version = ( options: NativeSdkOptions = {}, ): string => nativePackageVersionWithBinding(resolveNativeSdkBinding(options)); @@ -197,6 +206,19 @@ export const readDefaultNativePipelinePackageFile = (): Uint8Array => { } }; +export const readDefaultNativePipelinePackageFileAsync = + async (): Promise => { + try { + return new Uint8Array( + await readFile(DEFAULT_NATIVE_PIPELINE_PACKAGE_URL), + ); + } catch (error) { + throw new Error( + `Default native pipeline package is unavailable: ${formatLoadError(error)}`, + ); + } + }; + export const createNativePipelineFromPackageFile = ({ binding, packagePath, @@ -218,33 +240,70 @@ export const createNativePipelineFromPackageFile = ({ }); }; -export const createNativePipelineFromDefaultPackage = ({ - binding, - packagePath, - expectedVersion, - ...loadOptions -}: DefaultNativePipelinePackageOptions = {}): PreparedNativePipeline => { - const resolvedBinding = - binding ?? - loadNativeAnonymizeBinding({ - ...loadOptions, - ...(expectedVersion !== undefined ? 
{ expectedVersion } : {}), - }); - if (binding && expectedVersion !== undefined) { - assertNativeBindingVersion({ binding, expectedVersion }); +export const createNativePipelineFromDefaultPackage = ( + options: DefaultNativePipelinePackageOptions = {}, +): PreparedNativePipeline => + createNativePipelineFromResolvedDefaultPackage( + resolveDefaultNativePipelineOptions(options), + ); + +export const getDefaultNativePipeline = ( + options: DefaultNativePipelinePackageOptions = {}, +): PreparedNativePipeline => { + const resolvedOptions = resolveDefaultNativePipelineOptions(options); + const cache = defaultPipelineCacheFor(resolvedOptions.binding); + const key = defaultPipelineCacheKey(resolvedOptions); + const cached = cache.get(key); + if (cached !== undefined) { + return cached; } - return createNativePipelineFromResolvedDefaultPackage({ - binding: resolvedBinding, - ...(packagePath !== undefined ? { packagePath } : {}), - }); + const pipeline = + createNativePipelineFromResolvedDefaultPackage(resolvedOptions); + cache.set(key, pipeline); + return pipeline; }; -export const getDefaultNativePipeline = ({ +export const preloadDefaultNativePipeline = getDefaultNativePipeline; + +export const preloadDefaultNativePipelineAsync = ( + options: DefaultNativePipelinePackageOptions = {}, +): Promise => { + const resolvedOptions = resolveDefaultNativePipelineOptions(options); + const cache = defaultPipelineCacheFor(resolvedOptions.binding); + const key = defaultPipelineCacheKey(resolvedOptions); + const cached = cache.get(key); + if (cached !== undefined) { + return Promise.resolve(cached); + } + + const inflightCache = defaultPipelineInflightCacheFor( + resolvedOptions.binding, + ); + const inflight = inflightCache.get(key); + if (inflight !== undefined) { + return inflight; + } + + const promise = createNativePipelineFromResolvedDefaultPackageAsync( + resolvedOptions, + ) + .then((pipeline) => { + cache.set(key, pipeline); + return pipeline; + }) + .finally(() => { + 
inflightCache.delete(key); + }); + inflightCache.set(key, promise); + return promise; +}; + +const resolveDefaultNativePipelineOptions = ({ binding, packagePath, expectedVersion, ...loadOptions -}: DefaultNativePipelinePackageOptions = {}): PreparedNativePipeline => { +}: DefaultNativePipelinePackageOptions = {}): ResolvedDefaultNativePipelineOptions => { const resolvedBinding = binding ?? loadNativeAnonymizeBinding({ @@ -254,25 +313,12 @@ export const getDefaultNativePipeline = ({ if (binding && expectedVersion !== undefined) { assertNativeBindingVersion({ binding, expectedVersion }); } - const cache = defaultPipelineCacheFor(resolvedBinding); - const key = defaultPipelineCacheKey({ - binding: resolvedBinding, - ...(packagePath !== undefined ? { packagePath } : {}), - }); - const cached = cache.get(key); - if (cached !== undefined) { - return cached; - } - const pipeline = createNativePipelineFromResolvedDefaultPackage({ + return { binding: resolvedBinding, ...(packagePath !== undefined ? { packagePath } : {}), - }); - cache.set(key, pipeline); - return pipeline; + }; }; -export const preloadDefaultNativePipeline = getDefaultNativePipeline; - const createNativePipelineFromResolvedDefaultPackage = ({ binding, packagePath, @@ -287,6 +333,20 @@ const createNativePipelineFromResolvedDefaultPackage = ({ }); }; +const createNativePipelineFromResolvedDefaultPackageAsync = async ({ + binding, + packagePath, +}: ResolvedDefaultNativePipelineOptions): Promise => { + const packageBytes = + packagePath === undefined + ? 
await readDefaultNativePipelinePackageFileAsync() + : await readNativePipelinePackageFileAsync(packagePath); + return createNativePipelineFromPackage({ + binding, + packageBytes, + }); +}; + const resolveNativeSdkBinding = ({ binding, expectedVersion, @@ -316,6 +376,18 @@ const defaultPipelineCacheFor = ( return created; }; +const defaultPipelineInflightCacheFor = ( + binding: NativeAnonymizeBinding, +): Map> => { + const cached = defaultNativePipelineInflightCache.get(binding); + if (cached !== undefined) { + return cached; + } + const created = new Map>(); + defaultNativePipelineInflightCache.set(binding, created); + return created; +}; + const defaultPipelineCacheKey = ({ binding, packagePath, From f67806e92c0a7f04c724df6dcc74d5325f4bfd36 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 08:04:03 +0200 Subject: [PATCH 123/130] perf: select scoped native packages --- packages/anonymize/README.md | 10 ++ packages/anonymize/package.json | 1 + .../anonymize/scripts/build-native-node.mjs | 65 ++++++++++--- .../scripts/native-package-ux-perf.mjs | 39 +++++++- .../src/__test__/native-node.test.ts | 47 +++++++++ packages/anonymize/src/native-node.ts | 96 +++++++++++++++---- 6 files changed, 228 insertions(+), 30 deletions(-) diff --git a/packages/anonymize/README.md b/packages/anonymize/README.md index ecd7fe50..4b291457 100644 --- a/packages/anonymize/README.md +++ b/packages/anonymize/README.md @@ -31,6 +31,16 @@ console.log(result.redaction.redactedText); Call `getDefaultNativePipeline()` once during service startup and reuse the returned anonymizer. The package ships with a prepared native package, so the normal request path avoids rebuilding search automata. 
+If your deployment knows the document language up front, build scoped package artifacts and select them at startup: + +```bash +STELLA_ANONYMIZE_NATIVE_PACKAGE_LANGUAGES=en,cs bun run build +``` + +```ts +const anonymizer = getDefaultNativePipeline({ language: "en" }); +``` + For build-time generated packages or caller-owned data, prepare the package before runtime and load the bytes in the process that handles documents. ```bash diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index e75c2202..c2039652 100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -35,6 +35,7 @@ "index.cjs", "*.node", "native-pipeline.stlanonpkg", + "native-pipeline.*.stlanonpkg", "scripts/build-native-pipeline-package.mjs" ], "publishConfig": { diff --git a/packages/anonymize/scripts/build-native-node.mjs b/packages/anonymize/scripts/build-native-node.mjs index cbf5a3be..b68bb82f 100644 --- a/packages/anonymize/scripts/build-native-node.mjs +++ b/packages/anonymize/scripts/build-native-node.mjs @@ -5,6 +5,9 @@ import { fileURLToPath } from "node:url"; const packageRoot = dirname(dirname(fileURLToPath(import.meta.url))); const repoRoot = dirname(dirname(packageRoot)); +const scopedPackageLanguages = languageListFromEnv( + process.env.STELLA_ANONYMIZE_NATIVE_PACKAGE_LANGUAGES, +); const sourceByPlatform = { darwin: "libstella_anonymize_napi.dylib", @@ -33,16 +36,56 @@ if (!existsSync(source)) { copyFileSync(source, join(packageRoot, "stella_anonymize_napi.node")); -execFileSync( - process.execPath, - [ - join(packageRoot, "scripts", "build-native-pipeline-package.mjs"), +buildNativePipelinePackage([ + "--out", + join(packageRoot, "native-pipeline.stlanonpkg"), + "--default-dictionaries", +]); + +for (const language of scopedPackageLanguages) { + buildNativePipelinePackage([ "--out", - join(packageRoot, "native-pipeline.stlanonpkg"), + join(packageRoot, `native-pipeline.${language}.stlanonpkg`), "--default-dictionaries", - ], - { 
- cwd: packageRoot, - stdio: "inherit", - }, -); + "--language", + language, + ]); +} + +function buildNativePipelinePackage(args) { + execFileSync( + process.execPath, + [ + join(packageRoot, "scripts", "build-native-pipeline-package.mjs"), + ...args, + ], + { + cwd: packageRoot, + stdio: "inherit", + }, + ); +} + +function languageListFromEnv(value) { + if (value === undefined || value.trim().length === 0) { + return []; + } + const languages = value + .split(",") + .map((entry) => normalizeLanguage(entry)) + .filter((entry, index, entries) => entries.indexOf(entry) === index); + if (languages.length === 0) { + throw new Error("STELLA_ANONYMIZE_NATIVE_PACKAGE_LANGUAGES is empty"); + } + return languages; +} + +function normalizeLanguage(value) { + const language = value.trim().toLowerCase(); + if (!/^[a-z0-9]+(?:-[a-z0-9]+)*$/u.test(language)) { + throw new Error( + `Invalid STELLA_ANONYMIZE_NATIVE_PACKAGE_LANGUAGES entry: ${value}`, + ); + } + return language; +} diff --git a/packages/anonymize/scripts/native-package-ux-perf.mjs b/packages/anonymize/scripts/native-package-ux-perf.mjs index 278521c1..db923af5 100644 --- a/packages/anonymize/scripts/native-package-ux-perf.mjs +++ b/packages/anonymize/scripts/native-package-ux-perf.mjs @@ -16,6 +16,7 @@ const MIGRATION_SCRIPT = join( const SCENARIOS = [ { name: "compressed", compressed: true }, { name: "raw", compressed: false }, + ...languageScenarios(), ]; const tempRoot = mkdtempSync(join(tmpdir(), "stella-anonymize-package-ux-")); @@ -32,14 +33,23 @@ try { rmSync(tempRoot, { force: true, recursive: true }); } -function runScenario({ name, compressed }) { +function runScenario({ name, compressed, language }) { const packagePath = join(tempRoot, `${name}.stlanonpkg`); + const languageEnv = + language === undefined + ? 
{} + : { + ANONYMIZE_MIGRATION_CONTENT_LANGUAGE: language, + ANONYMIZE_MIGRATION_FIXTURE_LANGUAGES: language, + }; const build = runMigration({ + ...languageEnv, ANONYMIZE_MIGRATION_NATIVE_COMPRESSED_PACKAGE: compressed ? "1" : "0", ANONYMIZE_MIGRATION_NATIVE_PREPARED_PACKAGE: "1", ANONYMIZE_MIGRATION_WRITE_NATIVE_PACKAGE_PATH: packagePath, }); const load = runMigration({ + ...languageEnv, ANONYMIZE_MIGRATION_NATIVE_PACKAGE_PATH: packagePath, }); const nativeDiagnostics = load.nativeDiagnostics ?? null; @@ -47,6 +57,7 @@ function runScenario({ name, compressed }) { return { name, compressed, + language: language ?? null, fixtureCount: load.fixtureCount, packageBytes: build.timings.nativePackageBytes, offlinePackageBuildMs: build.timings.nativePackagePrepareMs, @@ -70,6 +81,32 @@ function runScenario({ name, compressed }) { }; } +function languageScenarios() { + const value = process.env.ANONYMIZE_NATIVE_PACKAGE_UX_LANGUAGES ?? "en,cs,de"; + if (value.trim().length === 0) { + return []; + } + return value + .split(",") + .map((entry) => normalizeLanguage(entry)) + .filter((entry, index, entries) => entries.indexOf(entry) === index) + .map((language) => ({ + name: `compressed-${language}`, + compressed: true, + language, + })); +} + +function normalizeLanguage(value) { + const language = value.trim().toLowerCase(); + if (!/^[a-z0-9]+(?:-[a-z0-9]+)*$/u.test(language)) { + throw new Error( + `Invalid ANONYMIZE_NATIVE_PACKAGE_UX_LANGUAGES entry: ${value}`, + ); + } + return language; +} + function runMigration(extraEnv) { const child = spawnSync(process.execPath, [MIGRATION_SCRIPT], { cwd: ROOT_DIR, diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index 4cadba55..a522bee5 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -2,6 +2,7 @@ import { describe, expect, test } from "bun:test"; import { mkdtempSync, rmSync, writeFileSync 
} from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; +import { fileURLToPath } from "node:url"; import type { NativeAnonymizeBinding } from "../native"; import { @@ -17,6 +18,7 @@ import { preloadDefaultNativePipeline, preloadDefaultNativePipelineAsync, prepare_search_package, + readDefaultNativePipelinePackageFile, readNativePipelinePackageFile, readNativePipelinePackageFileAsync, redact_text, @@ -256,6 +258,51 @@ describe("native node loader", () => { } }); + test("loads language-scoped default native pipeline packages", () => { + const language = "zz-test"; + const packagePath = fileURLToPath( + new URL(`../../native-pipeline.${language}.stlanonpkg`, import.meta.url), + ); + const capturedBytes: number[][] = []; + try { + writeFileSync(packagePath, Uint8Array.of(31, 32, 33)); + const binding = fakeNativeBinding("1.5.0", { + onPreparedPackageBytes: (bytes) => { + capturedBytes.push([...bytes]); + }, + }); + + const first = getDefaultNativePipeline({ + binding, + language: "ZZ-Test", + expectedVersion: "1.5.0", + }); + const second = getDefaultNativePipeline({ + binding, + language, + expectedVersion: "1.5.0", + }); + + expect(second).toBe(first); + expect(capturedBytes).toEqual([[31, 32, 33]]); + } finally { + rmSync(packagePath, { force: true }); + } + }); + + test("rejects unsafe default native package language selectors", () => { + expect(() => + readDefaultNativePipelinePackageFile({ language: "../en" }), + ).toThrow("Default native pipeline language must match"); + expect(() => + getDefaultNativePipeline({ + binding: fakeNativeBinding("1.5.0"), + language: "en", + packagePath: "/tmp/native-pipeline.stlanonpkg", + }), + ).toThrow("Use either language or packagePath"); + }); + test("shared SDK helpers delegate through the native binding", () => { const sharedSdkFunctions: Record< (typeof SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS)[number], diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts 
index 2b414cf9..8c959a47 100644 --- a/packages/anonymize/src/native-node.ts +++ b/packages/anonymize/src/native-node.ts @@ -51,20 +51,27 @@ export type NativeSdkPackageOptions = NativeSdkOptions & { export type DefaultNativePipelinePackageOptions = LoadNativeBindingOptions & { binding?: NativeAnonymizeBinding; + language?: string; packagePath?: string; }; type ResolvedDefaultNativePipelineOptions = { binding: NativeAnonymizeBinding; + language?: string; packagePath?: string; }; +export type DefaultNativePipelinePackageFileOptions = { + language?: string; +}; + const LOCAL_NATIVE_LOADER = "../index.cjs"; const PACKAGE_SPECIFIC_NATIVE_PATH = "STELLA_ANONYMIZE_NATIVE_LIBRARY_PATH"; const DEFAULT_NATIVE_PIPELINE_PACKAGE_URL = new URL( "../native-pipeline.stlanonpkg", import.meta.url, ); +const DEFAULT_NATIVE_PIPELINE_LANGUAGE_PATTERN = /^[a-z0-9]+(?:-[a-z0-9]+)*$/u; const DEFAULT_NATIVE_PIPELINE_PACKAGE_CACHE_KEY = ""; const defaultNativePipelineCache = new WeakMap< NativeAnonymizeBinding, @@ -196,28 +203,31 @@ export const diagnostics_json = ( ...(operators !== undefined ? 
{ operators } : {}), }); -export const readDefaultNativePipelinePackageFile = (): Uint8Array => { +export const readDefaultNativePipelinePackageFile = ({ + language, +}: DefaultNativePipelinePackageFileOptions = {}): Uint8Array => { + const packageUrl = defaultNativePipelinePackageUrl(language); try { - return new Uint8Array(readFileSync(DEFAULT_NATIVE_PIPELINE_PACKAGE_URL)); + return new Uint8Array(readFileSync(packageUrl)); } catch (error) { throw new Error( - `Default native pipeline package is unavailable: ${formatLoadError(error)}`, + `${defaultNativePipelinePackageDescription(language)} is unavailable: ${formatLoadError(error)}`, ); } }; -export const readDefaultNativePipelinePackageFileAsync = - async (): Promise => { - try { - return new Uint8Array( - await readFile(DEFAULT_NATIVE_PIPELINE_PACKAGE_URL), - ); - } catch (error) { - throw new Error( - `Default native pipeline package is unavailable: ${formatLoadError(error)}`, - ); - } - }; +export const readDefaultNativePipelinePackageFileAsync = async ({ + language, +}: DefaultNativePipelinePackageFileOptions = {}): Promise => { + const packageUrl = defaultNativePipelinePackageUrl(language); + try { + return new Uint8Array(await readFile(packageUrl)); + } catch (error) { + throw new Error( + `${defaultNativePipelinePackageDescription(language)} is unavailable: ${formatLoadError(error)}`, + ); + } +}; export const createNativePipelineFromPackageFile = ({ binding, @@ -300,10 +310,14 @@ export const preloadDefaultNativePipelineAsync = ( const resolveDefaultNativePipelineOptions = ({ binding, + language, packagePath, expectedVersion, ...loadOptions }: DefaultNativePipelinePackageOptions = {}): ResolvedDefaultNativePipelineOptions => { + if (language !== undefined && packagePath !== undefined) { + throw new Error("Use either language or packagePath, not both"); + } const resolvedBinding = binding ?? 
loadNativeAnonymizeBinding({ @@ -315,17 +329,23 @@ const resolveDefaultNativePipelineOptions = ({ } return { binding: resolvedBinding, + ...(language !== undefined + ? { language: normalizeDefaultNativePipelineLanguage(language) } + : {}), ...(packagePath !== undefined ? { packagePath } : {}), }; }; const createNativePipelineFromResolvedDefaultPackage = ({ binding, + language, packagePath, }: ResolvedDefaultNativePipelineOptions): PreparedNativePipeline => { const packageBytes = packagePath === undefined - ? readDefaultNativePipelinePackageFile() + ? readDefaultNativePipelinePackageFile( + defaultPackageFileOptions(language), + ) : readNativePipelinePackageFile(packagePath); return createNativePipelineFromPackage({ binding, @@ -335,11 +355,14 @@ const createNativePipelineFromResolvedDefaultPackage = ({ const createNativePipelineFromResolvedDefaultPackageAsync = async ({ binding, + language, packagePath, }: ResolvedDefaultNativePipelineOptions): Promise => { const packageBytes = packagePath === undefined - ? await readDefaultNativePipelinePackageFileAsync() + ? await readDefaultNativePipelinePackageFileAsync( + defaultPackageFileOptions(language), + ) : await readNativePipelinePackageFileAsync(packagePath); return createNativePipelineFromPackage({ binding, @@ -347,6 +370,11 @@ const createNativePipelineFromResolvedDefaultPackageAsync = async ({ }); }; +const defaultPackageFileOptions = ( + language: string | undefined, +): DefaultNativePipelinePackageFileOptions => + language === undefined ? {} : { language }; + const resolveNativeSdkBinding = ({ binding, expectedVersion, @@ -390,13 +418,45 @@ const defaultPipelineInflightCacheFor = ( const defaultPipelineCacheKey = ({ binding, + language, packagePath, }: ResolvedDefaultNativePipelineOptions): string => [ binding.nativePackageVersion(), - packagePath ?? DEFAULT_NATIVE_PIPELINE_PACKAGE_CACHE_KEY, + packagePath ?? + (language === undefined + ? 
DEFAULT_NATIVE_PIPELINE_PACKAGE_CACHE_KEY + : `language:${language}`), ].join("\0"); +const defaultNativePipelinePackageUrl = (language: string | undefined): URL => { + if (language === undefined) { + return DEFAULT_NATIVE_PIPELINE_PACKAGE_URL; + } + const normalized = normalizeDefaultNativePipelineLanguage(language); + return new URL( + `../native-pipeline.${normalized}.stlanonpkg`, + import.meta.url, + ); +}; + +const defaultNativePipelinePackageDescription = ( + language: string | undefined, +): string => + language === undefined + ? "Default native pipeline package" + : `Default native pipeline package for language "${normalizeDefaultNativePipelineLanguage(language)}"`; + +const normalizeDefaultNativePipelineLanguage = (language: string): string => { + const normalized = language.trim().toLowerCase(); + if (!DEFAULT_NATIVE_PIPELINE_LANGUAGE_PATTERN.test(normalized)) { + throw new Error( + `Default native pipeline language must match ${DEFAULT_NATIVE_PIPELINE_LANGUAGE_PATTERN.source}`, + ); + } + return normalized; +}; + type NativeBindingSpecifiersOptions = { env: Record; }; From 6129f2dc15633695898733439de867f39db6f157 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 08:29:06 +0200 Subject: [PATCH 124/130] perf: add static regex prefilters --- .../src/__test__/pipeline-config.test.ts | 28 +++++ .../anonymize/src/build-unified-search.ts | 4 +- packages/anonymize/src/detectors/regex.ts | 119 +++++++++++++++++- 3 files changed, 148 insertions(+), 3 deletions(-) diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index 28416f14..1abfd7b5 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -12,6 +12,7 @@ import { import { buildUnifiedSearch } from "../build-unified-search"; import { REGEX_META, + REGEX_PATTERNS, getNativeSigningClausePatterns, getSigningClausePatterns, } from 
"../detectors/regex"; @@ -388,6 +389,33 @@ describe("pipeline config semantics", () => { ).toEqual([]); }); + test("native config carries static regex prefilter metadata", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: true, + labels: ["email address"], + }, + [], + createPipelineContext(), + ); + + expect(REGEX_PATTERNS.every((pattern) => typeof pattern === "string")).toBe( + true, + ); + const emailPattern = search.nativeStaticConfig.regex_patterns.find( + (pattern) => + pattern.kind === "regex" && + pattern.pattern === "\\b[\\w.+\\-]+@[\\w\\-]+(?:\\.[\\w\\-]+)+\\b", + ); + + expect(emailPattern).toMatchObject({ + lazy: true, + prefilter_any: ["@"], + prefilter_case_insensitive: false, + }); + }); + test("native trigger config carries currency terms and monetary extension data", async () => { const search = await buildUnifiedSearch( { diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 303cf871..aabf1aff 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -42,7 +42,7 @@ import { LEGAL_SUFFIXES } from "./config/legal-forms"; import { loadLanguageConfigs } from "./util/lang-loader"; import { - REGEX_PATTERNS, + REGEX_PATTERN_ENTRIES, REGEX_META, NATIVE_REGEX_VALIDATOR_IDS, getCurrencyPatternEntries, @@ -769,7 +769,7 @@ const buildUnifiedSearchSources = async ( const allRegex: PatternEntry[] = []; const regexMeta: RegexMeta[] = []; if (config.enableRegex) { - for (const [index, pattern] of REGEX_PATTERNS.entries()) { + for (const [index, pattern] of REGEX_PATTERN_ENTRIES.entries()) { const meta = REGEX_META[index]; if (!meta || !labelIsAllowed(meta.label, allowedLabels)) { continue; diff --git a/packages/anonymize/src/detectors/regex.ts b/packages/anonymize/src/detectors/regex.ts index 914f8e9e..9c119e8e 100644 --- a/packages/anonymize/src/detectors/regex.ts +++ 
b/packages/anonymize/src/detectors/regex.ts @@ -1,4 +1,4 @@ -import type { Match } from "@stll/text-search"; +import type { Match, PatternEntry } from "@stll/text-search"; import type { Validator } from "@stll/stdnum"; import { at, @@ -175,12 +175,25 @@ type RegexDef = { label: string; score: number; minByteLength?: number; + lazy?: true; + prefilterAny?: readonly string[]; + prefilterCaseInsensitive?: boolean; + prefilterRegex?: RegExp; validator?: Validator; validatorId?: string; validatorInput?: (text: string) => string; validatorInputKind?: "digits-only" | "crypto-wallet-candidate"; }; +type RegexPatternEntry = { + pattern: string; + literal?: false; + lazy?: true; + prefilterAny?: readonly string[]; + prefilterCaseInsensitive?: boolean; + prefilterRegex?: RegExp; +}; + type AmountWordsConfig = { patterns?: Array<{ lang: string; @@ -625,6 +638,9 @@ const EMAIL: RegexDef = { pattern: `\\b[\\w.+\\-]+@[\\w\\-]+(?:\\.[\\w\\-]+)+\\b`, label: "email address", score: 1, + lazy: true, + prefilterAny: ["@"], + prefilterCaseInsensitive: false, }; // [^\S\n] instead of \s: separators must not @@ -669,6 +685,9 @@ const TEL_PREFIX_PHONE: RegexDef = { label: "phone number", score: 0.95, minByteLength: MIN_PHONE_LENGTH, + lazy: true, + prefilterAny: ["tel", "telefon"], + prefilterCaseInsensitive: true, }; /** @@ -800,6 +819,9 @@ const ES_POSTAL: RegexDef = { `[^\\S\\n]{0,3}:?[^\\S\\n]{0,3}\\d{5}\\b`, label: "address", score: 0.7, + lazy: true, + prefilterAny: ["C.P", "CP", "código postal", "codigo postal"], + prefilterCaseInsensitive: true, }; // Spanish DNI: 8 digits + 1 letter. 
Letter is a @@ -843,6 +865,9 @@ const NHS_NUMBER_CONTEXT: RegexDef = { validator: gb.nhs, validatorInput: DIGITS_ONLY_VALIDATOR_INPUT, validatorInputKind: "digits-only", + lazy: true, + prefilterAny: ["NHS", "National Health Service"], + prefilterCaseInsensitive: true, }; const PASSPORT_CONTEXT: RegexDef = { @@ -856,6 +881,9 @@ const PASSPORT_CONTEXT: RegexDef = { `(?:[A-Za-z]{1,2}\\d{6,8}|\\d{2}[A-Za-z]{2}\\d{5}|\\d{7,9})\\b`, label: "passport number", score: 0.96, + lazy: true, + prefilterAny: ["passport"], + prefilterCaseInsensitive: true, }; const FR_CNI_CONTEXT: RegexDef = { @@ -870,6 +898,9 @@ const FR_CNI_CONTEXT: RegexDef = { `)\\b`, label: "identity card number", score: 0.96, + lazy: true, + prefilterAny: ["CNI", "carte nationale", "French national identity card"], + prefilterCaseInsensitive: true, }; const CY_TIC_CONTEXT: RegexDef = { @@ -881,6 +912,9 @@ const CY_TIC_CONTEXT: RegexDef = { `\\d{8}[A-Za-z]\\b`, label: "tax identification number", score: 0.96, + lazy: true, + prefilterAny: ["TIC", "tax identification code"], + prefilterCaseInsensitive: true, }; const CY_ID_CARD_CONTEXT: RegexDef = { @@ -892,6 +926,9 @@ const CY_ID_CARD_CONTEXT: RegexDef = { `\\d{6,8}\\b`, label: "identity card number", score: 0.96, + lazy: true, + prefilterAny: ["Cyprus", "Cypriot", "identity card", "ID card"], + prefilterCaseInsensitive: true, }; const UK_DRIVING_LICENCE_CONTEXT: RegexDef = { @@ -903,6 +940,9 @@ const UK_DRIVING_LICENCE_CONTEXT: RegexDef = { `[A-Za-z9]{5}\\d{6}[A-Za-z0-9]{2}\\d[A-Za-z]{2}\\b`, label: "identity card number", score: 0.96, + lazy: true, + prefilterAny: ["driving licence", "driving license"], + prefilterCaseInsensitive: true, }; const US_DRIVER_LICENSE_CONTEXT: RegexDef = { @@ -917,6 +957,20 @@ const US_DRIVER_LICENSE_CONTEXT: RegexDef = { `)\\b`, label: "identity card number", score: 0.8, + lazy: true, + prefilterAny: [ + "driver license", + "driver licence", + "drivers license", + "drivers licence", + "driver's license", + "driver's licence", 
+ "driver’s license", + "driver’s licence", + "driving license", + "driving licence", + ], + prefilterCaseInsensitive: true, }; const MEDICAL_LICENSE_CONTEXT: RegexDef = { @@ -931,6 +985,18 @@ const MEDICAL_LICENSE_CONTEXT: RegexDef = { `(?:[A-Za-z]{0,3}\\d{5,8}|\\d{2}[A-Za-z]\\d{4}[A-Za-z])\\b`, label: "registration number", score: 0.85, + lazy: true, + prefilterAny: [ + "GMC", + "NMC", + "medical", + "physician", + "doctor", + "surgeon", + "nursing", + "nurse", + ], + prefilterCaseInsensitive: true, }; const CRYPTO_WALLET_CANDIDATE = crypto.wallet.candidatePattern ?? "(?!)"; @@ -950,6 +1016,9 @@ const CRYPTO_WALLET_ADDRESS: RegexDef = { validator: crypto.wallet, validatorInput: getCryptoWalletCandidate, validatorInputKind: "crypto-wallet-candidate", + lazy: true, + prefilterAny: ["0x", "bc1", "BTC", "Bitcoin", "crypto", "wallet", "address"], + prefilterCaseInsensitive: true, }; const AU_ABN_FORMATTED: RegexDef = { @@ -973,6 +1042,9 @@ const NO_MVA_FORMATTED: RegexDef = { label: "tax identification number", score: 0.95, validator: no.mva, + lazy: true, + prefilterAny: ["MVA"], + prefilterCaseInsensitive: false, }; const US_EIN_FORMATTED: RegexDef = { @@ -1035,6 +1107,9 @@ const BR_RG_WITH_SSP: RegexDef = { `[^\\S\\n]+SSP(?:/[A-Z]{2})?\\b`, label: "national identification number", score: 0.95, + lazy: true, + prefilterAny: ["SSP"], + prefilterCaseInsensitive: false, }; // Brazilian OAB (lawyer registration). 
Format: @@ -1047,6 +1122,9 @@ const BR_OAB: RegexDef = { `(?:\\d{1,3}(?:\\.\\d{3})+|\\d{4,6})\\b`, label: "registration number", score: 0.95, + lazy: true, + prefilterAny: ["OAB/"], + prefilterCaseInsensitive: false, }; // URL: scheme + host + optional port + path + query + @@ -1064,6 +1142,9 @@ const URL: RegexDef = { `(?:[/?#][^\\s)\\]>]*[^\\s.,;:!?)\\]>])?`, label: "url", score: 1, + lazy: true, + prefilterAny: ["http://", "https://", "http:", "https:", "www."], + prefilterCaseInsensitive: false, }; // Bare domain: no protocol/www prefix, ends with a @@ -1177,6 +1258,9 @@ const TIME_12H: RegexDef = { `(?=[\\s,;!?)]|$)`, label: "date", score: 0.9, + lazy: true, + prefilterAny: ["am", "pm", "a.m", "p.m"], + prefilterCaseInsensitive: true, }; const PERCENT_NUMBER_BODY = `(?:\\d{1,3}(?:[.,]\\d{3})+(?:[.,]\\d{1,4})?|\\d+(?:[.,]\\d{1,4})?)`; @@ -1229,6 +1313,9 @@ const PERCENT_RATE: RegexDef = { `)(?![\\p{L}\\p{N}_])`, label: "monetary amount", score: 0.85, + lazy: true, + prefilterAny: ["%"], + prefilterCaseInsensitive: false, }; // ── Collected definitions ──────────────────────────── @@ -1304,6 +1391,36 @@ export const REGEX_PATTERNS: readonly string[] = ALL_REGEX_DEFS.map( (d) => d.pattern, ); +const toRegexPatternEntry = (definition: RegexDef): PatternEntry => { + if ( + definition.lazy === undefined && + definition.prefilterAny === undefined && + definition.prefilterCaseInsensitive === undefined && + definition.prefilterRegex === undefined + ) { + return definition.pattern; + } + + const entry: RegexPatternEntry = { pattern: definition.pattern }; + if (definition.lazy !== undefined) { + entry.lazy = definition.lazy; + } + if (definition.prefilterAny !== undefined) { + entry.prefilterAny = definition.prefilterAny; + } + if (definition.prefilterCaseInsensitive !== undefined) { + entry.prefilterCaseInsensitive = definition.prefilterCaseInsensitive; + } + if (definition.prefilterRegex !== undefined) { + entry.prefilterRegex = definition.prefilterRegex; + } + return 
entry; +}; + +/** Static regex entries with compile-time prefilter hints. */ +export const REGEX_PATTERN_ENTRIES: readonly PatternEntry[] = + ALL_REGEX_DEFS.map(toRegexPatternEntry); + /** Parallel metadata. Index = pattern index. */ export const REGEX_META: readonly RegexMeta[] = ALL_REGEX_DEFS.map( (d): RegexMeta => { From f3611fd0485df8583ce86fb2c705ce36f7ebc663 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 08:29:14 +0200 Subject: [PATCH 125/130] perf: warm native regex preloads --- crates/anonymize-core/src/prepared.rs | 8 +++++++ crates/anonymize-core/src/search.rs | 10 +++++++++ crates/anonymize-napi/src/lib.rs | 8 +++++++ .../python/stella_anonymize/__init__.py | 3 +++ .../python/stella_anonymize/__init__.pyi | 1 + .../python/stella_anonymize/_native.pyi | 1 + crates/anonymize-py/src/lib.rs | 7 ++++++ .../scripts/migration-fixture-perf.mjs | 16 ++++++++++++++ .../src/__test__/native-node.test.ts | 20 ++++++++++++++--- packages/anonymize/src/native-node.ts | 9 +++++++- packages/anonymize/src/native-sdk-contract.ts | 1 + packages/anonymize/src/native.ts | 22 +++++++++++++++++++ 12 files changed, 102 insertions(+), 4 deletions(-) diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs index 030423db..7ffac5ba 100644 --- a/crates/anonymize-core/src/prepared.rs +++ b/crates/anonymize-core/src/prepared.rs @@ -301,6 +301,14 @@ impl PreparedSearch { Self::new_inner(config, None, None) } + pub fn warm_lazy_regex(&self) -> Result<()> { + self.regex.warm_lazy_regex()?; + self.custom_regex.warm_lazy_regex()?; + self.legal_forms.warm_lazy_regex()?; + self.triggers.warm_lazy_regex()?; + self.literals.warm_lazy_regex() + } + pub fn prepare_artifacts( config: PreparedSearchConfig, ) -> Result { diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs index 5e737f73..7c0d8980 100644 --- a/crates/anonymize-core/src/search.rs +++ b/crates/anonymize-core/src/search.rs @@ -338,6 +338,16 @@ impl 
SearchIndex { Ok(false) } + pub fn warm_lazy_regex(&self) -> Result<()> { + for slot in &self.slots { + slot + .search + .warm_lazy_regex() + .map_err(|error| search_error(&error))?; + } + Ok(()) + } + #[must_use] pub fn len(&self) -> usize { self diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs index 386f0f3b..484a752f 100644 --- a/crates/anonymize-napi/src/lib.rs +++ b/crates/anonymize-napi/src/lib.rs @@ -602,6 +602,14 @@ impl NativePreparedSearch { .map_err(|error| to_napi_serde_error(&error)) } + #[napi] + pub fn warm_lazy_regex(&self) -> Result<()> { + self + .inner + .warm_lazy_regex() + .map_err(|error| to_napi_core_error(&error)) + } + #[napi] #[allow(clippy::needless_pass_by_value)] pub fn redact_static_entities( diff --git a/crates/anonymize-py/python/stella_anonymize/__init__.py b/crates/anonymize-py/python/stella_anonymize/__init__.py index 93587b68..ef20a12d 100644 --- a/crates/anonymize-py/python/stella_anonymize/__init__.py +++ b/crates/anonymize-py/python/stella_anonymize/__init__.py @@ -84,6 +84,9 @@ def from_prepared_package_bytes( def prepare_diagnostics_json(self) -> str: return self._prepared.prepare_diagnostics_json() + def warm_lazy_regex(self) -> None: + self._prepared.warm_lazy_regex() + def redact_text( self, full_text: str, diff --git a/crates/anonymize-py/python/stella_anonymize/__init__.pyi b/crates/anonymize-py/python/stella_anonymize/__init__.pyi index b0efc661..4c07bb8d 100644 --- a/crates/anonymize-py/python/stella_anonymize/__init__.pyi +++ b/crates/anonymize-py/python/stella_anonymize/__init__.pyi @@ -40,6 +40,7 @@ class PreparedAnonymizer: package_bytes: BytesLike, ) -> PreparedAnonymizer: ... def prepare_diagnostics_json(self) -> str: ... + def warm_lazy_regex(self) -> None: ... 
def redact_text( self, full_text: str, diff --git a/crates/anonymize-py/python/stella_anonymize/_native.pyi b/crates/anonymize-py/python/stella_anonymize/_native.pyi index e871a0f2..bbdff4d8 100644 --- a/crates/anonymize-py/python/stella_anonymize/_native.pyi +++ b/crates/anonymize-py/python/stella_anonymize/_native.pyi @@ -60,6 +60,7 @@ class PreparedSearch: package_bytes: BytesLike, ) -> PreparedSearch: ... def prepare_diagnostics_json(self) -> str: ... + def warm_lazy_regex(self) -> None: ... def redact_static_entities( self, full_text: str, diff --git a/crates/anonymize-py/src/lib.rs b/crates/anonymize-py/src/lib.rs index 4cf4659a..1c736fa6 100644 --- a/crates/anonymize-py/src/lib.rs +++ b/crates/anonymize-py/src/lib.rs @@ -138,6 +138,13 @@ impl PyPreparedSearch { .map_err(|error| to_py_serde_error(&error)) } + fn warm_lazy_regex(&self) -> PyResult<()> { + self + .inner + .warm_lazy_regex() + .map_err(|error| to_py_core_error(&error)) + } + fn redact_static_entities( &self, full_text: &str, diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 6a32a1e8..7d903b42 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -1739,6 +1739,7 @@ function createNativeStaticRunnerFromJson(configJson, stringifyMs = 0) { }; const prepareStart = Bun.nanoseconds(); const prepared = prepare(); + warmNativePreparedSearch(prepared); const prepareMs = elapsedMs(prepareStart); const prepareDiagnostics = JSON.parse(prepared.prepareDiagnosticsJson()); const cachedPrepareMsByIteration = []; @@ -1746,6 +1747,7 @@ function createNativeStaticRunnerFromJson(configJson, stringifyMs = 0) { for (let index = 0; index < CACHED_PREPARE_ITERATIONS; index += 1) { const cachedPrepareStart = Bun.nanoseconds(); const cachedPrepared = prepare(); + warmNativePreparedSearch(cachedPrepared); cachedPrepareMsByIteration.push(elapsedMs(cachedPrepareStart)); 
cachedPrepareDiagnostics = JSON.parse( cachedPrepared.prepareDiagnosticsJson(), @@ -1801,6 +1803,7 @@ function createNativeStaticRunnerFromJsonBytes(configBytes) { }; const prepareStart = Bun.nanoseconds(); const prepared = prepare(configBytes); + warmNativePreparedSearch(prepared); const prepareMs = elapsedMs(prepareStart); const prepareDiagnostics = JSON.parse(prepared.prepareDiagnosticsJson()); const cachedPrepareMsByIteration = []; @@ -1808,6 +1811,7 @@ function createNativeStaticRunnerFromJsonBytes(configBytes) { for (let index = 0; index < CACHED_PREPARE_ITERATIONS; index += 1) { const cachedPrepareStart = Bun.nanoseconds(); const cachedPrepared = prepare(configBytes); + warmNativePreparedSearch(cachedPrepared); cachedPrepareMsByIteration.push(elapsedMs(cachedPrepareStart)); cachedPrepareDiagnostics = JSON.parse( cachedPrepared.prepareDiagnosticsJson(), @@ -1834,6 +1838,7 @@ function createNativeStaticRunnerFromPackageBytes(packageBytes) { native.NativePreparedSearch.fromPreparedPackageBytes(packageBytes); const prepareStart = Bun.nanoseconds(); const prepared = prepare(); + warmNativePreparedSearch(prepared); const prepareMs = elapsedMs(prepareStart); const prepareDiagnostics = JSON.parse(prepared.prepareDiagnosticsJson()); const cachedPrepareMsByIteration = []; @@ -1841,6 +1846,7 @@ function createNativeStaticRunnerFromPackageBytes(packageBytes) { for (let index = 0; index < CACHED_PREPARE_ITERATIONS; index += 1) { const cachedPrepareStart = Bun.nanoseconds(); const cachedPrepared = prepare(); + warmNativePreparedSearch(cachedPrepared); cachedPrepareMsByIteration.push(elapsedMs(cachedPrepareStart)); cachedPrepareDiagnostics = JSON.parse( cachedPrepared.prepareDiagnosticsJson(), @@ -1861,6 +1867,16 @@ function createNativeStaticRunnerFromPackageBytes(packageBytes) { }; } +function warmNativePreparedSearch(prepared) { + const warmLazyRegex = + typeof prepared.warmLazyRegex === "function" + ? 
prepared.warmLazyRegex + : prepared.warm_lazy_regex; + if (typeof warmLazyRegex === "function") { + warmLazyRegex.call(prepared); + } +} + function writeNativePackageIfRequested(packageBytes) { if (packageBytes !== null && WRITE_NATIVE_PACKAGE_PATH.length > 0) { writeFileSync(WRITE_NATIVE_PACKAGE_PATH, packageBytes); diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts index a522bee5..c22381cd 100644 --- a/packages/anonymize/src/__test__/native-node.test.ts +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -190,10 +190,14 @@ describe("native node loader", () => { const capturedBytes: number[][] = []; try { writeFileSync(packagePath, Uint8Array.of(13, 14, 15)); + let warmCount = 0; const binding = fakeNativeBinding("1.5.0", { onPreparedPackageBytes: (bytes) => { capturedBytes.push([...bytes]); }, + onWarmLazyRegex: () => { + warmCount += 1; + }, }); const first = getDefaultNativePipeline({ @@ -215,6 +219,7 @@ describe("native node loader", () => { expect(second).toBe(first); expect(preloaded).toBe(first); expect(capturedBytes).toEqual([[13, 14, 15]]); + expect(warmCount).toBe(1); } finally { rmSync(dir, { recursive: true, force: true }); } @@ -226,10 +231,14 @@ describe("native node loader", () => { const capturedBytes: number[][] = []; try { writeFileSync(packagePath, Uint8Array.of(16, 17, 18)); + let warmCount = 0; const binding = fakeNativeBinding("1.5.0", { onPreparedPackageBytes: (bytes) => { capturedBytes.push([...bytes]); }, + onWarmLazyRegex: () => { + warmCount += 1; + }, }); const [first, second] = await Promise.all([ @@ -253,6 +262,7 @@ describe("native node loader", () => { expect(second).toBe(first); expect(syncCached).toBe(first); expect(capturedBytes).toEqual([[16, 17, 18]]); + expect(warmCount).toBe(1); } finally { rmSync(dir, { recursive: true, force: true }); } @@ -404,6 +414,7 @@ type FakeNativeBindingOptions = { preparedSearchAsConstructor?: boolean; 
compressedPackageBytes?: Uint8Array; onPreparedPackageBytes?: (bytes: Uint8Array) => void; + onWarmLazyRegex?: () => void; }; const fakeNativeBinding = ( @@ -411,10 +422,10 @@ const fakeNativeBinding = ( options: FakeNativeBindingOptions = {}, ): NativeAnonymizeBinding => { const preparedSearch = { - fromConfigJsonBytes: () => fakePreparedSearch(), + fromConfigJsonBytes: () => fakePreparedSearch(options.onWarmLazyRegex), fromPreparedPackageBytes: (bytes: Uint8Array) => { options.onPreparedPackageBytes?.(bytes); - return fakePreparedSearch(); + return fakePreparedSearch(options.onWarmLazyRegex); }, }; const NativePreparedSearch = options.preparedSearchAsConstructor @@ -431,8 +442,11 @@ const fakeNativeBinding = ( }; }; -const fakePreparedSearch = () => ({ +const fakePreparedSearch = (onWarmLazyRegex?: () => void) => ({ prepareDiagnosticsJson: () => JSON.stringify({ events: [] }), + warmLazyRegex: () => { + onWarmLazyRegex?.(); + }, redactStaticEntities: emptyStaticRedactionBindingResult, redactStaticEntitiesDiagnosticsJson: emptyStaticRedactionDiagnosticJson, }); diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts index 8c959a47..af98e2e6 100644 --- a/packages/anonymize/src/native-node.ts +++ b/packages/anonymize/src/native-node.ts @@ -273,7 +273,13 @@ export const getDefaultNativePipeline = ( return pipeline; }; -export const preloadDefaultNativePipeline = getDefaultNativePipeline; +export const preloadDefaultNativePipeline = ( + options: DefaultNativePipelinePackageOptions = {}, +): PreparedNativePipeline => { + const pipeline = getDefaultNativePipeline(options); + pipeline.warmLazyRegex(); + return pipeline; +}; export const preloadDefaultNativePipelineAsync = ( options: DefaultNativePipelinePackageOptions = {}, @@ -298,6 +304,7 @@ export const preloadDefaultNativePipelineAsync = ( resolvedOptions, ) .then((pipeline) => { + pipeline.warmLazyRegex(); cache.set(key, pipeline); return pipeline; }) diff --git 
a/packages/anonymize/src/native-sdk-contract.ts b/packages/anonymize/src/native-sdk-contract.ts index 35dd7e71..1c2df34c 100644 --- a/packages/anonymize/src/native-sdk-contract.ts +++ b/packages/anonymize/src/native-sdk-contract.ts @@ -18,6 +18,7 @@ export const SHARED_NATIVE_SDK_PREPARED_METHODS = [ "redact_text_json", "diagnostics_json", "prepare_diagnostics_json", + "warm_lazy_regex", ] as const; export const SHARED_NATIVE_SDK_CLASS_NAMES = [ diff --git a/packages/anonymize/src/native.ts b/packages/anonymize/src/native.ts index a11f6b82..afbe37a3 100644 --- a/packages/anonymize/src/native.ts +++ b/packages/anonymize/src/native.ts @@ -60,6 +60,8 @@ type CanonicalStaticRedactionResult = { export type NativePreparedSearchBinding = { prepareDiagnosticsJson?: () => string; + warmLazyRegex?: () => void; + warm_lazy_regex?: () => void; redactStaticEntities: ( fullText: string, operators?: NativeBindingOperatorConfig, @@ -190,6 +192,18 @@ export class PreparedNativeAnonymizer { return this.prepareDiagnosticsJson(); } + warmLazyRegex(): void { + if (this.#prepared.warmLazyRegex) { + this.#prepared.warmLazyRegex(); + return; + } + this.#prepared.warm_lazy_regex?.(); + } + + warm_lazy_regex(): void { + this.warmLazyRegex(); + } + redactStaticEntities( fullText: string, operators?: NativeOperatorConfig, @@ -262,6 +276,14 @@ export class PreparedNativePipeline { return this.prepareDiagnosticsJson(); } + warmLazyRegex(): void { + this.#anonymizer.warmLazyRegex(); + } + + warm_lazy_regex(): void { + this.warmLazyRegex(); + } + redactText( fullText: string, operators?: NativeOperatorConfig, From 97f0735dcfa0cd82d78a6256b0386d7a1a3606c0 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 08:42:22 +0200 Subject: [PATCH 126/130] perf: report native package warmup scenarios --- crates/anonymize-py/README.md | 5 +- packages/anonymize/README.md | 4 +- .../scripts/migration-fixture-perf.mjs | 51 ++++++++++++++++--- .../scripts/native-package-ux-perf.mjs | 43 
+++++++++++++++- 4 files changed, 94 insertions(+), 9 deletions(-) diff --git a/crates/anonymize-py/README.md b/crates/anonymize-py/README.md index b327bd45..29ca5356 100644 --- a/crates/anonymize-py/README.md +++ b/crates/anonymize-py/README.md @@ -17,6 +17,7 @@ import stella_anonymize as anonymize package_bytes = anonymize.prepare_search_package(config_json) prepared = anonymize.load_prepared_package(package_bytes) +prepared.warm_lazy_regex() result = prepared.redact_text(text, redact_string="***") print(result.redaction.redacted_text) @@ -28,16 +29,18 @@ For prepared package files: import stella_anonymize as anonymize prepared = anonymize.load_prepared_package_file("anonymize.stlanonpkg") +prepared.warm_lazy_regex() result_json = prepared.redact_text_json(text) ``` -Top-level `redact_text()` and `redact_text_json()` are available for one-off calls, but they prepare from config on each invocation. Use `load_prepared_package()` or `load_prepared_package_file()` for repeated document processing. +Top-level `redact_text()` and `redact_text_json()` are available for one-off calls, but they prepare from config on each invocation. Use `load_prepared_package()` or `load_prepared_package_file()` for repeated document processing, then call `warm_lazy_regex()` before the first document when startup can absorb that cost. 
## API - `prepare_search_package(config_json, compressed=True) -> bytes` - `load_prepared_package(package_bytes) -> PreparedAnonymizer` - `load_prepared_package_file(package_path) -> PreparedAnonymizer` +- `PreparedAnonymizer.warm_lazy_regex()` - `PreparedAnonymizer.redact_text(text, operators=None, redact_string=None)` - `PreparedAnonymizer.redact_text_json(text, operators=None, redact_string=None)` - `PreparedAnonymizer.diagnostics_json(text, operators=None, redact_string=None)` diff --git a/packages/anonymize/README.md b/packages/anonymize/README.md index 4b291457..89aa8349 100644 --- a/packages/anonymize/README.md +++ b/packages/anonymize/README.md @@ -29,7 +29,7 @@ const result = anonymizer.redact_text(text); console.log(result.redaction.redactedText); ``` -Call `getDefaultNativePipeline()` once during service startup and reuse the returned anonymizer. The package ships with a prepared native package, so the normal request path avoids rebuilding search automata. +Call `getDefaultNativePipeline()` once during service startup and reuse the returned anonymizer. The package ships with a prepared native package, so the normal request path avoids rebuilding search automata. Use `preloadDefaultNativePipeline()` or `preloadDefaultNativePipelineAsync()` when the first document should not pay lazy regex warm-up. 
If your deployment knows the document language up front, build scoped package artifacts and select them at startup: @@ -53,6 +53,7 @@ bunx stella-anonymize-build-native-package \ import { load_prepared_package_file } from "@stll/anonymize/native-node"; const anonymizer = load_prepared_package_file("./dist/anonymize.stlanonpkg"); +anonymizer.warmLazyRegex(); const result = anonymizer.redact_text(text, { redactString: "***" }); ``` @@ -65,6 +66,7 @@ import stella_anonymize as anonymize package_bytes = anonymize.prepare_search_package(config_json) prepared = anonymize.load_prepared_package(package_bytes) +prepared.warm_lazy_regex() result = prepared.redact_text(text, redact_string="***") print(result.redaction.redacted_text) diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs index 7d903b42..be634901 100644 --- a/packages/anonymize/scripts/migration-fixture-perf.mjs +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -373,8 +373,11 @@ async function runWorker() { const nativeArtifactBytes = runtimeRunner?.artifactBytes ?? 0; const nativePackagePrepareMs = runtimeRunner?.packagePrepareMs ?? 0; const nativePackageBytes = runtimeRunner?.packageBytes ?? 0; + const nativeWarmPrepareMs = runtimeRunner?.warmPrepareMs ?? 0; const nativeCachedPrepareMsByIteration = runtimeRunner?.cachedPrepareMsByIteration ?? []; + const nativeCachedWarmPrepareMsByIteration = + runtimeRunner?.cachedWarmPrepareMsByIteration ?? []; const nativeCachedPrepareAvgMs = nativeCachedPrepareMsByIteration.length === 0 ? 0 @@ -384,6 +387,15 @@ async function runWorker() { 0, ) / nativeCachedPrepareMsByIteration.length, ); + const nativeCachedWarmPrepareAvgMs = + nativeCachedWarmPrepareMsByIteration.length === 0 + ? 
0 + : roundMs( + nativeCachedWarmPrepareMsByIteration.reduce( + (sum, value) => sum + value, + 0, + ) / nativeCachedWarmPrepareMsByIteration.length, + ); const coldRun = runtimeRunner === null @@ -446,7 +458,9 @@ async function runWorker() { nativePackageReadMs, nativePackagePrepareMs, nativePrepareMs, + nativeWarmPrepareMs, nativeCachedPrepareAvgMs, + nativeCachedWarmPrepareAvgMs, coldRunMs: coldRun.ms, warmAvgMs, }); @@ -474,8 +488,11 @@ async function runWorker() { nativePackagePrepareMs, nativePackageBytes, nativePrepareMs, + nativeWarmPrepareMs, nativeCachedPrepareMsByIteration, + nativeCachedWarmPrepareMsByIteration, nativeCachedPrepareAvgMs, + nativeCachedWarmPrepareAvgMs, nativeFirstTouchMs: nativeTimingScenario.firstTouchMs, nativeWarmClickMs: nativeTimingScenario.warmClickMs, coldRunMs: coldRun.ms, @@ -529,7 +546,9 @@ function describeNativeTimingScenario({ nativePackageReadMs, nativePackagePrepareMs, nativePrepareMs, + nativeWarmPrepareMs, nativeCachedPrepareAvgMs, + nativeCachedWarmPrepareAvgMs, coldRunMs, warmAvgMs, }) { @@ -552,7 +571,9 @@ function describeNativeTimingScenario({ packageReadMs: nativePackageReadMs, offlinePackageBuildMs: nativePackagePrepareMs, firstPrepareMs: nativePrepareMs, + firstWarmPrepareMs: nativeWarmPrepareMs, cachedPrepareMs: nativeCachedPrepareAvgMs, + cachedWarmPrepareMs: nativeCachedWarmPrepareAvgMs, firstRunMs: coldRunMs, setupBeforeClickMs: roundMs(nativePackageReadMs + nativePrepareMs), preloadedClickMs: coldRunMs, @@ -1739,15 +1760,18 @@ function createNativeStaticRunnerFromJson(configJson, stringifyMs = 0) { }; const prepareStart = Bun.nanoseconds(); const prepared = prepare(); - warmNativePreparedSearch(prepared); + const warmPrepareMs = warmNativePreparedSearch(prepared); const prepareMs = elapsedMs(prepareStart); const prepareDiagnostics = JSON.parse(prepared.prepareDiagnosticsJson()); const cachedPrepareMsByIteration = []; + const cachedWarmPrepareMsByIteration = []; let cachedPrepareDiagnostics = null; for (let 
index = 0; index < CACHED_PREPARE_ITERATIONS; index += 1) { const cachedPrepareStart = Bun.nanoseconds(); const cachedPrepared = prepare(); - warmNativePreparedSearch(cachedPrepared); + cachedWarmPrepareMsByIteration.push( + warmNativePreparedSearch(cachedPrepared), + ); cachedPrepareMsByIteration.push(elapsedMs(cachedPrepareStart)); cachedPrepareDiagnostics = JSON.parse( cachedPrepared.prepareDiagnosticsJson(), @@ -1758,6 +1782,7 @@ function createNativeStaticRunnerFromJson(configJson, stringifyMs = 0) { prepareDiagnostics, cachedPrepareDiagnostics, cachedPrepareMsByIteration, + cachedWarmPrepareMsByIteration, configBytes: Buffer.byteLength(configJson, "utf8"), artifactBytes: artifactBytes?.byteLength ?? 0, artifactPrepareMs, @@ -1765,6 +1790,7 @@ function createNativeStaticRunnerFromJson(configJson, stringifyMs = 0) { packagePrepareMs, stringifyMs, prepareMs, + warmPrepareMs, }; } @@ -1803,15 +1829,18 @@ function createNativeStaticRunnerFromJsonBytes(configBytes) { }; const prepareStart = Bun.nanoseconds(); const prepared = prepare(configBytes); - warmNativePreparedSearch(prepared); + const warmPrepareMs = warmNativePreparedSearch(prepared); const prepareMs = elapsedMs(prepareStart); const prepareDiagnostics = JSON.parse(prepared.prepareDiagnosticsJson()); const cachedPrepareMsByIteration = []; + const cachedWarmPrepareMsByIteration = []; let cachedPrepareDiagnostics = null; for (let index = 0; index < CACHED_PREPARE_ITERATIONS; index += 1) { const cachedPrepareStart = Bun.nanoseconds(); const cachedPrepared = prepare(configBytes); - warmNativePreparedSearch(cachedPrepared); + cachedWarmPrepareMsByIteration.push( + warmNativePreparedSearch(cachedPrepared), + ); cachedPrepareMsByIteration.push(elapsedMs(cachedPrepareStart)); cachedPrepareDiagnostics = JSON.parse( cachedPrepared.prepareDiagnosticsJson(), @@ -1822,6 +1851,7 @@ function createNativeStaticRunnerFromJsonBytes(configBytes) { prepareDiagnostics, cachedPrepareDiagnostics, cachedPrepareMsByIteration, + 
cachedWarmPrepareMsByIteration, configBytes: configBytes.byteLength, artifactBytes: artifactBytes?.byteLength ?? 0, artifactPrepareMs, @@ -1829,6 +1859,7 @@ function createNativeStaticRunnerFromJsonBytes(configBytes) { packagePrepareMs, stringifyMs: 0, prepareMs, + warmPrepareMs, }; } @@ -1838,15 +1869,18 @@ function createNativeStaticRunnerFromPackageBytes(packageBytes) { native.NativePreparedSearch.fromPreparedPackageBytes(packageBytes); const prepareStart = Bun.nanoseconds(); const prepared = prepare(); - warmNativePreparedSearch(prepared); + const warmPrepareMs = warmNativePreparedSearch(prepared); const prepareMs = elapsedMs(prepareStart); const prepareDiagnostics = JSON.parse(prepared.prepareDiagnosticsJson()); const cachedPrepareMsByIteration = []; + const cachedWarmPrepareMsByIteration = []; let cachedPrepareDiagnostics = null; for (let index = 0; index < CACHED_PREPARE_ITERATIONS; index += 1) { const cachedPrepareStart = Bun.nanoseconds(); const cachedPrepared = prepare(); - warmNativePreparedSearch(cachedPrepared); + cachedWarmPrepareMsByIteration.push( + warmNativePreparedSearch(cachedPrepared), + ); cachedPrepareMsByIteration.push(elapsedMs(cachedPrepareStart)); cachedPrepareDiagnostics = JSON.parse( cachedPrepared.prepareDiagnosticsJson(), @@ -1857,6 +1891,7 @@ function createNativeStaticRunnerFromPackageBytes(packageBytes) { prepareDiagnostics, cachedPrepareDiagnostics, cachedPrepareMsByIteration, + cachedWarmPrepareMsByIteration, configBytes: 0, artifactBytes: 0, artifactPrepareMs: 0, @@ -1864,6 +1899,7 @@ function createNativeStaticRunnerFromPackageBytes(packageBytes) { packagePrepareMs: 0, stringifyMs: 0, prepareMs, + warmPrepareMs, }; } @@ -1873,8 +1909,11 @@ function warmNativePreparedSearch(prepared) { ? 
prepared.warmLazyRegex : prepared.warm_lazy_regex; if (typeof warmLazyRegex === "function") { + const warmStart = Bun.nanoseconds(); warmLazyRegex.call(prepared); + return elapsedMs(warmStart); } + return 0; } function writeNativePackageIfRequested(packageBytes) { diff --git a/packages/anonymize/scripts/native-package-ux-perf.mjs b/packages/anonymize/scripts/native-package-ux-perf.mjs index db923af5..cb42c3a1 100644 --- a/packages/anonymize/scripts/native-package-ux-perf.mjs +++ b/packages/anonymize/scripts/native-package-ux-perf.mjs @@ -17,6 +17,7 @@ const SCENARIOS = [ { name: "compressed", compressed: true }, { name: "raw", compressed: false }, ...languageScenarios(), + ...userDataScenarios(), ]; const tempRoot = mkdtempSync(join(tmpdir(), "stella-anonymize-package-ux-")); @@ -33,7 +34,7 @@ try { rmSync(tempRoot, { force: true, recursive: true }); } -function runScenario({ name, compressed, language }) { +function runScenario({ name, compressed, language, userDataScenario }) { const packagePath = join(tempRoot, `${name}.stlanonpkg`); const languageEnv = language === undefined @@ -42,14 +43,22 @@ function runScenario({ name, compressed, language }) { ANONYMIZE_MIGRATION_CONTENT_LANGUAGE: language, ANONYMIZE_MIGRATION_FIXTURE_LANGUAGES: language, }; + const userDataEnv = + userDataScenario === undefined || userDataScenario === "none" + ? {} + : { + ANONYMIZE_MIGRATION_USER_DATA_SCENARIO: userDataScenario, + }; const build = runMigration({ ...languageEnv, + ...userDataEnv, ANONYMIZE_MIGRATION_NATIVE_COMPRESSED_PACKAGE: compressed ? "1" : "0", ANONYMIZE_MIGRATION_NATIVE_PREPARED_PACKAGE: "1", ANONYMIZE_MIGRATION_WRITE_NATIVE_PACKAGE_PATH: packagePath, }); const load = runMigration({ ...languageEnv, + ...userDataEnv, ANONYMIZE_MIGRATION_NATIVE_PACKAGE_PATH: packagePath, }); const nativeDiagnostics = load.nativeDiagnostics ?? null; @@ -58,14 +67,17 @@ function runScenario({ name, compressed, language }) { name, compressed, language: language ?? 
null, + userDataScenario: userDataScenario ?? "none", fixtureCount: load.fixtureCount, packageBytes: build.timings.nativePackageBytes, offlinePackageBuildMs: build.timings.nativePackagePrepareMs, firstPackageReadMs: load.timings.nativePackageReadMs, firstPrepareMs: load.timings.nativePrepareMs, + firstWarmPrepareMs: load.timings.nativeWarmPrepareMs, setupBeforeClickMs: load.timings.nativePackageReadMs + load.timings.nativePrepareMs, cachedPrepareMs: load.timings.nativeCachedPrepareAvgMs, + cachedWarmPrepareMs: load.timings.nativeCachedWarmPrepareAvgMs, firstRunMs: load.timings.coldRunMs, preloadedClickMs: load.timings.coldRunMs, firstTouchMs: load.timings.nativeFirstTouchMs, @@ -107,6 +119,35 @@ function normalizeLanguage(value) { return language; } +function userDataScenarios() { + const value = + process.env.ANONYMIZE_NATIVE_PACKAGE_UX_USER_DATA_SCENARIOS ?? + "sample,heavy"; + if (value.trim().length === 0) { + return []; + } + return value + .split(",") + .map((entry) => normalizeUserDataScenario(entry)) + .filter((entry) => entry !== "none") + .filter((entry, index, entries) => entries.indexOf(entry) === index) + .map((userDataScenario) => ({ + name: `compressed-user-${userDataScenario}`, + compressed: true, + userDataScenario, + })); +} + +function normalizeUserDataScenario(value) { + const scenario = value.trim().toLowerCase(); + if (scenario === "none" || scenario === "sample" || scenario === "heavy") { + return scenario; + } + throw new Error( + `ANONYMIZE_NATIVE_PACKAGE_UX_USER_DATA_SCENARIOS must contain none, sample, or heavy; got ${value}`, + ); +} + function runMigration(extraEnv) { const child = spawnSync(process.execPath, [MIGRATION_SCRIPT], { cwd: ROOT_DIR, From 97367d2d5206a526525abf33a7fb76b94342e916 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 17:33:34 +0200 Subject: [PATCH 127/130] chore: pin text search core --- Cargo.lock | 2 ++ crates/anonymize-core/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff 
--git a/Cargo.lock b/Cargo.lock index 20bc2d30..ccef74f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -933,6 +933,7 @@ dependencies = [ [[package]] name = "stella-regex-set-core" version = "1.0.5" +source = "git+https://github.com/stella/regex-set?rev=75b6a7f7a89880b70c8497f5b86a3f09748ea3fd#75b6a7f7a89880b70c8497f5b86a3f09748ea3fd" dependencies = [ "fancy-regex", "regex", @@ -953,6 +954,7 @@ dependencies = [ [[package]] name = "stella-text-search-core" version = "1.0.6" +source = "git+https://github.com/stella/text-search?rev=0e44094dbcd027218a767439ded062bf615015d0#0e44094dbcd027218a767439ded062bf615015d0" dependencies = [ "stella-aho-corasick-core", "stella-fuzzy-search-core", diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml index d4c1b361..38a213c7 100644 --- a/crates/anonymize-core/Cargo.toml +++ b/crates/anonymize-core/Cargo.toml @@ -15,7 +15,7 @@ fancy-regex = "0.18" regex = "1" serde = { version = "1", features = ["derive"] } stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "2f3c3f107e3976ac059cc438d77916a592595d59" } -stella-text-search-core = { version = "1.0.6", path = "/private/tmp/stll-text-search-regex-prepared/crates/core" } +stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "0e44094dbcd027218a767439ded062bf615015d0" } [dev-dependencies] proptest = "1" From 0aca8a7b344059d3219b0edc7dd6477fc2f1781b Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 17:38:06 +0200 Subject: [PATCH 128/130] chore: avoid native package shadowing --- .../anonymize/scripts/build-native-pipeline-package.mjs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/anonymize/scripts/build-native-pipeline-package.mjs b/packages/anonymize/scripts/build-native-pipeline-package.mjs index 3d6cc185..0c6d6270 100755 --- a/packages/anonymize/scripts/build-native-pipeline-package.mjs +++ 
b/packages/anonymize/scripts/build-native-pipeline-package.mjs @@ -146,19 +146,19 @@ function defaultNativePipelineConfig() { }; } -function applyCliLanguageScope(config, options) { +function applyCliLanguageScope(pipelineConfig, options) { if (options.language !== undefined && options.languages !== undefined) { throw new Error("Use either --language or --languages, not both"); } if (options.language !== undefined) { const language = normalizeLanguageOption(options.language, "--language"); - return { ...config, language, languages: undefined }; + return { ...pipelineConfig, language, languages: undefined }; } if (options.languages === undefined) { - return config; + return pipelineConfig; } const languages = normalizeLanguageList(options.languages); - return { ...config, language: undefined, languages }; + return { ...pipelineConfig, language: undefined, languages }; } function normalizeLanguageOption(value, option) { From 29b7453089ecc717288e5a225478260c71e9e6ac Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 17:49:29 +0200 Subject: [PATCH 129/130] fix: address native review gaps --- README.md | 4 ++-- packages/anonymize/src/__test__/constants-parity.test.ts | 5 +++++ .../anonymize/src/__test__/dictionary-bundle.test.ts | 9 ++++++--- packages/anonymize/src/index-shared.ts | 2 ++ packages/data/dictionaries/index.ts | 6 ++---- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 0f2626fb..742a6d7d 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,8 @@ bun add @stll/anonymize bun add @stll/anonymize-data # Browser / Vite usage bun add @stll/anonymize-wasm -# Python SDK -uv add stella-anonymize-core +# Python SDK source build +uv add ./crates/anonymize-py ``` Or anonymize from the terminal without installing: diff --git a/packages/anonymize/src/__test__/constants-parity.test.ts b/packages/anonymize/src/__test__/constants-parity.test.ts index 54a8306c..7ecbcda1 100644 --- 
a/packages/anonymize/src/__test__/constants-parity.test.ts +++ b/packages/anonymize/src/__test__/constants-parity.test.ts @@ -23,4 +23,9 @@ describe("@stll/anonymize/constants subpath parity", () => { ...fromRoot.OPERATOR_TYPES, ]); }); + + test("native shared SDK helpers are exported from the root entrypoint", () => { + expect(typeof fromRoot.redact_text).toBe("function"); + expect(typeof fromRoot.redact_text_json).toBe("function"); + }); }); diff --git a/packages/anonymize/src/__test__/dictionary-bundle.test.ts b/packages/anonymize/src/__test__/dictionary-bundle.test.ts index 38c5a81d..55d18d5b 100644 --- a/packages/anonymize/src/__test__/dictionary-bundle.test.ts +++ b/packages/anonymize/src/__test__/dictionary-bundle.test.ts @@ -12,10 +12,13 @@ describe("dictionary bundle scoping", () => { expect(Object.keys(bundle.citiesByCountry)).toContain("CZ"); }); - test("unsupported name language scope falls back to packaged names", async () => { + test("unsupported non-empty name language scope keeps names empty", async () => { const bundle = await loadDictionaryBundle({ nameLanguages: ["pt-br"] }); - expect(Object.keys(bundle.firstNames).length).toBeGreaterThan(0); - expect(Object.keys(bundle.surnames).length).toBeGreaterThan(0); + expect(bundle.firstNames).toEqual({}); + expect(bundle.surnames).toEqual({}); + expect(Object.values(bundle.denyListMeta)).not.toContainEqual( + expect.objectContaining({ category: "Names" }), + ); }); }); diff --git a/packages/anonymize/src/index-shared.ts b/packages/anonymize/src/index-shared.ts index dd5b589e..287d084f 100644 --- a/packages/anonymize/src/index-shared.ts +++ b/packages/anonymize/src/index-shared.ts @@ -65,6 +65,7 @@ export { normalize_for_search, prepareNativeSearchPackage, prepare_search_package, + redact_text, redact_text_json, } from "./native"; export type { @@ -84,6 +85,7 @@ export type { PreparedSearch as PreparedSearchInstance, SharedNativePreparedPackageOptions, SharedNativeDiagnosticsJsonOptions, + 
SharedNativeRedactTextOptions, SharedNativeRedactTextJsonOptions, SharedNativeSearchPackageOptions, } from "./native"; diff --git a/packages/data/dictionaries/index.ts b/packages/data/dictionaries/index.ts index 2f54316b..71ab052e 100644 --- a/packages/data/dictionaries/index.ts +++ b/packages/data/dictionaries/index.ts @@ -1033,9 +1033,7 @@ export const loadDictionaryBundle = async ({ const countryScope = normalizeCountryCodes(countries); const scopedNameLanguages = normalizeNameLanguages(nameLanguages); const hasScopedNames = - nameLanguages !== undefined && - nameLanguages.length > 0 && - scopedNameLanguages.length > 0; + nameLanguages !== undefined && nameLanguages.length > 0; const dictionaryIds = ALL_DICTIONARY_IDS.filter((id) => dictionaryIdIsInScope(id, countryScope, hasScopedNames), ); @@ -1053,7 +1051,7 @@ export const loadDictionaryBundle = async ({ } const nameDictionaries = await loadNameDictionaries( - scopedNameLanguages.length > 0 ? scopedNameLanguages : undefined, + hasScopedNames ? scopedNameLanguages : undefined, ); const requestedCityScope = cityCountries ?? 
countries; const cityScope = From 913d09e2c9cf0d037882ed752f166770b1607821 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Sat, 27 Jun 2026 18:14:26 +0200 Subject: [PATCH 130/130] test: allow slow native parity --- .../__test__/native-adapter-parity.test.ts | 115 ++++++++++-------- 1 file changed, 61 insertions(+), 54 deletions(-) diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts index 9cf654a6..73ffe0a9 100644 --- a/packages/anonymize/src/__test__/native-adapter-parity.test.ts +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -60,6 +60,8 @@ import { loadTestDictionaries } from "./load-dictionaries"; setDefaultTimeout(240_000); +const SLOW_NATIVE_FIXTURE_PARITY_TIMEOUT_MS = 600_000; + type NativeAdapter = Omit< NativeAnonymizeBinding, | "prepareStaticSearchPackageBytes" @@ -2062,63 +2064,68 @@ describe("native adapter parity", () => { }); }); - test("native facade and Python match on contract fixture packages", async () => { - const adapters = getAdapters(); - for (const language of CONTRACT_FIXTURE_LANGUAGES) { - const fixtures = loadContractFixtureCases(language); - const scopedConfig = applyPipelineLanguageScope({ - ...contractTestConfig(`native-facade-fixture-parity-${language}`), - language, - }); - const dictionaryScope: Parameters[0] = {}; - if (scopedConfig.denyListCountries !== undefined) { - dictionaryScope.denyListCountries = scopedConfig.denyListCountries; - } - if (scopedConfig.nameCorpusLanguages !== undefined) { - dictionaryScope.nameCorpusLanguages = scopedConfig.nameCorpusLanguages; - } - const dictionaries = await loadTestDictionaries(dictionaryScope); - const search = await preparePipelineSearch({ - config: { - ...scopedConfig, - dictionaries, - }, - context: createPipelineContext(), - }); - const configJson = JSON.stringify(search.nativeStaticConfig); - const packageBytes = prepareNativeSearchPackage({ - binding: adapters.native, - 
config: search.nativeStaticConfig, - compressed: true, - }); - const anonymizer = createNativeAnonymizerFromPackage({ - binding: adapters.native, - packageBytes, - }); + test( + "native facade and Python match on contract fixture packages", + async () => { + const adapters = getAdapters(); + for (const language of CONTRACT_FIXTURE_LANGUAGES) { + const fixtures = loadContractFixtureCases(language); + const scopedConfig = applyPipelineLanguageScope({ + ...contractTestConfig(`native-facade-fixture-parity-${language}`), + language, + }); + const dictionaryScope: Parameters[0] = {}; + if (scopedConfig.denyListCountries !== undefined) { + dictionaryScope.denyListCountries = scopedConfig.denyListCountries; + } + if (scopedConfig.nameCorpusLanguages !== undefined) { + dictionaryScope.nameCorpusLanguages = + scopedConfig.nameCorpusLanguages; + } + const dictionaries = await loadTestDictionaries(dictionaryScope); + const search = await preparePipelineSearch({ + config: { + ...scopedConfig, + dictionaries, + }, + context: createPipelineContext(), + }); + const configJson = JSON.stringify(search.nativeStaticConfig); + const packageBytes = prepareNativeSearchPackage({ + binding: adapters.native, + config: search.nativeStaticConfig, + compressed: true, + }); + const anonymizer = createNativeAnonymizerFromPackage({ + binding: adapters.native, + packageBytes, + }); - const tsResults = fixtures.map(({ text }) => - toBindingStaticResult(anonymizer.redactStaticEntities(text)), - ); - const pyResults = callPythonPreparedPackageCases( - adapters.pythonModulePath, - adapters.tempDir, - Buffer.from(packageBytes), - fixtures.map(({ text }) => ({ text, operators: null })), - "prepare_static_search_compressed_package_bytes", - configJson, - ); + const tsResults = fixtures.map(({ text }) => + toBindingStaticResult(anonymizer.redactStaticEntities(text)), + ); + const pyResults = callPythonPreparedPackageCases( + adapters.pythonModulePath, + adapters.tempDir, + Buffer.from(packageBytes), + 
fixtures.map(({ text }) => ({ text, operators: null })), + "prepare_static_search_compressed_package_bytes", + configJson, + ); - for (const [index, fixture] of fixtures.entries()) { - expect({ - fixture: `${language}/${fixture.name}`, - result: pyResults.at(index), - }).toEqual({ - fixture: `${language}/${fixture.name}`, - result: tsResults.at(index), - }); + for (const [index, fixture] of fixtures.entries()) { + expect({ + fixture: `${language}/${fixture.name}`, + result: pyResults.at(index), + }).toEqual({ + fixture: `${language}/${fixture.name}`, + result: tsResults.at(index), + }); + } } - } - }); + }, + SLOW_NATIVE_FIXTURE_PARITY_TIMEOUT_MS, + ); test("native fixture improvements are explicit", async () => { const adapters = getAdapters();