diff --git a/.ai/manifest.json b/.ai/manifest.json index 6a192fbb..16cf5ed8 100644 --- a/.ai/manifest.json +++ b/.ai/manifest.json @@ -6,6 +6,7 @@ "stella-public-repo", "engineering", "typescript", + "rust", "testing", "linting" ], diff --git a/.ai/shared b/.ai/shared index 2519c1c1..eebb10aa 160000 --- a/.ai/shared +++ b/.ai/shared @@ -1 +1 @@ -Subproject commit 2519c1c1bc7fd3ec09a846624968ed496e6dd79f +Subproject commit eebb10aa9beb793e8098fc333018fd715a7af475 diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 00000000..a9e50b0e --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,9 @@ +# Use sparse index for faster resolution. +[registries.crates-io] +protocol = "sparse" + +[alias] +ci-fmt = "fmt --all -- --check" +ci-clippy = "clippy --workspace --all-targets --all-features --locked -- -D warnings" +ci-dylint = "dylint --workspace --all" +ci-test = "test --workspace --all-features --locked" diff --git a/.github/tools/check-brand-case.mjs b/.github/tools/check-brand-case.mjs new file mode 100644 index 00000000..1072d17c --- /dev/null +++ b/.github/tools/check-brand-case.mjs @@ -0,0 +1,75 @@ +import { execFileSync } from "node:child_process"; +import { readFileSync } from "node:fs"; +import process from "node:process"; + +const DISALLOWED = ["S", "tella"].join(""); +const EXPECTED = DISALLOWED.toLowerCase(); +const IGNORED_PATHS = new Set(["AGENTS.md", "CLAUDE.md", "GEMINI.md"]); +const IGNORED_PREFIXES = [ + ".ai/", + ".agents/", + ".claude/", + ".github/assets/", + "packages/data/dictionaries/", + "packages/*/dist/", + "packages/anonymize/wasm/dist/", + "target/", +]; + +const trackedFiles = execFileSync("git", ["ls-files", "-z"], { + encoding: "utf8", +}) + .split("\0") + .filter(Boolean) + .filter((file) => !isIgnored(file)); + +let hasFailure = false; + +for (const file of trackedFiles) { + const content = readFileSync(file); + if (content.includes(0)) { + continue; + } + + const text = content.toString("utf8"); + let index = 
text.indexOf(DISALLOWED); + while (index !== -1) { + const { line, column } = lineColumnFor(text, index); + console.error( + `${file}:${line}:${column} uses disallowed brand casing; use "${EXPECTED}"`, + ); + hasFailure = true; + index = text.indexOf(DISALLOWED, index + DISALLOWED.length); + } +} + +if (hasFailure) { + process.exit(1); +} + +function isIgnored(file) { + if (IGNORED_PATHS.has(file)) { + return true; + } + return IGNORED_PREFIXES.some((pattern) => { + if (!pattern.includes("*")) { + return file.startsWith(pattern); + } + const [prefix, suffix] = pattern.split("*"); + return file.startsWith(prefix) && file.includes(suffix); + }); +} + +function lineColumnFor(text, index) { + let line = 1; + let column = 1; + for (let i = 0; i < index; i += 1) { + if (text.charCodeAt(i) === 10) { + line += 1; + column = 1; + continue; + } + column += 1; + } + return { line, column }; +} diff --git a/.github/tools/check-packlist.mjs b/.github/tools/check-packlist.mjs index 3e039419..2b710655 100644 --- a/.github/tools/check-packlist.mjs +++ b/.github/tools/check-packlist.mjs @@ -7,6 +7,13 @@ const PACKAGES = [ expected: [ "dist/index.d.mts", "dist/index.mjs", + "dist/native.d.mts", + "dist/native.mjs", + "dist/native-node.d.mts", + "dist/native-node.mjs", + "index.cjs", + "stella_anonymize_napi.node", + "native-pipeline.stlanonpkg", // Dynamically imported corpus chunk; missing means the // bundler stopped resolving the non-Western name imports. 
"dist/names-nw-in.mjs", diff --git a/.github/tools/check-python-wheel.mjs b/.github/tools/check-python-wheel.mjs new file mode 100644 index 00000000..75b40ef7 --- /dev/null +++ b/.github/tools/check-python-wheel.mjs @@ -0,0 +1,132 @@ +import { execFileSync } from "node:child_process"; +import { mkdtempSync, readdirSync, rmSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import process from "node:process"; + +const outDir = mkdtempSync(join(tmpdir(), "stella-anonymize-wheel-")); +const profile = process.env.ANONYMIZE_PYTHON_WHEEL_PROFILE ?? "ci"; + +try { + execFileSync( + "uvx", + [ + "--from", + "maturin>=1.14,<2", + "maturin", + "build", + "--manifest-path", + "crates/anonymize-py/Cargo.toml", + "--locked", + "--profile", + profile, + "--out", + outDir, + ], + { stdio: "inherit" }, + ); + + const wheel = readdirSync(outDir).find((file) => file.endsWith(".whl")); + if (wheel === undefined) { + throw new Error("maturin did not emit a wheel"); + } + + const wheelPath = join(outDir, wheel); + const files = new Set(JSON.parse(readWheelFiles(wheelPath))); + const required = [ + "stella_anonymize/__init__.py", + "stella_anonymize/__init__.pyi", + "stella_anonymize/_native.pyi", + "stella_anonymize/py.typed", + ]; + const missing = required.filter((file) => !files.has(file)); + if (missing.length > 0) { + throw new Error(`wheel is missing files: ${missing.join(", ")}`); + } + if (![...files].some(isNativeExtension)) { + throw new Error("wheel is missing the native _native extension"); + } + smokeInstalledWheel(wheelPath); + + console.log( + JSON.stringify({ + event: "python-wheel-check", + wheel, + profile, + }), + ); +} finally { + rmSync(outDir, { force: true, recursive: true }); +} + +function readWheelFiles(wheelPath) { + return execFileSync( + "python3", + [ + "-c", + [ + "import json, sys, zipfile", + "with zipfile.ZipFile(sys.argv[1]) as wheel:", + " print(json.dumps(wheel.namelist()))", + ].join("\n"), + wheelPath, + 
], + { encoding: "utf8" }, + ); +} + +function smokeInstalledWheel(wheelPath) { + execFileSync( + "uv", + [ + "run", + "--isolated", + "--no-project", + "--python", + "3.11", + "--with", + wheelPath, + "python", + "-c", + [ + "import json", + "import stella_anonymize as anonymize", + "required = [", + " 'PreparedAnonymizer',", + " 'PreparedSearch',", + " 'load_prepared_package',", + " 'prepare_search_package',", + " 'redact_text',", + "]", + "missing = [name for name in required if not hasattr(anonymize, name)]", + "if missing:", + " raise SystemExit(f'missing exports: {missing}')", + "config_json = json.dumps({", + " 'regex_patterns': [{'kind': 'regex', 'pattern': r'\\b[A-Z]{2}\\d{4}\\b'}],", + " 'slices': {'regex': {'start': 0, 'end': 1}},", + " 'regex_meta': [{'label': 'registration number', 'score': 1.0}],", + "})", + "package_bytes = anonymize.prepare_search_package(config_json)", + "prepared = anonymize.load_prepared_package(package_bytes)", + "result = prepared.redact_text('Reference AB1234')", + "if result.redaction.entity_count != 1:", + " raise SystemExit(f'unexpected entity count: {result.redaction.entity_count}')", + "if result.redaction.redacted_text == 'Reference AB1234':", + " raise SystemExit('redaction did not change text')", + "print(json.dumps({", + " 'event': 'python-wheel-import-smoke',", + " 'version': anonymize.native_package_version(),", + " 'entity_count': result.redaction.entity_count,", + "}))", + ].join("\n"), + ], + { stdio: "inherit" }, + ); +} + +function isNativeExtension(file) { + return ( + file.startsWith("stella_anonymize/_native.") && + [".so", ".pyd", ".dll", ".dylib"].some((suffix) => file.endsWith(suffix)) + ); +} diff --git a/.github/tools/sync-runtime-version.mjs b/.github/tools/sync-runtime-version.mjs index 75b7cfad..b6e07cda 100644 --- a/.github/tools/sync-runtime-version.mjs +++ b/.github/tools/sync-runtime-version.mjs @@ -10,7 +10,16 @@ const PACKAGE_FILES = [ "packages/cli/package.json", ]; +const 
CARGO_WORKSPACE_MANIFEST = "Cargo.toml"; +const CARGO_LOCKED_PACKAGES = [ + "stella-anonymize-adapter-contract", + "stella-anonymize-core", + "stella-anonymize-napi", + "stella-anonymize-py", +]; +const PYPROJECT_FILES = ["crates/anonymize-py/pyproject.toml"]; const LOCK_FILE = "bun.lock"; +const CARGO_LOCK_FILE = "Cargo.lock"; const checkOnly = process.argv.includes("--check"); const version = readFileSync("VERSION", "utf8").trim(); @@ -30,6 +39,29 @@ const SYNCED_DEPENDENCY_RANGE_RE = /("@stll\/anonymize": "\^)([^"]+)(")/g; const escapeRegExp = (value) => value.replaceAll(/[.*+?^${}()|[\]\\]/g, "\\$&"); +const syncTextVersion = ({ file, label, re }) => { + const text = readFileSync(file, "utf8"); + const match = text.match(re); + if (!match) { + console.error(`${file} has no ${label} version entry`); + hasMismatch = true; + return; + } + const current = match[2]; + if (current === version) { + return; + } + if (checkOnly) { + console.error( + `${file} has ${label} version ${current}; expected ${version}`, + ); + hasMismatch = true; + return; + } + writeFileSync(file, text.replace(re, `$1${version}$3`)); + console.log(`Updated ${file} ${label} version to ${version}`); +}; + for (const file of PACKAGE_FILES) { const pkg = JSON.parse(readFileSync(file, "utf8")); const wantedRange = `^${version}`; @@ -61,6 +93,34 @@ for (const file of PACKAGE_FILES) { console.log(`Updated ${file} to ${version}`); } +syncTextVersion({ + file: CARGO_WORKSPACE_MANIFEST, + label: "Cargo workspace", + re: /(\[workspace\.package\][\s\S]*?\nversion\s*=\s*")([^"]+)(")/, +}); + +for (const file of PYPROJECT_FILES) { + const text = readFileSync(file, "utf8"); + const explicitVersion = text.match(/^version\s*=\s*"([^"]+)"/m); + if (explicitVersion) { + syncTextVersion({ + file, + label: "Python project", + re: /(^version\s*=\s*")([^"]+)(")/m, + }); + continue; + } + + if (/\bdynamic\s*=\s*\[[^\]]*"version"[^\]]*\]/m.test(text)) { + continue; + } + + console.error( + `${file} must either derive 
version dynamically from Cargo or match VERSION`, + ); + hasMismatch = true; +} + const lockText = readFileSync(LOCK_FILE, "utf8"); let lockChanged = false; let syncedLockText = lockText.replaceAll( @@ -116,6 +176,45 @@ if (lockChanged) { ); } +const cargoLockText = readFileSync(CARGO_LOCK_FILE, "utf8"); +let cargoLockChanged = false; +let syncedCargoLockText = cargoLockText; + +for (const packageName of CARGO_LOCKED_PACKAGES) { + const packageVersionRe = new RegExp( + `(\\[\\[package\\]\\]\\nname = "${escapeRegExp(packageName)}"\\nversion = ")([^"]+)(")`, + ); + const match = syncedCargoLockText.match(packageVersionRe); + if (!match) { + console.error(`${CARGO_LOCK_FILE} has no package entry for ${packageName}`); + hasMismatch = true; + continue; + } + const lockedVersion = match[2]; + if (lockedVersion === version) { + continue; + } + if (checkOnly) { + console.error( + `${CARGO_LOCK_FILE} package ${packageName} has version ${lockedVersion}; expected ${version}`, + ); + hasMismatch = true; + continue; + } + syncedCargoLockText = syncedCargoLockText.replace( + packageVersionRe, + `$1${version}$3`, + ); + cargoLockChanged = true; +} + +if (cargoLockChanged) { + writeFileSync(CARGO_LOCK_FILE, syncedCargoLockText); + console.log( + `Updated ${CARGO_LOCK_FILE} local package versions to ${version}`, + ); +} + if (hasMismatch) { process.exit(1); } diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dd2c72ef..c08fc4a0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,5 +1,8 @@ name: CI +env: + UV_VERSION: "0.10.1" + on: push: branches: [main] @@ -43,6 +46,19 @@ jobs: # the real npm token to dependency lifecycle scripts. 
NPM_TOKEN: "" + - name: Setup Rust + run: | + rustup toolchain install 1.96.0 --profile minimal --component rustfmt,clippy + rustup toolchain install nightly-2026-04-16 --profile minimal \ + --component rustc-dev \ + --component llvm-tools-preview + rustup default 1.96.0 + cargo install cargo-dylint --version 6.0.1 --locked + cargo install dylint-link --version 6.0.1 --locked + + - name: Rust checks + run: bun run rust:check + - name: Check runtime package versions # Verify, do not mutate: a PR that bumps VERSION without bumping # the package versions (or vice versa) must fail CI, not be @@ -64,6 +80,17 @@ jobs: - name: Typecheck run: bun run typecheck + - name: Install uv + run: | + python3 -m pip install --user "uv==${{ env.UV_VERSION }}" + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + + - name: Python typecheck + run: bun run python:typecheck + + - name: Python wheel + run: bun run python:wheel + - name: Test run: bun run test @@ -73,9 +100,32 @@ jobs: - name: Smoke test built artifact run: bun run --cwd packages/anonymize smoke:dist + - name: Migration fixture parity and performance + run: | + git fetch origin main --depth=1 + bun run --cwd packages/anonymize perf:migration-fixtures + + - name: Native fixture parity and performance + env: + ANONYMIZE_MIGRATION_ALLOW_ACCEPTED_MISMATCHES: "1" + ANONYMIZE_MIGRATION_CANDIDATE_RUNTIME: native-static + ANONYMIZE_MIGRATION_FAIL_ON_MISMATCH: "1" + ANONYMIZE_MIGRATION_NATIVE_COMPRESSED_PACKAGE: "1" + ANONYMIZE_MIGRATION_NATIVE_PREPARED_PACKAGE: "1" + ANONYMIZE_MIGRATION_REQUIRE_NATIVE_PIPELINE: "1" + run: | + git fetch origin main --depth=1 + bun run --cwd packages/anonymize perf:migration-fixtures + - name: Contract performance (informational) # Surfaces pipeline-latency regressions against the thresholds in # contract-perf.mjs. Non-blocking for now (shared-runner timing is # noisy); promote to a hard gate once a stable baseline is set. 
continue-on-error: true run: bun run --cwd packages/anonymize perf:contracts + + - name: Native adapter performance (informational) + # Emits aggregate TS/NAPI and Python/PyO3 timings for the shared + # Rust-backed static adapter contract. + continue-on-error: true + run: bun run --cwd packages/anonymize perf:native-adapters diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 60cacc8d..6f2c5b4d 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -28,6 +28,7 @@ jobs: allow-licenses: >- MIT, Apache-2.0, + Apache-2.0 WITH LLVM-exception, BSD-2-Clause, BSD-3-Clause, ISC, @@ -41,4 +42,8 @@ jobs: Python-2.0, Zlib, Unicode-3.0 + allow-dependencies-licenses: >- + pkg:cargo/stella-aho-corasick-core, + pkg:cargo/stella-fuzzy-search-core, + pkg:cargo/stella-regex-set-core comment-summary-in-pr: always diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index bd18c3ae..0ee9ece9 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -2,6 +2,7 @@ name: Release env: NPM_VERSION: "11.11.1" + UV_VERSION: "0.10.1" on: push: @@ -54,6 +55,19 @@ jobs: - name: Typecheck run: bun run typecheck + - name: Install uv + run: | + python3 -m pip install --user "uv==${{ env.UV_VERSION }}" + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + + - name: Python typecheck + run: bun run python:typecheck + + - name: Python wheel + run: bun run python:wheel + env: + ANONYMIZE_PYTHON_WHEEL_PROFILE: release + - name: Test run: bun run test diff --git a/.gitignore b/.gitignore index 8eed3f1c..a7d4bb93 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ node_modules *.tsbuildinfo dist .turbo +target/ +*.stlanonpkg # Claude Code local worktrees. 
.claude/worktrees/ diff --git a/.gitleaks.toml b/.gitleaks.toml index bfb512b3..4e6c099c 100644 --- a/.gitleaks.toml +++ b/.gitleaks.toml @@ -1,4 +1,4 @@ -title = "Stella Gitleaks Configuration" +title = "stella Gitleaks Configuration" [extend] useDefault = true diff --git a/AGENTS.md b/AGENTS.md index 27edb423..4d70dbb2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -51,6 +51,10 @@ details unless they are already public in the repository. - If TypeScript can make a class of bug structurally impossible (branded types, discriminated unions, exhaustive checks), prefer that over runtime validation or manual discipline +- Avoid boolean fields for states that may grow. Use a named discriminator or + domain type for values that answer "which kind/status/mode/type?" rather than + a permanent yes/no question; a two-value union, enum, or equivalent domain type + now is usually cheaper than migrating an `isX` flag later. - Conventional Commits: `feat:`, `chore:`, `fix:`, `docs:` - Rebase feature branches onto main (linear history) - Fail fast: validate at boundaries, return/throw early @@ -126,6 +130,88 @@ details unless they are already public in the repository. `getX()` getter so it runs at first use, not at import time. This prevents TDZ errors from non-deterministic module evaluation order. +## Coding Conventions + +### Rust + +- Use Rust 2024 for new crates. Pin the toolchain in `rust-toolchain.toml` and + keep `rustfmt` and `clippy` installed. +- In workspaces, put shared lint policy in `[workspace.lints]`; member crates + should opt in with `[lints] workspace = true`. +- Treat + `cargo clippy --workspace --all-targets --all-features -- -D warnings` as the + baseline quality gate unless a repo documents a narrower command. +- Use Dylint for shared stella-specific Rust lints that Clippy cannot express. + Run `cargo dylint --workspace --all` after Clippy when the repo has + `dylint.toml`. 
+- Prefer fixing custom lint rules at the shared source over broad local + suppressions when a rule is wrong across repos. +- Forbid unsafe code by default. If `unsafe` is truly required, keep it in a + tiny module with a `SAFETY:` comment explaining the invariant the caller and + callee rely on. +- Do not use `unwrap()`, `expect()`, `panic!()`, `todo!()`, or + `unimplemented!()` in production code. Return typed errors or make the + impossible state unrepresentable. +- Avoid unchecked indexing and string slicing. Prefer iterator methods, + `.get()`, typed span helpers, and APIs that preserve UTF-8 boundary safety. +- Avoid `as` casts. Prefer `TryFrom`, `From`, explicit checked conversion + helpers, or domain newtypes. +- Prefer narrow domain types over primitive strings/numbers for IDs, byte + offsets, language codes, entity labels, versions, and artifact formats. +- Keep struct fields private unless direct construction is part of the public + contract. Use smart constructors for values that must satisfy invariants. +- Use enums for real closed domain states and boolean-blind choices where + variants carry domain meaning. For callsite ergonomics alone, prefer an + options struct or `bon` builder over an enum that only simulates named + arguments. +- For functions, use positional parameters for one or two obvious arguments. + Use a named `SomethingOptions`, `SomethingArgs`, or `SomethingParams` struct + for 3+ arguments or same-type arguments that are easy to swap. +- Use `bon` builders for public APIs, constructors, or setup functions with + many optional/boolean parameters where named callsites improve readability. Do + not use it to hide unclear domain modeling. +- Prefer `Result` with a concrete error enum for library code. Use + `thiserror` for typed errors; use `miette` only where human-facing diagnostics + are valuable. +- Add `#[must_use]` to builders, config transforms, computed results, and APIs + where ignoring the return value is likely a bug. 
+- Keep comments concise. Comment invariants, non-obvious algorithms, generated + data contracts, and safety boundaries; do not narrate straightforward code. +- Keep data out of code. Domain dictionaries, language rules, fixtures, and + generated artifacts should live in reproducible data files or build outputs, + organized by language/concept where relevant. +- Public docs, logs, diagnostics, and comments should write `stella` lowercase. + +### Rust Module Side Effects + +- Avoid expensive module-level initialization. Prefer explicit prepare/build + steps, lazy singletons, or build-time generated artifacts. +- Do not do filesystem, network, environment, or global logger setup from + library imports. Applications and CLIs own process-level side effects. +- Keep binding crates thin. Business logic belongs in the Rust core crate; + TypeScript, Python, WASM, and NAPI layers should translate types and call the + same core logic. +- Keep generated artifacts versioned and validated at load time. Reject stale, + mismatched, or oversized artifacts with typed errors. + +### Rust Testing + +- Use `cargo nextest run --workspace --all-features` when available; otherwise + use the repo's documented `cargo test` command. +- Add property tests with `proptest` for parsers, span math, redaction, + normalization, serialization, and logic where examples do not cover the input + space. +- Add fuzz targets with `cargo-fuzz` for byte/string parsers, document readers, + search primitives, artifact decoders, and boundary-sensitive code. +- Use fixture parity tests when replacing an implementation in another language. + The Rust core, TypeScript binding, and Python binding should produce the same + structured result from the same fixtures. +- Benchmark behavior that is part of the product. Track cold start, warm run, + artifact load, preparation, and execution separately. +- Do not snapshot sensitive raw text unless the fixture is intentionally public + and minimal. 
Prefer normalized summaries, counts, spans, labels, and redacted + output. + ## Testing Only test what can actually go wrong: bugs the type system, framework, or linter would diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..ccef74f1 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,1157 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f02882884d3e1bc524fb12c79f107f6ad0e1cfd498c536ffb494301740995dfe" + +[[package]] +name = "autocfg" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ + "virtue", +] + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = 
"bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bitflags" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" + +[[package]] +name = "blake3" +version = "1.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures", +] + +[[package]] +name = "block-buffer" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" +dependencies = [ + "hybrid-array", +] + +[[package]] +name = "bon" +version = "3.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a602c73c7b0148ec6d12af6fd5cc7a46e2eacc8878271a999abac56eed12f561" +dependencies = [ + "bon-macros", + "rustversion", +] + +[[package]] +name = "bon-macros" +version = "3.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dee98b0db6a962de883bf5d20362dee4d7ca0d12fe39a7c6c73c844e1cd7c1f" +dependencies = [ + "darling", + "ident_case", + "prettyplease", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "cc" +version = "1.2.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e228eec9be7c17ccb640b59b36a5cd805ea2a564a4c5e162c2f659fea30d3b96" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "const-oid" +version 
= "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" + +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + +[[package]] +name = "convert_case" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "affbf0190ed2caf063e3def54ff444b449371d55c58e513a95ab98eca50adb49" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6e4c961d6cd6c9a86db418387425e8bdeaf05b3c8bc1411e6dca4c252f1453" +dependencies = [ + "hybrid-array", +] + +[[package]] +name = "ctor" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01334b89b69ff726750c5ce5073fc8bd860e99aa9a8fc5ca11b04730e3aee97a" + +[[package]] +name = "daachorse" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99251f238b74cd219a86fe6ea9328308ebb223fcbb5b8eb5aa400b847a41dded" + +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", 
+ "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "digest" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", +] + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fancy-regex" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1e1dacd0d2082dfcf1351c4bdd566bbe89a2b263235a2b50058f1e130a47277" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", 
+ "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + 
"pin-project-lite", + "slab", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hybrid-array" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9155a582abd142abc056962c29e3ce5ff2ad5469f4246b537ed42c5deba857da" +dependencies = [ + "typenum", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom", + "libc", +] + +[[package]] +name = "keccak" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e24a010dd405bd7ed803e5253182815b41bf2e6a80cc3bfc066658e03a198aa" +dependencies = [ + "cfg-if", + "cpufeatures", +] + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "libloading" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" +dependencies = [ + "cfg-if", + "windows-link", +] + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "memchr" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" + +[[package]] +name = "napi" +version = "3.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b41bda2ac390efb5e8d22025d925ccc3f3807d8c1bea6d19b36127247c4b8f83" +dependencies = [ + "bitflags", + "ctor", + "futures", + "napi-build", + "napi-sys", + "nohash-hasher", + "rustc-hash", + "serde", + "serde_json", +] + +[[package]] +name = "napi-build" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9c366d2c8c60b86fa632df75f745509b52f9128f91a6bad4c796e44abb505e1" + +[[package]] +name = "napi-derive" +version = "3.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61d66f70256ad5aef58659966064471d0ad90e2897bc36a5a5e0389c85aabc1e" +dependencies = [ + "convert_case", + "ctor", + "napi-derive-backend", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "napi-derive-backend" +version = "5.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81b4b08f15eed7a2a20c3f4c6314013fc3ac890a3afa9892b594485299ebdb2d" +dependencies = [ + "convert_case", + "proc-macro2", + "quote", + "semver", + "syn", +] + +[[package]] +name = "napi-sys" +version = "3.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f5bcdf71abd3a50d00b49c1c2c75251cb3c913777d6139cd37dabc093a5e400" +dependencies = [ + "libloading", +] + +[[package]] +name = "nohash-hasher" +version = "0.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "proptest" +version = "1.11.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + +[[package]] +name = "pyo3" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd274650b21d4bfc26a0a47587962c1edb425f69287324355cd040c3ea66071c" +dependencies = [ + "libc", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", +] + +[[package]] +name = "pyo3-build-config" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5e2a7d2f0d013342f295c048ad19237add5154a55b1c5a254c0ec93d4109078" +dependencies = [ + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca85c467da1bbc8d866eea5deff9cf29ea5f7785054a17da36e65bda9c05845b" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ac53762fd065daa3194dd09337a38bd793a188100fd1a9304c4ab312d901771" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca3a1557399783172dc5bf39cfca835157732532cba56b71d2292161e53b362" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quote" +version = "1.0.46" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbc457d0c7a0759a614551b11a6409e5951f6c7537be1f1b7682b9ae9230368" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core", +] + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.11" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + +[[package]] +name = "rustc-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" 
+dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "sha2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha3" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc9bad02c26382724b2d2692c6f179285e4b54eeecd7968f52a50059c3c11759" +dependencies = [ + "digest", + "keccak", + "sponge-cursor", +] + +[[package]] +name = "shlex" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "sponge-cursor" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a0219bd7d979d58245a4f41f695e1ac9f8befdffadd7f61f1bae9e39abc6620" + +[[package]] +name = "stella-aho-corasick-core" +version = "1.0.4" +source = "git+https://github.com/stella/aho-corasick?rev=38bdcbf11bfbe389c8f2b7b40eb03ac50371e1e1#38bdcbf11bfbe389c8f2b7b40eb03ac50371e1e1" +dependencies = [ + "daachorse", + "unicode-case-mapping", +] + +[[package]] +name = "stella-anonymize-adapter-contract" +version = "1.5.0" +dependencies = [ + "bincode", + "blake3", + "serde", + "serde_json", + "stella-anonymize-core", + "zstd", +] + +[[package]] +name = "stella-anonymize-core" +version = "1.5.0" +dependencies = [ + 
"bon", + "fancy-regex", + "proptest", + "regex", + "serde", + "stella-stdnum-core", + "stella-text-search-core", +] + +[[package]] +name = "stella-anonymize-napi" +version = "1.5.0" +dependencies = [ + "blake3", + "napi", + "napi-build", + "napi-derive", + "serde_json", + "stella-anonymize-adapter-contract", + "stella-anonymize-core", +] + +[[package]] +name = "stella-anonymize-py" +version = "1.5.0" +dependencies = [ + "pyo3", + "pyo3-build-config", + "serde_json", + "stella-anonymize-adapter-contract", + "stella-anonymize-core", +] + +[[package]] +name = "stella-fuzzy-search-core" +version = "1.1.3" +source = "git+https://github.com/stella/fuzzy-search?rev=0743b9c6710a84bb7e6863fdcda9a9cc1dce4fa2#0743b9c6710a84bb7e6863fdcda9a9cc1dce4fa2" +dependencies = [ + "unicode-case-mapping", + "unicode-normalization", + "unicode-segmentation", +] + +[[package]] +name = "stella-regex-set-core" +version = "1.0.5" +source = "git+https://github.com/stella/regex-set?rev=75b6a7f7a89880b70c8497f5b86a3f09748ea3fd#75b6a7f7a89880b70c8497f5b86a3f09748ea3fd" +dependencies = [ + "fancy-regex", + "regex", + "regex-automata", + "regex-syntax", + "unicode-segmentation", +] + +[[package]] +name = "stella-stdnum-core" +version = "2.1.1" +source = "git+https://github.com/stella/stdnum?rev=2f3c3f107e3976ac059cc438d77916a592595d59#2f3c3f107e3976ac059cc438d77916a592595d59" +dependencies = [ + "sha2", + "sha3", +] + +[[package]] +name = "stella-text-search-core" +version = "1.0.6" +source = "git+https://github.com/stella/text-search?rev=0e44094dbcd027218a767439ded062bf615015d0#0e44094dbcd027218a767439ded062bf615015d0" +dependencies = [ + "stella-aho-corasick-core", + "stella-fuzzy-search-core", + "stella-regex-set-core", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.118" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + +[[package]] +name = "tinyvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "typenum" +version = "1.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" + +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + +[[package]] +name = "unicode-case-mapping" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e9026503b74f3207a4c04e6bf4ea735daa8edf6c0bbfa044cae597bb947a9db" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-normalization" 
+version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" + +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + +[[package]] +name = "wasip2" +version = "1.0.4+wasi-0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "zerocopy" +version = "0.8.52" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..233582b2 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,127 @@ +[workspace] +members = [ + "crates/anonymize-adapter-contract", + "crates/anonymize-core", + "crates/anonymize-napi", + "crates/anonymize-py", +] +resolver = "3" + +[workspace.package] +version = "1.5.0" +edition = "2024" +license = "MIT" +publish = false +repository = "https://github.com/stella/anonymize" + +[workspace.lints.rust] +warnings = { level = "deny", priority = -1 } +dead_code = "deny" +future_incompatible = { level = "deny", priority = -1 } +nonstandard_style = { level = "deny", priority = -1 
} +rust_2018_idioms = { level = "deny", priority = -1 } +unreachable_code = "deny" +unreachable_patterns = "deny" +unreachable_pub = "deny" +unsafe_code = "deny" +unused_imports = "deny" +unused_macros = "deny" +unused_mut = "deny" +unused_variables = "deny" + +[workspace.lints.rustdoc] +broken_intra_doc_links = "deny" +bare_urls = "deny" + +[workspace.lints.clippy] +all = { level = "deny", priority = -1 } +pedantic = { level = "warn", priority = -1 } +cargo = { level = "warn", priority = -1 } +nursery = { level = "warn", priority = -1 } + +dbg_macro = "deny" +todo = "deny" +unimplemented = "deny" +panic = "deny" +unwrap_used = "deny" +expect_used = "deny" +indexing_slicing = "warn" +integer_division = "warn" +arithmetic_side_effects = "warn" +as_conversions = "warn" +cast_possible_truncation = "warn" +cast_possible_wrap = "warn" +cast_precision_loss = "warn" +cast_sign_loss = "warn" +clone_on_ref_ptr = "deny" +create_dir = "deny" +decimal_literal_representation = "warn" +derive_partial_eq_without_eq = "deny" +disallowed_macros = "deny" +disallowed_methods = "deny" +disallowed_types = "deny" +empty_enum_variants_with_brackets = "deny" +empty_structs_with_brackets = "deny" +enum_glob_use = "deny" +exit = "deny" +filetype_is_file = "deny" +float_cmp = "warn" +fn_to_numeric_cast_any = "deny" +format_collect = "deny" +if_then_some_else_none = "deny" +implicit_clone = "deny" +inefficient_to_string = "deny" +large_enum_variant = "deny" +large_stack_arrays = "deny" +large_stack_frames = "deny" +manual_let_else = "deny" +manual_memcpy = "deny" +map_unwrap_or = "deny" +mem_forget = "deny" +missing_assert_message = "warn" +missing_errors_doc = "allow" +missing_panics_doc = "allow" +module_name_repetitions = "allow" +multiple_crate_versions = "allow" +needless_collect = "deny" +needless_continue = "deny" +needless_pass_by_ref_mut = "deny" +needless_pass_by_value = "deny" +or_fun_call = "deny" +print_stderr = "deny" +print_stdout = "deny" +redundant_clone = "deny" 
+same_name_method = "deny" +self_named_module_files = "allow" +semicolon_if_nothing_returned = "deny" +shadow_reuse = "allow" +shadow_same = "warn" +shadow_unrelated = "warn" +std_instead_of_alloc = "allow" +std_instead_of_core = "allow" +string_slice = "warn" +tests_outside_test_module = "allow" +trivially_copy_pass_by_ref = "deny" +unnecessary_wraps = "deny" +unneeded_field_pattern = "deny" +unreachable = "deny" +unused_async = "deny" +unused_self = "deny" +use_self = "warn" +used_underscore_binding = "deny" +verbose_file_reads = "deny" +wildcard_imports = "deny" + +[profile.release] +lto = "fat" +codegen-units = 1 +panic = "unwind" +strip = "symbols" + +[profile.dev] +debug = "line-tables-only" + +[profile.ci] +inherits = "dev" +debug-assertions = true +overflow-checks = true diff --git a/README.md b/README.md index 6cdb1783..742a6d7d 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,22 @@

- Stella anonymize + stella anonymize

# anonymize -Monorepo for the Stella anonymization stack. +Monorepo for the stella anonymization stack. It contains the runtime package, the published data package, and the browser/WASM entrypoint used by downstream apps. ## Packages -| Package | Purpose | -| ---------------------- | -------------------------------------------------------------- | -| `@stll/anonymize` | Native runtime for multi-layer PII detection and anonymization | -| `@stll/anonymize-data` | Published deny-list dictionaries and trigger/config data | -| `@stll/anonymize-wasm` | Browser/WASM build of the runtime | -| `@stll/anonymize-cli` | Command-line anonymization (`anonymize` binary) | +| Package | Purpose | +| ----------------------- | -------------------------------------------------------------- | +| `@stll/anonymize` | Native runtime for multi-layer PII detection and anonymization | +| `@stll/anonymize-data` | Published deny-list dictionaries and trigger/config data | +| `@stll/anonymize-wasm` | Browser/WASM build of the runtime | +| `@stll/anonymize-cli` | Command-line anonymization (`anonymize` binary) | +| `stella-anonymize-core` | Python bindings for the Rust anonymization core | ## Install @@ -25,6 +26,8 @@ bun add @stll/anonymize bun add @stll/anonymize-data # Browser / Vite usage bun add @stll/anonymize-wasm +# Python SDK source build +uv add ./crates/anonymize-py ``` Or anonymize from the terminal without installing: @@ -70,3 +73,4 @@ bun run hooks:install - [`packages/anonymize`](packages/anonymize) - [`packages/data`](packages/data) - [`packages/anonymize/wasm`](packages/anonymize/wasm) +- [`crates/anonymize-py`](crates/anonymize-py) diff --git a/bun.lock b/bun.lock index 89a3f51a..1f9a749e 100644 --- a/bun.lock +++ b/bun.lock @@ -10,7 +10,7 @@ "@stll/typescript-config": "^0.3.0", "lefthook": "^2.1.9", "oxfmt": "^0.54.0", - "oxlint": "^1.69.0", + "oxlint": "^1.70.0", "oxlint-tsgolint": "^0.23.0", "turbo": "^2.9.18", }, @@ -21,14 +21,14 @@ "dependencies": { 
"@huggingface/tokenizers": "^0.1.3", "@stll/stdnum": "^2.1.1", - "@stll/text-search": "^1.0.6", + "@stll/text-search": "^1.0.7", }, "devDependencies": { "@stll/anonymize-data": "workspace:*", - "@stll/text-search-wasm": "^1.0.6", + "@stll/text-search-wasm": "^1.0.7", "bun-types": "^1.3.14", "fast-check": "^4.8.0", - "tsdown": "^0.22.2", + "tsdown": "^0.22.3", "typescript": "^6.0.3", "vite": "^8.0.16", }, @@ -45,7 +45,7 @@ "dependencies": { "@huggingface/tokenizers": "^0.1.3", "@stll/stdnum": "^2.1.1", - "@stll/text-search-wasm": "^1.0.5", + "@stll/text-search-wasm": "^1.0.7", }, "peerDependencies": { "@stll/anonymize-data": "^0.0.6", @@ -68,9 +68,9 @@ }, "devDependencies": { "@stll/anonymize-wasm": "workspace:*", - "@types/node": "^25.9.3", + "@types/node": "^25.9.4", "bun-types": "^1.3.14", - "tsdown": "^0.22.2", + "tsdown": "^0.22.3", "typescript": "^6.0.3", }, }, @@ -82,7 +82,7 @@ "@stll/anonymize-data": "^0.0.6", }, "devDependencies": { - "@types/node": "^25.9.3", + "@types/node": "^25.9.4", "bun-types": "^1.3.14", "typescript": "^6.0.3", }, @@ -92,21 +92,21 @@ "version": "0.0.6", "devDependencies": { "stopwords-iso": "1.1.0", - "tsdown": "^0.22.2", + "tsdown": "^0.22.3", "typescript": "^6.0.3", }, }, }, "packages": { - "@babel/generator": ["@babel/generator@8.0.0-rc.6", "", { "dependencies": { "@babel/parser": "^8.0.0-rc.6", "@babel/types": "^8.0.0-rc.6", "@jridgewell/gen-mapping": "^0.3.12", "@jridgewell/trace-mapping": "^0.3.28", "@types/jsesc": "^2.5.0", "jsesc": "^3.0.2" } }, "sha512-6mIzgVK8DgEzvIapoQwhXTMnnkuE4STQmVv9H03i/tZ2ml8oev3TRvZJgTenK2Bsq0YWNtzOrFdTyNzCMFtjJQ=="], + "@babel/generator": ["@babel/generator@8.0.0", "", { "dependencies": { "@babel/parser": "^8.0.0", "@babel/types": "^8.0.0", "@jridgewell/gen-mapping": "^0.3.12", "@jridgewell/trace-mapping": "^0.3.28", "@types/jsesc": "^2.5.0", "jsesc": "^3.0.2" } }, "sha512-NT9NrVwJsbSV6Y2FSstWa71EETOnzrjkL5/wX3D2mYHtKM+qvqB1DvR4D0Setb/gDBsHzRICifwEWMO8CnTF6g=="], - "@babel/helper-string-parser": 
["@babel/helper-string-parser@8.0.0-rc.6", "", {}, "sha512-BCkFy+zN6kXQed3YOT7aJl93NfDSzQc3pBfsvTVPs9gU9X3V0aefEF5kwBT0E+mDWH9QgKaZstYUQN9VdQZT4g=="], + "@babel/helper-string-parser": ["@babel/helper-string-parser@8.0.0", "", {}, "sha512-6mJgmFFFIIO82vvoLt9XtRC7/TkzXfts1t/SpRX4IHSzMgqoPYCWesVu1udUPUWioAE/2fcG6WuI8zrkE1gwrg=="], - "@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@8.0.0-rc.6", "", {}, "sha512-nVJ+1JcCgntv8d78rRo++o2wuODT0Irknx2BF8Np4Ft2CRgjLqIs4qzSZ8b66yGbBdMWGmZBO9WEZv1hhNiSpg=="], + "@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@8.0.2", "", {}, "sha512-9Fr9QeyCAyi1BR1jKZ6uYQ24EIhQUx5ReHfQU7drOE+TPOb+w11/dsqLkMOT2U29OdCT71XajrOT8xDc1C7orA=="], - "@babel/parser": ["@babel/parser@8.0.0-rc.6", "", { "dependencies": { "@babel/types": "^8.0.0-rc.6" }, "bin": "./bin/babel-parser.js" }, "sha512-rOS8IpdO7mQELkTPlCsTgPejO0bFuZdEDCGQJouYbYf9e1FLTym7Fei2pEjq8q7MWbX0ravcd7QQYKs1TxOuog=="], + "@babel/parser": ["@babel/parser@8.0.0", "", { "dependencies": { "@babel/types": "^8.0.0" }, "bin": "./bin/babel-parser.js" }, "sha512-aLxAE+imI9bCcyaPrUDjBv3uSkWieifjLe0kuFOZF0zli0L6GCsTmsePnTr55adbIAgYz2zhN1vnFimCBUYcRQ=="], - "@babel/types": ["@babel/types@8.0.0-rc.6", "", { "dependencies": { "@babel/helper-string-parser": "^8.0.0-rc.6", "@babel/helper-validator-identifier": "^8.0.0-rc.6" } }, "sha512-p7/ABylAYlexb31wtRdIfH9L9A0Z2T/9H6zAqzqndkY2PLkvNNc580wGhp/gGKN4Sp9sQvSkhc6Oga8/O+wTyw=="], + "@babel/types": ["@babel/types@8.0.0", "", { "dependencies": { "@babel/helper-string-parser": "^8.0.0", "@babel/helper-validator-identifier": "^8.0.0" } }, "sha512-K8ponJDxBwDHigkeFqaqT5wLGl4bTlwMafR8k7b5CPxr6Ww+UG9ls8Yx6Tcpboxu97eeGVEEyKcHmEyOwN1vSw=="], "@emnapi/core": ["@emnapi/core@1.10.0", "", { "dependencies": { "@emnapi/wasi-threads": "1.2.1", "tslib": "^2.4.0" } }, "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw=="], @@ -126,7 +126,7 @@ "@napi-rs/wasm-runtime": 
["@napi-rs/wasm-runtime@1.1.4", "", { "dependencies": { "@tybys/wasm-util": "^0.10.1" }, "peerDependencies": { "@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1" } }, "sha512-3NQNNgA1YSlJb/kMH1ildASP9HW7/7kYnRI2szWJaofaS1hWmbGI4H+d3+22aGzXXN9IJ+n+GiFVcGipJP18ow=="], - "@oxc-project/types": ["@oxc-project/types@0.134.0", "", {}, "sha512-T0xuRRKrQFmocH8y+jGfpmSkGcheaJExY9lEihmR1Gm2aH+75B8CzgU2rABRQSzzDxLjZ15Sc0bRVLj5lVeNXQ=="], + "@oxc-project/types": ["@oxc-project/types@0.137.0", "", {}, "sha512-WT+Gb24i8hmvo85AIv2oEYouEXkRlKAlT9WaCa3TfLgNCN+GhrJOGZuIlMouAh38Qe4QOx26eUOVsq70qXrywA=="], "@oxfmt/binding-android-arm-eabi": ["@oxfmt/binding-android-arm-eabi@0.54.0", "", { "os": "android", "cpu": "arm" }, "sha512-NAtpl/SiaeU103e7/OmZw0MvUnsUUopW7hEm/ecegJg7YM0skQaA0IXEZoyTV6NUdiNPupdIUreRqUZTShbn/g=="], @@ -178,75 +178,75 @@ "@oxlint-tsgolint/win32-x64": ["@oxlint-tsgolint/win32-x64@0.23.0", "", { "os": "win32", "cpu": "x64" }, "sha512-5MyjFuqf+g8OUPJBSGWHJtmoWnzFJYyOg4To9WMQshZYEWig/vtu7JtJ03VWnzHv9LJkAUeApY0gVCOywFR/iQ=="], - "@oxlint/binding-android-arm-eabi": ["@oxlint/binding-android-arm-eabi@1.69.0", "", { "os": "android", "cpu": "arm" }, "sha512-DKQQbD5cZ/MYfDgDI7YGyGD9FSxABlsBsYFo5p26lloob543tP9+4N3guwdXIYJN+7HSZxLe8YJuwcOWw5qnHg=="], + "@oxlint/binding-android-arm-eabi": ["@oxlint/binding-android-arm-eabi@1.70.0", "", { "os": "android", "cpu": "arm" }, "sha512-zFh0P4cswmRvw6nkyb89dr18rRanuaCPAsEXsFDoQY8WdaquI8Pt4NWFjaMJg6L23cy5NeN8J9cBnREbWzZhaw=="], - "@oxlint/binding-android-arm64": ["@oxlint/binding-android-arm64@1.69.0", "", { "os": "android", "cpu": "arm64" }, "sha512-lEhb+I5pr4inux+JFwfCa1HRq3Os7NirEFQ0H1I35SVEHPm6byX0Ah47xmRha3qi6LAkxUcxViL8o/9PivjzBg=="], + "@oxlint/binding-android-arm64": ["@oxlint/binding-android-arm64@1.70.0", "", { "os": "android", "cpu": "arm64" }, "sha512-qI8o4HZjeGiBrWv+pJv4lH0Yi2Gl/JSp/EumBUApezJprIKa5PS4nU0lQsQngtky8k+SplQIOjv6hwu0SSxeyg=="], - "@oxlint/binding-darwin-arm64": ["@oxlint/binding-darwin-arm64@1.69.0", "", { 
"os": "darwin", "cpu": "arm64" }, "sha512-GY2YE8lOZW59BW1Ia1y+1gR0XyjrZRvVWHAr8LGeGhYHE0OQJ/7cRKXTkx1P+E9/6awEc3SX8a68SFTjh/E//A=="], + "@oxlint/binding-darwin-arm64": ["@oxlint/binding-darwin-arm64@1.70.0", "", { "os": "darwin", "cpu": "arm64" }, "sha512-8KjgVVHI5F9nVwHCRwwA78Ty7zNKP4Wd9OeN5PSv3iu/F/u1RVXoOCgLhWqust6HmwQG6xc8c+RCyaWENy24+w=="], - "@oxlint/binding-darwin-x64": ["@oxlint/binding-darwin-x64@1.69.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-ax1oZnOjHX3LB7myQyHEaQkDwfLb6str3/nSP6O7EVUviQGNkEGzGV0EqcBJWK+Ufwx0l4xPgyYayurvhAdl2Q=="], + "@oxlint/binding-darwin-x64": ["@oxlint/binding-darwin-x64@1.70.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-WVydssv5PSUBXFJTdNBWlmGkbNmvPGaFt/2SUT/EZRB6bq6bEOHmMlbnupZD5jmlEvi9+mZJHi8TCw15lyfSfQ=="], - "@oxlint/binding-freebsd-x64": ["@oxlint/binding-freebsd-x64@1.69.0", "", { "os": "freebsd", "cpu": "x64" }, "sha512-kHWeHv4g2h8NY+mpCxzCtY4uerMJWTN/TSnNj1CPbakFpHEJ6cTya2wWV0pDSYWOJ2+0UiEbhn3AtXxHtsnKjg=="], + "@oxlint/binding-freebsd-x64": ["@oxlint/binding-freebsd-x64@1.70.0", "", { "os": "freebsd", "cpu": "x64" }, "sha512-hJucmUf8OlinHNb1R7fI4Fw6WsAstOz7i8nmkWQfiHoZXtbufNm+MxiDTIMk1ggh2Ro4vLzgQ+bKvRY54MZoRA=="], - "@oxlint/binding-linux-arm-gnueabihf": ["@oxlint/binding-linux-arm-gnueabihf@1.69.0", "", { "os": "linux", "cpu": "arm" }, "sha512-gq84vM1a1oEehXo27YCDzGVcxPsZDI1yswZwz2Da1/cbnWtrL16XZZnz0G/+gIU8edtHpfjxq5c+vWEHqJfWoQ=="], + "@oxlint/binding-linux-arm-gnueabihf": ["@oxlint/binding-linux-arm-gnueabihf@1.70.0", "", { "os": "linux", "cpu": "arm" }, "sha512-1BnS7wbCYDSXwWzJJ+mc3NURoha6m6m6RT5c6vgAY3oz7C3OVXP+S0awo2mRq97arrJkVvO3qRQfyAHL+76xtQ=="], - "@oxlint/binding-linux-arm-musleabihf": ["@oxlint/binding-linux-arm-musleabihf@1.69.0", "", { "os": "linux", "cpu": "arm" }, "sha512-kIqEa98JQ0VRyrcncxA417m2AzasqTlD+FyVT1AksjvjkqQcvm7pBWYvoW3/mpyOP2XYvi5nSCCTIe6De1yu5g=="], + "@oxlint/binding-linux-arm-musleabihf": ["@oxlint/binding-linux-arm-musleabihf@1.70.0", "", { "os": "linux", "cpu": "arm" }, 
"sha512-yKy/UdbR55+M2yEcuiV5DCNC/gdQAjr/GioUy50QwBzSrKm8ueWADqyRLS9Xk+qjNeCYGg6A8FvUBds56ttfqg=="], - "@oxlint/binding-linux-arm64-gnu": ["@oxlint/binding-linux-arm64-gnu@1.69.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-j+xYiXozxGWx2cpjCrwwGR4awTxPFsRv3JZrv23RCogEPMc4R7UqjHW47p/RG0aRlbWiROCJ8coUfCwy0dvzHA=="], + "@oxlint/binding-linux-arm64-gnu": ["@oxlint/binding-linux-arm64-gnu@1.70.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-0A5XJ4alvmqFUFP/4oYSyaO+qLto/HrKEWTSaegiVl+HOufFngK2BjYw9x4RbwBt/du5QG6l5q1zeWiJYYG5yg=="], - "@oxlint/binding-linux-arm64-musl": ["@oxlint/binding-linux-arm64-musl@1.69.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-xEPpNppTfN1l/nM7gYSf9iocscu/as+p/7vxkLeLEKnYU+09Dm+5V6IhDYDh+Uz6FajEupWwCLt5SOG0y1PCKg=="], + "@oxlint/binding-linux-arm64-musl": ["@oxlint/binding-linux-arm64-musl@1.70.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-JiylyurlB0CLSedNtx1gzv3FvfWPF1h/2Y3BJszPLNt5XQFlBsH5ke0Jle3iJb3uqu5m2e7A/DwzpuCAHdiU+A=="], - "@oxlint/binding-linux-ppc64-gnu": ["@oxlint/binding-linux-ppc64-gnu@1.69.0", "", { "os": "linux", "cpu": "ppc64" }, "sha512-Ug0+eU7HJBlek+SjklYH62IlOMirEJsdxpihH0kSqX0XdrDD4NdHpQc10fK1JC35yn6KrrcN+uYzlHD38XAf8Q=="], + "@oxlint/binding-linux-ppc64-gnu": ["@oxlint/binding-linux-ppc64-gnu@1.70.0", "", { "os": "linux", "cpu": "ppc64" }, "sha512-J8VPG7I3/HmgaU4u8pNU2kFx2+0U+vPLS1dXFxXOaR/2TQ0f8AC7DRz0SRGRI1bfphnX2hVYTTtLuhL4nYKL+Q=="], - "@oxlint/binding-linux-riscv64-gnu": ["@oxlint/binding-linux-riscv64-gnu@1.69.0", "", { "os": "linux", "cpu": "none" }, "sha512-iEyI3GIg0l/s3G4qy2TlaaWKdzj4PJJStwtlocpDTC00PY9hZueotf6OKUj9+yfQh0lrpBW/pLMgTztbAHKJEg=="], + "@oxlint/binding-linux-riscv64-gnu": ["@oxlint/binding-linux-riscv64-gnu@1.70.0", "", { "os": "linux", "cpu": "none" }, "sha512-N2+4lV2KLN+oXTIIIwmWDhwkrnvqf5oX7Hw0zPjk+RuIVgiBQSOlJWF7uQoFx2siEYX0ZQ5cfSbEAHm+J3t7Wg=="], - "@oxlint/binding-linux-riscv64-musl": ["@oxlint/binding-linux-riscv64-musl@1.69.0", "", { "os": "linux", "cpu": "none" }, 
"sha512-NjHjpiI4WIKSMwuoJSZi5VToPeoYOS1FR52HLIDG6lidMdqquusgtODb4iLk0+lb1q3Z0nv2/aPRcC/olmpQGg=="], + "@oxlint/binding-linux-riscv64-musl": ["@oxlint/binding-linux-riscv64-musl@1.70.0", "", { "os": "linux", "cpu": "none" }, "sha512-1e2L7cFCvx9QDzq6NPP+0tABKb5z6nWHyddWTNKprEsjO9xNrAtPowuCGpjNXxkTdsMiZ4jc8YQ5SstZd4XK6g=="], - "@oxlint/binding-linux-s390x-gnu": ["@oxlint/binding-linux-s390x-gnu@1.69.0", "", { "os": "linux", "cpu": "s390x" }, "sha512-Ai/prDewoItkDXbp38gwGZi41DycZbUTZJ3UidwoHgQC0/DaqC2TGdtBTQLJ6hSD+SAxASzh8+/eSBPmxfOacA=="], + "@oxlint/binding-linux-s390x-gnu": ["@oxlint/binding-linux-s390x-gnu@1.70.0", "", { "os": "linux", "cpu": "s390x" }, "sha512-Kwu/l/8GcYibCWA9m9N5pRXMIKVSsL/YbgpLzYkqDhWTiqdRfnNJ/+nqIKRKQiFbHWsdlHEhzMwruJK+qcEruA=="], - "@oxlint/binding-linux-x64-gnu": ["@oxlint/binding-linux-x64-gnu@1.69.0", "", { "os": "linux", "cpu": "x64" }, "sha512-Gt3KHgp46mRKz4sJeaASmKvD8ayXookRw07RMf+NowhEztGGDZ7VrXpoW96XuKJLjFukWizOFVNjmYb/u7caNQ=="], + "@oxlint/binding-linux-x64-gnu": ["@oxlint/binding-linux-x64-gnu@1.70.0", "", { "os": "linux", "cpu": "x64" }, "sha512-tap04CsHYOl0nSAQJfPNIuBxqEPB2HnhQqwaOXLg1jnp2XfRo8Fa814dA4QC4zpvTWXCjAAaCY1W5LOORkEQuQ=="], - "@oxlint/binding-linux-x64-musl": ["@oxlint/binding-linux-x64-musl@1.69.0", "", { "os": "linux", "cpu": "x64" }, "sha512-7tQhJ2+p/oHv1zcfnjYI7YVzC/7iBaVOfIvFYtxdJ5F45mWgEdrCyXZXZGfiLey5t/5JhOhsaMnnv1kAzckd7g=="], + "@oxlint/binding-linux-x64-musl": ["@oxlint/binding-linux-x64-musl@1.70.0", "", { "os": "linux", "cpu": "x64" }, "sha512-hzJa/WgvtJpbBD9rgfy0qe+MjbxOXNUT0bfR1S6EQQzfTtBFA9xg5q8KSwRrQ2QfSS+TaP4j+4mVPQrfNc6UNg=="], - "@oxlint/binding-openharmony-arm64": ["@oxlint/binding-openharmony-arm64@1.69.0", "", { "os": "none", "cpu": "arm64" }, "sha512-vmWz6TKp/3hfA4lksR0zHBv/6xuX1jhym6eqOjdH2DXsDDHZWcp2f0KG0VCAnlVbIrjk29G4wAWMXb/Hn1YobA=="], + "@oxlint/binding-openharmony-arm64": ["@oxlint/binding-openharmony-arm64@1.70.0", "", { "os": "none", "cpu": "arm64" }, 
"sha512-xbsaNSNzVSnaJACCUYr1HQMyY/Q/Q1LkePmHG3UvZPvGCYGNxrsZp9OmtA6ick8xH47ltRRbRrPCM1YXYcyC+A=="], - "@oxlint/binding-win32-arm64-msvc": ["@oxlint/binding-win32-arm64-msvc@1.69.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-9RExaLgmaw6IoIkU9cTpT71mLfI0xZ86iZH8x518LVsOkjquJMYqb9P7KpC8lgd1t0Dxs41p2pxynq4XR3Ttzw=="], + "@oxlint/binding-win32-arm64-msvc": ["@oxlint/binding-win32-arm64-msvc@1.70.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-icAEsUI7JbW1TMRdEXV83mVAInhRVQYuuAlPpxdGwJ95chNdnCzjloRW8GglT0WvzOEZSio6fnYSk2DJ2Hv7LQ=="], - "@oxlint/binding-win32-ia32-msvc": ["@oxlint/binding-win32-ia32-msvc@1.69.0", "", { "os": "win32", "cpu": "ia32" }, "sha512-1907kRPF8/PrcIw1E7LMs9JbVrpgnt/MvFdss3an8oDkYNAACXzTntV3t3869ZZhMZxb2AzRGbz1pA/jdFatXA=="], + "@oxlint/binding-win32-ia32-msvc": ["@oxlint/binding-win32-ia32-msvc@1.70.0", "", { "os": "win32", "cpu": "ia32" }, "sha512-FHMSWbVsPVs/f+Jcl04ws4JJ2wUnauyTzlpxWRG/lSO/8GpX08Fo2gQZqdA6CrRFI+zvkxl+N/KwJGWfUwYVZA=="], - "@oxlint/binding-win32-x64-msvc": ["@oxlint/binding-win32-x64-msvc@1.69.0", "", { "os": "win32", "cpu": "x64" }, "sha512-w8SOXv3mT9Fi6jY8OXdXCfnvX/3KNLXGNr4HEz2TA7S4Mv/PYAOmpB8y/ge40mxvBMgGNaSaaDwZpAsQn7HtWA=="], + "@oxlint/binding-win32-x64-msvc": ["@oxlint/binding-win32-x64-msvc@1.70.0", "", { "os": "win32", "cpu": "x64" }, "sha512-ptOlKwCz7n4AKs5VweMqG6DAg677FmKOK+vBkkL9DMNgFATIQ+upqUYBTOEwRQyRAx1ncGlPlXleV2hIcm3z4g=="], "@quansync/fs": ["@quansync/fs@1.0.0", "", { "dependencies": { "quansync": "^1.0.0" } }, "sha512-4TJ3DFtlf1L5LDMaM6CanJ/0lckGNtJcMjQ1NAV6zDmA0tEHKZtxNKin8EgPaVX1YzljbxckyT2tJrpQKAtngQ=="], - "@rolldown/binding-android-arm64": ["@rolldown/binding-android-arm64@1.1.0", "", { "os": "android", "cpu": "arm64" }, "sha512-gCYzGOSkYY6Z034suzd20euvds7lPzMEEla62DJGE/ZAlR4OMBnNbvnBSsIGUCAr52gaWMsloGxP4tVGtN5aCA=="], + "@rolldown/binding-android-arm64": ["@rolldown/binding-android-arm64@1.1.2", "", { "os": "android", "cpu": "arm64" }, 
"sha512-2cZ+7xRS+DBcuJBJKnfzsbleumJhBqSlJVpuzHC0nTqfd3QQ7Vx2/x5YR/D7cBamKSeWplwo82Fn9lqYUDEMfA=="], - "@rolldown/binding-darwin-arm64": ["@rolldown/binding-darwin-arm64@1.1.0", "", { "os": "darwin", "cpu": "arm64" }, "sha512-JQBD77MNgu+4Z6RAyg69acugdrhhVoWesr3l47zohYZ2YV2fwkWMArkN/2p4l6Ei+Sno7W5q+UsKdVWq5Ens0w=="], + "@rolldown/binding-darwin-arm64": ["@rolldown/binding-darwin-arm64@1.1.2", "", { "os": "darwin", "cpu": "arm64" }, "sha512-RkPMJnygxsgOYdkfqgpwY0/Fzm8d0VQe6HGU2/B00Xa9eqdLbrII+DOKAodbJAn3ZL1AJxGHkZRPYazgGY6Ljw=="], - "@rolldown/binding-darwin-x64": ["@rolldown/binding-darwin-x64@1.1.0", "", { "os": "darwin", "cpu": "x64" }, "sha512-p/8cXUTK4Sob604e+xxPhVSbDFf29E6J0l/xESM9rdCfn3aDai3nEs6TnMHUsdD5aNlFz0+gDbiGlozLKGa2YA=="], + "@rolldown/binding-darwin-x64": ["@rolldown/binding-darwin-x64@1.1.2", "", { "os": "darwin", "cpu": "x64" }, "sha512-Uiczh6vFhwyfd7WNe7Q7mCA4KxAiLdz7jPE/WGizfRpIieoyFuNVMmM8HqZ9HwudTkY6/AeMQwlNJ9NJijguWw=="], - "@rolldown/binding-freebsd-x64": ["@rolldown/binding-freebsd-x64@1.1.0", "", { "os": "freebsd", "cpu": "x64" }, "sha512-KbtOSlVv6fElujiZWMcC3aQYhEwLVVf073RcwlSmpGQvIsKZFUqc0ef4sjUuurRwfbiI6JJXji9DQn+86hawmQ=="], + "@rolldown/binding-freebsd-x64": ["@rolldown/binding-freebsd-x64@1.1.2", "", { "os": "freebsd", "cpu": "x64" }, "sha512-+TpdtTRgHiJFjCVFbw311SuLk3KfytPOQQn+VlAEv+gBxYPtL7E6JS9e/tk+8CwxhIZvemJKo4rTKgfWNsKkkA=="], - "@rolldown/binding-linux-arm-gnueabihf": ["@rolldown/binding-linux-arm-gnueabihf@1.1.0", "", { "os": "linux", "cpu": "arm" }, "sha512-9fZ9i0o0/MQaw7om6Z6TsT7tfCk0jtbEFtC+aPqZL5RNsGWNcHvn6EHgL3dAprjq+AZzPTAQjg2JtpJaMt+6pg=="], + "@rolldown/binding-linux-arm-gnueabihf": ["@rolldown/binding-linux-arm-gnueabihf@1.1.2", "", { "os": "linux", "cpu": "arm" }, "sha512-4lv1/tkmi7ueIVHnyreaOeUpiZP26BH9rRy6hoYfR9310A2B9nUEVRDvBx69vx64Nr3eTPPRkyciqJJs+j9Jmw=="], - "@rolldown/binding-linux-arm64-gnu": ["@rolldown/binding-linux-arm64-gnu@1.1.0", "", { "os": "linux", "cpu": "arm64" }, 
"sha512-+tog7T66i+yFyIuuAnjL6xmW182W/qTBOUt6BtQ6lBIM1Eikh/fSMz4HGgvuCp5uU0zuIVWng7kDYthjCMOHcg=="], + "@rolldown/binding-linux-arm64-gnu": ["@rolldown/binding-linux-arm64-gnu@1.1.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-gBSUVO0eaWgw1JMjK3gB8BMlX2Mk148s2lTiVT3e9vjVxbl7UDfMWWY8CfIaaqiXuM9fVTMxIpUz6CAo/B6Vlw=="], - "@rolldown/binding-linux-arm64-musl": ["@rolldown/binding-linux-arm64-musl@1.1.0", "", { "os": "linux", "cpu": "arm64" }, "sha512-4b7yruLIIj/oZ3GpcLOvxcLCLDMraohn3IhQfN2hBP4w9UekG0DTIajWguJosRGfySf/+h/NwRUiMKoCpxCrqQ=="], + "@rolldown/binding-linux-arm64-musl": ["@rolldown/binding-linux-arm64-musl@1.1.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-LjQP/iZLBu8o8PjIfk4x3At0/mT6h282pvz8Z5LAyhGbu/kDezyO7ea62rF5uoqmgnIYqbN/MqJ3Si3Aymi7xQ=="], - "@rolldown/binding-linux-ppc64-gnu": ["@rolldown/binding-linux-ppc64-gnu@1.1.0", "", { "os": "linux", "cpu": "ppc64" }, "sha512-QRDOVZd0bhQ5jLsUsCC3dUxDWdTSVY9WMznowZgCGOrZfLLgctWpelhUASEiBwsXfat/JwYnVd1EaxMhqyT+UQ=="], + "@rolldown/binding-linux-ppc64-gnu": ["@rolldown/binding-linux-ppc64-gnu@1.1.2", "", { "os": "linux", "cpu": "ppc64" }, "sha512-X/7bVLWelEsbyWDUSXt7zVsTniLLPIY2n1rH58qr78l9i7MNbbxBWD8gI2vRfBWf4NUXJCUuQnfZDsp32LqsfQ=="], - "@rolldown/binding-linux-s390x-gnu": ["@rolldown/binding-linux-s390x-gnu@1.1.0", "", { "os": "linux", "cpu": "s390x" }, "sha512-ypxT+Hq76NFG7woFbNbySnGEajFuYuIXeKz/jfCU+lXUoxfi3zLE6OG/ZQNeK3RpZSYJlAe2bokpsQ046CaieQ=="], + "@rolldown/binding-linux-s390x-gnu": ["@rolldown/binding-linux-s390x-gnu@1.1.2", "", { "os": "linux", "cpu": "s390x" }, "sha512-gb6dYKW/1KDorGXyy48glEBJs/sxVSC5pcVrox/pFGV4mvwSFeg2sK5L2tRkVsVlh7kueqOgg4GEcuipJcGuKg=="], - "@rolldown/binding-linux-x64-gnu": ["@rolldown/binding-linux-x64-gnu@1.1.0", "", { "os": "linux", "cpu": "x64" }, "sha512-IdovCmfROFmpTLahdecTDFL74aLERVYN68F/mLZjfVh6LfoplPfI6deyHNMTcVujbokDV5k05XrFO22zfv+qjg=="], + "@rolldown/binding-linux-x64-gnu": ["@rolldown/binding-linux-x64-gnu@1.1.2", "", { "os": "linux", "cpu": "x64" }, 
"sha512-JY4w85pU3iAiJVMh5nuk4/Mh9GjMsupe8MrIN53rwxAZW64GKrWeJBuN6SxQg9QTU5uB1cxyhDzW8jqRn1EABw=="], - "@rolldown/binding-linux-x64-musl": ["@rolldown/binding-linux-x64-musl@1.1.0", "", { "os": "linux", "cpu": "x64" }, "sha512-pcA8xlFp2tyk9T2R6Fi/rPe3bQ1MA+sSMDNUU5Ogu80GHOatkE4P8YCreGAvZErm5Ho2YRXnyvNrWiRncfVysQ=="], + "@rolldown/binding-linux-x64-musl": ["@rolldown/binding-linux-x64-musl@1.1.2", "", { "os": "linux", "cpu": "x64" }, "sha512-xvpA7o5KCYLB0Rwscmuylb1/zHHSUx4g4xilm4prC5jP76pEUlzBmMbgpbh7bVDbId4NcfT96gN5i6mE6UDaiw=="], - "@rolldown/binding-openharmony-arm64": ["@rolldown/binding-openharmony-arm64@1.1.0", "", { "os": "none", "cpu": "arm64" }, "sha512-4+fexHayrLCWpriPh4c6dNvL4an34DEZCG7zOM/FD5QNF6h8DT+bDXzyB/kfC8lDJbaFb7jKShtnjDQFXVQEjg=="], + "@rolldown/binding-openharmony-arm64": ["@rolldown/binding-openharmony-arm64@1.1.2", "", { "os": "none", "cpu": "arm64" }, "sha512-p/ts6KBLjuk49Bp21XH77poQGt02iNz7ChgHep7tudPOaLinR/De/RHdxF8w8Yj4r/bF/bqXwH6PZrB2sA+Nvw=="], - "@rolldown/binding-wasm32-wasi": ["@rolldown/binding-wasm32-wasi@1.1.0", "", { "dependencies": { "@emnapi/core": "1.10.0", "@emnapi/runtime": "1.10.0", "@napi-rs/wasm-runtime": "^1.1.4" }, "cpu": "none" }, "sha512-SbL++MNmOw6QamrwIGDMSSfM4ceTzFr+RjbOExJSLLBinScU4WI5OdA413h1qwPw2yH7lVF1+H4svQ+6mSXKTQ=="], + "@rolldown/binding-wasm32-wasi": ["@rolldown/binding-wasm32-wasi@1.1.2", "", { "dependencies": { "@emnapi/core": "1.11.1", "@emnapi/runtime": "1.11.1", "@napi-rs/wasm-runtime": "^1.1.5" }, "cpu": "none" }, "sha512-VMu/wmrZ9hJzYlRhbw7jK5PODlugyKZ5mOdX78+lS8OvuFkWNQdz1pFLrI2p3P0pjXOmUZ7B48o5VnMH9QOGtg=="], - "@rolldown/binding-win32-arm64-msvc": ["@rolldown/binding-win32-arm64-msvc@1.1.0", "", { "os": "win32", "cpu": "arm64" }, "sha512-+xTE6XC7wBgk0VKRXGG+QAnyW5S9b8vfsFpiMjf0waQTmSQSU8onsH/beyZ8X4aXVveJnotiy7VDjLOaW8bTrg=="], + "@rolldown/binding-win32-arm64-msvc": ["@rolldown/binding-win32-arm64-msvc@1.1.2", "", { "os": "win32", "cpu": "arm64" }, 
"sha512-xtUJqs8qEkuSviS0n1tsohaPuz3a1SPhZywOji4Oo+sgrJs8daEDMZ0QtqL0OS7dx8PoVpg2J/ZZycPY5I2+Zg=="], - "@rolldown/binding-win32-x64-msvc": ["@rolldown/binding-win32-x64-msvc@1.1.0", "", { "os": "win32", "cpu": "x64" }, "sha512-Ogji1TQNqH3ACLnYr+1Ns1nyrJ0CO2P585u9Hsh02pXvtFiFpgtgT2b3P4PnCOU86VVCvqtAeCN4OftMT8KU4w=="], + "@rolldown/binding-win32-x64-msvc": ["@rolldown/binding-win32-x64-msvc@1.1.2", "", { "os": "win32", "cpu": "x64" }, "sha512-85YiLQqjUKgSO/Zjnf9e0XIn5Ymrh1fLDWBeAkZqpuBR/3R8TpfoHXuyblqyQrftSSgWO9qpcHN8mkyKsLraoA=="], "@rolldown/pluginutils": ["@rolldown/pluginutils@1.0.0", "", {}, "sha512-aKs/3GSWyV0mrhNmt/96/Z3yczC3yvrzYATCiCXQebBsGyYzjNdUphRVLeJQ67ySKVXRfMxt2lm12pmXvbPFQQ=="], @@ -276,21 +276,21 @@ "@stll/anonymize-wasm": ["@stll/anonymize-wasm@workspace:packages/anonymize/wasm"], - "@stll/fuzzy-search": ["@stll/fuzzy-search@1.1.2", "", { "optionalDependencies": { "@stll/fuzzy-search-darwin-arm64": "1.1.2", "@stll/fuzzy-search-darwin-x64": "1.1.2", "@stll/fuzzy-search-linux-arm64-gnu": "1.1.2", "@stll/fuzzy-search-linux-x64-gnu": "1.1.2", "@stll/fuzzy-search-wasm32-wasi": "1.1.2", "@stll/fuzzy-search-win32-x64-msvc": "1.1.2" } }, "sha512-0KtL+cnvZebyvo8orkR1Rb4zgUKUDdGB68a4J3lLzMKk9RTOPPOPrmQbVijHZNMD13ZA975pnSnn5ZfzqpubAw=="], + "@stll/fuzzy-search": ["@stll/fuzzy-search@1.1.3", "", { "optionalDependencies": { "@stll/fuzzy-search-darwin-arm64": "1.1.3", "@stll/fuzzy-search-darwin-x64": "1.1.3", "@stll/fuzzy-search-linux-arm64-gnu": "1.1.3", "@stll/fuzzy-search-linux-x64-gnu": "1.1.3", "@stll/fuzzy-search-wasm32-wasi": "1.1.3", "@stll/fuzzy-search-win32-x64-msvc": "1.1.3" } }, "sha512-OAZPMRT2UIrxoEc3Vv9tdzpKOHHLkMaVhqGygQQE6rz5uvbBNs/EWNrD+bEIpYu/AU053LUUltj1dZmymLz19w=="], - "@stll/fuzzy-search-darwin-arm64": ["@stll/fuzzy-search-darwin-arm64@1.1.2", "", { "os": "darwin", "cpu": "arm64" }, "sha512-DyEeolxz2Hnq9DCqsSmBgDivZHPJ/+M80SkEYaqmMfl9rllLlHuNJ7nsjnZOJ47o7Zg1JEzR86wwQRmQbY0o5w=="], + "@stll/fuzzy-search-darwin-arm64": 
["@stll/fuzzy-search-darwin-arm64@1.1.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-xe5qWIRQAgo6GJqS7CQhBw3B496NazlBpQbHUMEt/0qjF1O8fzTkJT7IAYoMyHTu/A/14Cv1MVyZ/WuKSAsC7Q=="], - "@stll/fuzzy-search-darwin-x64": ["@stll/fuzzy-search-darwin-x64@1.1.2", "", { "os": "darwin", "cpu": "x64" }, "sha512-p2YtioZhzzoIM2Ua+rxkfzu1SCp4TCILnWu6YqSX3Lk0QjOXIn0yd3qQVhU+XVrM8f/8Nn4PwWly44iNdYpyuw=="], + "@stll/fuzzy-search-darwin-x64": ["@stll/fuzzy-search-darwin-x64@1.1.3", "", { "os": "darwin", "cpu": "x64" }, "sha512-WrBE0MSoi52bLPiiSjplelVQoPon5QLthEIFBYZE/FKNL237/1SckWgxyHlXcbYHGkdvkV3NzVUTWBhcc+XqKw=="], - "@stll/fuzzy-search-linux-arm64-gnu": ["@stll/fuzzy-search-linux-arm64-gnu@1.1.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-d1ZaTgk/7ys1jwOY7zd85/zDoHbfOYjXDCz/aTRVjvj0c3S0cNwF40TDa29QtJPzCNad7KNeQPPOpTsEiDUZgA=="], + "@stll/fuzzy-search-linux-arm64-gnu": ["@stll/fuzzy-search-linux-arm64-gnu@1.1.3", "", { "os": "linux", "cpu": "arm64" }, "sha512-mLCGuw4uUSFsYyS8+v7rL/aUMII8bMd1XN31wjHsqJeDIpUGD7jmvIVaktssdlbkHtkeAe+G95VYdhfF8cUbhA=="], - "@stll/fuzzy-search-linux-x64-gnu": ["@stll/fuzzy-search-linux-x64-gnu@1.1.2", "", { "os": "linux", "cpu": "x64" }, "sha512-YJcwBqr1HSjt5rQ6v3iZ3SnmF+J1yFNAIG3maZbX5TRmUWYZ/M2adpLP7PO8x5GO6VgfqM+9F0oPqM/8u4UUoQ=="], + "@stll/fuzzy-search-linux-x64-gnu": ["@stll/fuzzy-search-linux-x64-gnu@1.1.3", "", { "os": "linux", "cpu": "x64" }, "sha512-LSeWf0jnVV+g9uGG3b8kTL4PdOz7/XwBnBZIzuzrLW6gkG1uhAnaxYvyGKbaCyy1Bqsu9rxmDhE1RcPSFkkyxQ=="], "@stll/fuzzy-search-wasm": ["@stll/fuzzy-search-wasm@1.1.2", "", { "dependencies": { "@napi-rs/wasm-runtime": "^1.1.3" }, "peerDependencies": { "vite": ">=5 <10" }, "optionalPeers": ["vite"] }, "sha512-fRaiksdcoVgZObYj1Df068y8X3fYrBYaDTuvhV8dWVBZOk+f9NP1jyZY98uv+wakW3O0WYlRvBblbBDou+B6VA=="], - "@stll/fuzzy-search-wasm32-wasi": ["@stll/fuzzy-search-wasm32-wasi@1.1.2", "", { "dependencies": { "@napi-rs/wasm-runtime": "^1.1.3" }, "cpu": "none" }, 
"sha512-vLHOLFl08IyN+hHEypVvT2XHP8dZAYOXRXnJ3LwajHR9HOKoWr7AYZqkc0jkrUousOUozdDfbd2oAL8EPxs44g=="], + "@stll/fuzzy-search-wasm32-wasi": ["@stll/fuzzy-search-wasm32-wasi@1.1.3", "", { "dependencies": { "@napi-rs/wasm-runtime": "^1.1.3" }, "cpu": "none" }, "sha512-Sx4P9K4W1jcqlj07Zd5s9zj0kUWp+GE8V0/5W+hHw89gTMHiDnO172A/dnj3K1BB6F+BFBgioLGChdZy5hH+Vw=="], - "@stll/fuzzy-search-win32-x64-msvc": ["@stll/fuzzy-search-win32-x64-msvc@1.1.2", "", { "os": "win32", "cpu": "x64" }, "sha512-oIyp/GPgIbUaz/AGgPvcCkikO1Po80GAqt5zARhAXs489G0kwrVXoxnF3fPZd8kMvXuOVbQkSFFy257DPDbxjA=="], + "@stll/fuzzy-search-win32-x64-msvc": ["@stll/fuzzy-search-win32-x64-msvc@1.1.3", "", { "os": "win32", "cpu": "x64" }, "sha512-fP7e4B0I2/h3MKIx1Hp7+e/QLnmv4feodEVGKGmfu9P+KbpX8ZzWdBxIEeKnPSd1fGd6F9dmwQ15YC9W+1gtzQ=="], "@stll/oxlint-config": ["@stll/oxlint-config@0.3.0", "", { "peerDependencies": { "oxlint": ">=1.66.0", "oxlint-tsgolint": ">=0.23.0" } }, "sha512-kT4jS/0mgMejp5LUrQ7joHljyphqQr2kk2zGyuJGiJqFz7pHnp9lPJw5WmDyoOqEC34+jkHh0vPeNcZIEiXhog=="], @@ -312,9 +312,9 @@ "@stll/stdnum": ["@stll/stdnum@2.1.1", "", {}, "sha512-VV+9w+u3tLYjos2Z0idJBsl+iCmE171u4rhUNEh/QDqljPBjKETKyLkf81Z1sR0QeaAcn3rg+0Y4vauPVU566w=="], - "@stll/text-search": ["@stll/text-search@1.0.6", "", { "dependencies": { "@stll/aho-corasick": "^1.0.4", "@stll/fuzzy-search": "^1.1.2", "@stll/regex-set": "^1.0.5" } }, "sha512-gjBAD7rssDe7SKMoouRfzourSfI+ssWv/HdiSAUXAJe6SvZgpqI2ePbeCnGzaGj1wij4QI+QgpZpqPWlXDNM+Q=="], + "@stll/text-search": ["@stll/text-search@1.0.7", "", { "dependencies": { "@stll/aho-corasick": "^1.0.4", "@stll/fuzzy-search": "^1.1.3", "@stll/regex-set": "^1.0.5" } }, "sha512-lvAwLKzLUhIToAnmjR0noS7Oa3d+2OFuTxl4BTqX1X6Z1JYphYkVB/hbyQpfGWrjl+LhFJDhOXBPkvTA7Yw39w=="], - "@stll/text-search-wasm": ["@stll/text-search-wasm@1.0.6", "", { "dependencies": { "@stll/aho-corasick-wasm": "^1.0.4", "@stll/fuzzy-search-wasm": "^1.1.2", "@stll/regex-set-wasm": "^1.0.5" }, "peerDependencies": { "vite": ">=5 <10" }, "optionalPeers": 
["vite"] }, "sha512-LRDS557o0U08k4OR3m/lZMuWu8shPtb+yXNKZ7cbgF0wXe4n4ECEqAkjymcGw7gp0fk/At35C0LRphm2cj6eyQ=="], + "@stll/text-search-wasm": ["@stll/text-search-wasm@1.0.7", "", { "dependencies": { "@stll/aho-corasick-wasm": "^1.0.4", "@stll/fuzzy-search-wasm": "^1.1.2", "@stll/regex-set-wasm": "^1.0.5" }, "peerDependencies": { "vite": ">=5 <10" }, "optionalPeers": ["vite"] }, "sha512-b7oUNQ1lhS21DxIpt56j+7hPGFdrBJgTR4yygLGdEXjLjksvf0lXOu1Qgn+NwOSXaqn8O9Rw+R8bfKO0xhs1rg=="], "@stll/typescript-config": ["@stll/typescript-config@0.3.0", "", {}, "sha512-l0dj2IirCUKbSRSJ9Xwfu4vg8SjtfRlg5MEYGrx6lo/q2GLxS6MhFY+5hPOk36bJB99grqgmUtJql9Qg0WxIhw=="], @@ -336,11 +336,11 @@ "@types/jsesc": ["@types/jsesc@2.5.1", "", {}, "sha512-9VN+6yxLOPLOav+7PwjZbxiID2bVaeq0ED4qSQmdQTdjnXJSaCVKTR58t15oqH1H5t8Ng2ZX1SabJVoN9Q34bw=="], - "@types/node": ["@types/node@25.9.3", "", { "dependencies": { "undici-types": ">=7.24.0 <7.24.7" } }, "sha512-603BddQMv3pUcr4U2dhujk83N2tTDVr/34wII2B6bJy6g+8WD6yUb11jszNs0gdi4PesVWl7ABt8nYMVpnLUcg=="], + "@types/node": ["@types/node@25.9.4", "", { "dependencies": { "undici-types": ">=7.24.0 <7.24.7" } }, "sha512-dszCsrKb5U7ZsVZBWiHFklTloVl0mSEnWH/iZXfZUlI4rzCUnsvGmgqfuVRHL54ugE7/wRuxEIXRa2iMZ+BG6g=="], "ansis": ["ansis@4.3.1", "", {}, "sha512-BJ8/l4R5LRE7hW9WdSuGYrLSHi2ynxeFpDFbH0K/CgNeY/tyhk+vO6TYxXC5r5CpUhNVX310xzPsN/H9lCdfOA=="], - "ast-kit": ["ast-kit@3.0.0-beta.1", "", { "dependencies": { "@babel/parser": "^8.0.0-beta.4", "estree-walker": "^3.0.3", "pathe": "^2.0.3" } }, "sha512-trmleAnZ2PxN/loHWVhhx1qeOHSRXq4TDsBBxq3GqeJitfk3+jTQ+v/C1km/KYq9M7wKqCewMh+/NAvVH7m+bw=="], + "ast-kit": ["ast-kit@3.0.0", "", { "dependencies": { "@babel/parser": "^8.0.0", "estree-walker": "^3.0.3", "pathe": "^2.0.3" } }, "sha512-8OG92q3R35qjC/4i6BLBMg8IB+fClWu/1PEwg2Z9Rn+BuNaiEgJzpzn+pxWOdHJWDCAwu2JP0wCDTozAM4QirQ=="], "birpc": ["birpc@4.0.0", "", {}, "sha512-LShSxJP0KTmd101b6DRyGBj57LZxSDYWKitQNW/mi8GRMvZb078Uf9+pveax1DrVL89vm7mWe+TovdI/UDOuPw=="], @@ -420,11 +420,11 @@ 
"nanoid": ["nanoid@3.3.12", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-ZB9RH/39qpq5Vu6Y+NmUaFhQR6pp+M2Xt76XBnEwDaGcVAqhlvxrl3B2bKS5D3NH3QR76v3aSrKaF/Kiy7lEtQ=="], - "obug": ["obug@2.1.1", "", {}, "sha512-uTqF9MuPraAQ+IsnPf366RG4cP9RtUi7MLO1N3KEc+wb0a6yKpeL0lmk2IB1jY5KHPAlTc6T/JRdC/YqxHNwkQ=="], + "obug": ["obug@2.1.3", "", {}, "sha512-9miFgM2OFba7hB+pRgvtV84pYTBaoTHohvmIgiRt6dRIzbwEOIaNaP+dIlGs2fNFoB0SeISs0Jz5WFVRid6Xyg=="], "oxfmt": ["oxfmt@0.54.0", "", { "dependencies": { "tinypool": "2.1.0" }, "optionalDependencies": { "@oxfmt/binding-android-arm-eabi": "0.54.0", "@oxfmt/binding-android-arm64": "0.54.0", "@oxfmt/binding-darwin-arm64": "0.54.0", "@oxfmt/binding-darwin-x64": "0.54.0", "@oxfmt/binding-freebsd-x64": "0.54.0", "@oxfmt/binding-linux-arm-gnueabihf": "0.54.0", "@oxfmt/binding-linux-arm-musleabihf": "0.54.0", "@oxfmt/binding-linux-arm64-gnu": "0.54.0", "@oxfmt/binding-linux-arm64-musl": "0.54.0", "@oxfmt/binding-linux-ppc64-gnu": "0.54.0", "@oxfmt/binding-linux-riscv64-gnu": "0.54.0", "@oxfmt/binding-linux-riscv64-musl": "0.54.0", "@oxfmt/binding-linux-s390x-gnu": "0.54.0", "@oxfmt/binding-linux-x64-gnu": "0.54.0", "@oxfmt/binding-linux-x64-musl": "0.54.0", "@oxfmt/binding-openharmony-arm64": "0.54.0", "@oxfmt/binding-win32-arm64-msvc": "0.54.0", "@oxfmt/binding-win32-ia32-msvc": "0.54.0", "@oxfmt/binding-win32-x64-msvc": "0.54.0" }, "peerDependencies": { "svelte": "^5.0.0", "vite-plus": "*" }, "optionalPeers": ["svelte", "vite-plus"], "bin": { "oxfmt": "bin/oxfmt" } }, "sha512-DjnMwn7smSLF+Mc2+pRItnuPftm/dkUFpY/d4+33y9TfKrsHZo8GLhmUg9BrOIUEy94Rlom1Q11N6vuhE+e0oQ=="], - "oxlint": ["oxlint@1.69.0", "", { "optionalDependencies": { "@oxlint/binding-android-arm-eabi": "1.69.0", "@oxlint/binding-android-arm64": "1.69.0", "@oxlint/binding-darwin-arm64": "1.69.0", "@oxlint/binding-darwin-x64": "1.69.0", "@oxlint/binding-freebsd-x64": "1.69.0", "@oxlint/binding-linux-arm-gnueabihf": "1.69.0", "@oxlint/binding-linux-arm-musleabihf": "1.69.0", 
"@oxlint/binding-linux-arm64-gnu": "1.69.0", "@oxlint/binding-linux-arm64-musl": "1.69.0", "@oxlint/binding-linux-ppc64-gnu": "1.69.0", "@oxlint/binding-linux-riscv64-gnu": "1.69.0", "@oxlint/binding-linux-riscv64-musl": "1.69.0", "@oxlint/binding-linux-s390x-gnu": "1.69.0", "@oxlint/binding-linux-x64-gnu": "1.69.0", "@oxlint/binding-linux-x64-musl": "1.69.0", "@oxlint/binding-openharmony-arm64": "1.69.0", "@oxlint/binding-win32-arm64-msvc": "1.69.0", "@oxlint/binding-win32-ia32-msvc": "1.69.0", "@oxlint/binding-win32-x64-msvc": "1.69.0" }, "peerDependencies": { "oxlint-tsgolint": ">=0.22.1", "vite-plus": "*" }, "optionalPeers": ["oxlint-tsgolint", "vite-plus"], "bin": { "oxlint": "bin/oxlint" } }, "sha512-ypZkK/aDc5NQV8zIR6s2H2Tl3aNW8FmJ1m9+2qsaYuRenl8vgnHNCGwTHviWJdUQzglOlHFchgopdtGhSy17Rw=="], + "oxlint": ["oxlint@1.70.0", "", { "optionalDependencies": { "@oxlint/binding-android-arm-eabi": "1.70.0", "@oxlint/binding-android-arm64": "1.70.0", "@oxlint/binding-darwin-arm64": "1.70.0", "@oxlint/binding-darwin-x64": "1.70.0", "@oxlint/binding-freebsd-x64": "1.70.0", "@oxlint/binding-linux-arm-gnueabihf": "1.70.0", "@oxlint/binding-linux-arm-musleabihf": "1.70.0", "@oxlint/binding-linux-arm64-gnu": "1.70.0", "@oxlint/binding-linux-arm64-musl": "1.70.0", "@oxlint/binding-linux-ppc64-gnu": "1.70.0", "@oxlint/binding-linux-riscv64-gnu": "1.70.0", "@oxlint/binding-linux-riscv64-musl": "1.70.0", "@oxlint/binding-linux-s390x-gnu": "1.70.0", "@oxlint/binding-linux-x64-gnu": "1.70.0", "@oxlint/binding-linux-x64-musl": "1.70.0", "@oxlint/binding-openharmony-arm64": "1.70.0", "@oxlint/binding-win32-arm64-msvc": "1.70.0", "@oxlint/binding-win32-ia32-msvc": "1.70.0", "@oxlint/binding-win32-x64-msvc": "1.70.0" }, "peerDependencies": { "oxlint-tsgolint": ">=0.22.1", "vite-plus": "*" }, "optionalPeers": ["oxlint-tsgolint", "vite-plus"], "bin": { "oxlint": "bin/oxlint" } }, "sha512-D6JgHtzkhRwvEC+A0Nw5AEc5bk8x5i1pHzvZIEf/a0C4hOzmAACNGtkDGPyFaxxX3ZVGxCPeig3P3rMM8XU3/g=="], 
"oxlint-tsgolint": ["oxlint-tsgolint@0.23.0", "", { "optionalDependencies": { "@oxlint-tsgolint/darwin-arm64": "0.23.0", "@oxlint-tsgolint/darwin-x64": "0.23.0", "@oxlint-tsgolint/linux-arm64": "0.23.0", "@oxlint-tsgolint/linux-x64": "0.23.0", "@oxlint-tsgolint/win32-arm64": "0.23.0", "@oxlint-tsgolint/win32-x64": "0.23.0" }, "bin": { "tsgolint": "bin/tsgolint.js" } }, "sha512-3mBv3CoPbh8dFbzfDGIWa2ytZjn2v+3EX4aKRXjIhsoGFzG8GCjfRirz3rwZf1wYbZzsNLTSgpw8VjQuWdp/jA=="], @@ -442,11 +442,11 @@ "resolve-pkg-maps": ["resolve-pkg-maps@1.0.0", "", {}, "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw=="], - "rolldown": ["rolldown@1.1.0", "", { "dependencies": { "@oxc-project/types": "=0.134.0", "@rolldown/pluginutils": "^1.0.0" }, "optionalDependencies": { "@rolldown/binding-android-arm64": "1.1.0", "@rolldown/binding-darwin-arm64": "1.1.0", "@rolldown/binding-darwin-x64": "1.1.0", "@rolldown/binding-freebsd-x64": "1.1.0", "@rolldown/binding-linux-arm-gnueabihf": "1.1.0", "@rolldown/binding-linux-arm64-gnu": "1.1.0", "@rolldown/binding-linux-arm64-musl": "1.1.0", "@rolldown/binding-linux-ppc64-gnu": "1.1.0", "@rolldown/binding-linux-s390x-gnu": "1.1.0", "@rolldown/binding-linux-x64-gnu": "1.1.0", "@rolldown/binding-linux-x64-musl": "1.1.0", "@rolldown/binding-openharmony-arm64": "1.1.0", "@rolldown/binding-wasm32-wasi": "1.1.0", "@rolldown/binding-win32-arm64-msvc": "1.1.0", "@rolldown/binding-win32-x64-msvc": "1.1.0" }, "bin": { "rolldown": "./bin/cli.mjs" } }, "sha512-zpMvlJhs5PkXRTtKc0CaLBVI9AR/VDiJFpM+kx//hgToEca7FgMlGjaRIisXBcb19T76LswgmKECSQ96hjWr5A=="], + "rolldown": ["rolldown@1.1.2", "", { "dependencies": { "@oxc-project/types": "=0.137.0", "@rolldown/pluginutils": "^1.0.0" }, "optionalDependencies": { "@rolldown/binding-android-arm64": "1.1.2", "@rolldown/binding-darwin-arm64": "1.1.2", "@rolldown/binding-darwin-x64": "1.1.2", "@rolldown/binding-freebsd-x64": "1.1.2", "@rolldown/binding-linux-arm-gnueabihf": "1.1.2", 
"@rolldown/binding-linux-arm64-gnu": "1.1.2", "@rolldown/binding-linux-arm64-musl": "1.1.2", "@rolldown/binding-linux-ppc64-gnu": "1.1.2", "@rolldown/binding-linux-s390x-gnu": "1.1.2", "@rolldown/binding-linux-x64-gnu": "1.1.2", "@rolldown/binding-linux-x64-musl": "1.1.2", "@rolldown/binding-openharmony-arm64": "1.1.2", "@rolldown/binding-wasm32-wasi": "1.1.2", "@rolldown/binding-win32-arm64-msvc": "1.1.2", "@rolldown/binding-win32-x64-msvc": "1.1.2" }, "bin": { "rolldown": "./bin/cli.mjs" } }, "sha512-x0CrQQqCXWGeI8dTvFfN/Dnv3yMKT9hv5jFjlOreKAx9wqLq9wz7VvLLHyaAXC90/CpggTu9SisSbsJJTPSjNQ=="], - "rolldown-plugin-dts": ["rolldown-plugin-dts@0.25.2", "", { "dependencies": { "@babel/generator": "8.0.0-rc.6", "@babel/helper-validator-identifier": "8.0.0-rc.6", "@babel/parser": "8.0.0-rc.6", "ast-kit": "^3.0.0-beta.1", "birpc": "^4.0.0", "dts-resolver": "^3.0.0", "get-tsconfig": "5.0.0-beta.5", "obug": "^2.1.1" }, "peerDependencies": { "@ts-macro/tsc": "^0.3.6", "@typescript/native-preview": ">=7.0.0-dev.20260325.1", "rolldown": "^1.0.0", "typescript": "^5.0.0 || ^6.0.0", "vue-tsc": "~3.2.0" }, "optionalPeers": ["@ts-macro/tsc", "@typescript/native-preview", "typescript", "vue-tsc"] }, "sha512-nMhN/R+vmR8GM45ZW1FWMSjRTSDDn/6w4GTf8RNrEFCBdl8B1kySWrU1ixPtbwzXoRlcO+R/S88VgXuJQwfdDg=="], + "rolldown-plugin-dts": ["rolldown-plugin-dts@0.26.0", "", { "dependencies": { "@babel/generator": "^8.0.0", "@babel/helper-validator-identifier": "^8.0.0", "@babel/parser": "^8.0.0", "ast-kit": "^3.0.0", "birpc": "^4.0.0", "dts-resolver": "^3.0.0", "get-tsconfig": "5.0.0-beta.5", "obug": "^2.1.3" }, "peerDependencies": { "@ts-macro/tsc": "^0.3.6", "@typescript/native-preview": ">=7.0.0-dev.20260325.1", "rolldown": "^1.0.0", "typescript": "^5.0.0 || ^6.0.0", "vue-tsc": "~3.2.0 || ~3.3.0" }, "optionalPeers": ["@ts-macro/tsc", "@typescript/native-preview", "typescript", "vue-tsc"] }, "sha512-e+kEPtUiDES0htk5iqkSeF4EzAV7R+vugGB44iPDuw1Kw9E+WyL1VG7PaV0IIjGHLiacztMBcMTyrr8ON9CT1Q=="], - 
"semver": ["semver@7.8.1", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-rkVq3IXh+4FDGch+KwzX3aV9W3kO54GyEgpvBzSyctDA6Xtd7RJQV1xmXbeQp5v7+VzLOfVqiutSE6GICgPFvg=="], + "semver": ["semver@7.8.5", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-Y7/KDsb8LjooZpwaqGyulO6DQlksgCncchHGk+sZIY4SBvUocMBEFH5Ur1fI4dV+Jvl0w6cjvucaIi40puRioA=="], "source-map-js": ["source-map-js@1.2.1", "", {}, "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA=="], @@ -460,7 +460,7 @@ "tree-kill": ["tree-kill@1.2.2", "", { "bin": { "tree-kill": "cli.js" } }, "sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A=="], - "tsdown": ["tsdown@0.22.2", "", { "dependencies": { "ansis": "^4.3.1", "cac": "^7.0.0", "defu": "^6.1.7", "empathic": "^2.0.1", "hookable": "^6.1.1", "import-without-cache": "^0.4.0", "obug": "^2.1.1", "picomatch": "^4.0.4", "rolldown": "~1.1.0", "rolldown-plugin-dts": "^0.25.2", "semver": "^7.8.1", "tinyexec": "^1.2.4", "tinyglobby": "^0.2.17", "tree-kill": "^1.2.2", "unconfig-core": "^7.5.0" }, "peerDependencies": { "@arethetypeswrong/core": "^0.18.1", "@tsdown/css": "0.22.2", "@tsdown/exe": "0.22.2", "@vitejs/devtools": "*", "publint": "^0.3.8", "tsx": "*", "typescript": "^5.0.0 || ^6.0.0", "unplugin-unused": "^0.5.0", "unrun": "*" }, "optionalPeers": ["@arethetypeswrong/core", "@tsdown/css", "@tsdown/exe", "@vitejs/devtools", "publint", "tsx", "typescript", "unplugin-unused", "unrun"], "bin": { "tsdown": "./dist/run.mjs" } }, "sha512-VX9gsyKXsTnBZjnIM4jsHl9aRv+GfgkE/k1hQslilaBfZMlaw3JuGR+6yhiU0QxWBtOCDnTjwOSoXzgB7Rr50g=="], + "tsdown": ["tsdown@0.22.3", "", { "dependencies": { "ansis": "^4.3.1", "cac": "^7.0.0", "defu": "^6.1.7", "empathic": "^2.0.1", "hookable": "^6.1.1", "import-without-cache": "^0.4.0", "obug": "^2.1.3", "picomatch": "^4.0.4", "rolldown": "~1.1.1", "rolldown-plugin-dts": "^0.26.0", "semver": "^7.8.4", "tinyexec": "^1.2.4", "tinyglobby": "^0.2.17", "tree-kill": 
"^1.2.2", "unconfig-core": "^7.5.0" }, "peerDependencies": { "@arethetypeswrong/core": "^0.18.1", "@tsdown/css": "0.22.3", "@tsdown/exe": "0.22.3", "@vitejs/devtools": "*", "publint": "^0.3.8", "tsx": "*", "typescript": "^5.0.0 || ^6.0.0", "unplugin-unused": "^0.5.0", "unrun": "*" }, "optionalPeers": ["@arethetypeswrong/core", "@tsdown/css", "@tsdown/exe", "@vitejs/devtools", "publint", "tsx", "typescript", "unplugin-unused", "unrun"], "bin": { "tsdown": "./dist/run.mjs" } }, "sha512-louqbfA8Qf//B9jTTL0FPtXTNpjCWv1VPkbcmQMph2pTpzs+LnB1tbe4tDDRVpo2BjF5SgUXaTZe45SxB8pWHg=="], "tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], @@ -474,13 +474,23 @@ "vite": ["vite@8.0.16", "", { "dependencies": { "lightningcss": "^1.32.0", "picomatch": "^4.0.4", "postcss": "^8.5.15", "rolldown": "1.0.3", "tinyglobby": "^0.2.17" }, "optionalDependencies": { "fsevents": "~2.3.3" }, "peerDependencies": { "@types/node": "^20.19.0 || >=22.12.0", "@vitejs/devtools": "^0.1.18", "esbuild": "^0.27.0 || ^0.28.0", "jiti": ">=1.21.0", "less": "^4.0.0", "sass": "^1.70.0", "sass-embedded": "^1.70.0", "stylus": ">=0.54.8", "sugarss": "^5.0.0", "terser": "^5.16.0", "tsx": "^4.8.1", "yaml": "^2.4.2" }, "optionalPeers": ["@types/node", "@vitejs/devtools", "esbuild", "jiti", "less", "sass", "sass-embedded", "stylus", "sugarss", "terser", "tsx", "yaml"], "bin": { "vite": "bin/vite.js" } }, "sha512-h9bXPmJichP5fLmVQo3PyaGSDE2n3aPuomeAlVRm0JLmt4rY6zmPKd59HYI4LNW8oTK7tlTsuC7l/m7awx9Jcw=="], - "ast-kit/@babel/parser": ["@babel/parser@8.0.0-rc.3", "", { "dependencies": { "@babel/types": "^8.0.0-rc.3" }, "bin": "./bin/babel-parser.js" }, "sha512-B20dvP3MfNc/XS5KKCHy/oyWl5IA6Cn9YjXRdDlCjNmUFrjvLXMNUfQq/QUy9fnG2gYkKKcrto2YaF9B32ToOQ=="], + "@rolldown/binding-wasm32-wasi/@emnapi/core": ["@emnapi/core@1.11.1", "", { "dependencies": { "@emnapi/wasi-threads": "1.2.2", "tslib": "^2.4.0" } }, 
"sha512-RSvbQmHzdKzNsLYa/wHrbc3KN4sYLKAdPZxqiM2HATqv/SBk2/ENSHpvXGaLOMcsAyz0poEGqkmmKYG3OWiJEQ=="], + + "@rolldown/binding-wasm32-wasi/@emnapi/runtime": ["@emnapi/runtime@1.11.1", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-vgj7R3y3Wgx24IQaGPA/R6YFXLHVMOZ0uVEyIQPaWs+rd1AzfEMXlAC22FYwO1XkKR6NPsq7mUandH8oIRdZFw=="], + + "@rolldown/binding-wasm32-wasi/@napi-rs/wasm-runtime": ["@napi-rs/wasm-runtime@1.1.5", "", { "dependencies": { "@tybys/wasm-util": "^0.10.2" }, "peerDependencies": { "@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1" } }, "sha512-AWPoBRJ9tsnVhor4sjO7rkni+7p+2IAEFj6cx06UgP10jkQHqay/36uRV/bFkgrh18D9vb4cr8Q0Pthskgzy+Q=="], + + "@stll/fuzzy-search-wasm32-wasi/@napi-rs/wasm-runtime": ["@napi-rs/wasm-runtime@1.1.5", "", { "dependencies": { "@tybys/wasm-util": "^0.10.2" }, "peerDependencies": { "@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1" } }, "sha512-AWPoBRJ9tsnVhor4sjO7rkni+7p+2IAEFj6cx06UgP10jkQHqay/36uRV/bFkgrh18D9vb4cr8Q0Pthskgzy+Q=="], "bun-types/@types/node": ["@types/node@25.5.0", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-jp2P3tQMSxWugkCUKLRPVUpGaL5MVFwF8RDuSRztfwgN1wmqJeMSbKlnEtQqU8UrhTmzEmZdu2I6v2dpp7XIxw=="], "vite/rolldown": ["rolldown@1.0.3", "", { "dependencies": { "@oxc-project/types": "=0.133.0", "@rolldown/pluginutils": "^1.0.0" }, "optionalDependencies": { "@rolldown/binding-android-arm64": "1.0.3", "@rolldown/binding-darwin-arm64": "1.0.3", "@rolldown/binding-darwin-x64": "1.0.3", "@rolldown/binding-freebsd-x64": "1.0.3", "@rolldown/binding-linux-arm-gnueabihf": "1.0.3", "@rolldown/binding-linux-arm64-gnu": "1.0.3", "@rolldown/binding-linux-arm64-musl": "1.0.3", "@rolldown/binding-linux-ppc64-gnu": "1.0.3", "@rolldown/binding-linux-s390x-gnu": "1.0.3", "@rolldown/binding-linux-x64-gnu": "1.0.3", "@rolldown/binding-linux-x64-musl": "1.0.3", "@rolldown/binding-openharmony-arm64": "1.0.3", "@rolldown/binding-wasm32-wasi": "1.0.3", "@rolldown/binding-win32-arm64-msvc": "1.0.3", 
"@rolldown/binding-win32-x64-msvc": "1.0.3" }, "bin": { "rolldown": "./bin/cli.mjs" } }, "sha512-i00lAJ2ks1BYr7rjNjKC7BcqAS7nVfiT3QX1SI5aY+AFHblCmaUf9OE9dbdzDvW6dJxbi2ZCZiy9v3CcwOiX3g=="], - "ast-kit/@babel/parser/@babel/types": ["@babel/types@8.0.0-rc.3", "", { "dependencies": { "@babel/helper-string-parser": "^8.0.0-rc.3", "@babel/helper-validator-identifier": "^8.0.0-rc.3" } }, "sha512-mOm5ZrYmphGfqVWoH5YYMTITb3cDXsFgmvFlvkvWDMsR9X8RFnt7a0Wb6yNIdoFsiMO9WjYLq+U/FMtqIYAF8Q=="], + "@rolldown/binding-wasm32-wasi/@emnapi/core/@emnapi/wasi-threads": ["@emnapi/wasi-threads@1.2.2", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-c95qOXkHdydNKhscBTebqEC1CVAZpyqOfVfBzQ1qgzyl3gfeldUjIggDbIZgDKsHLgnsM+igH7TJ/eAasaVuMA=="], + + "@rolldown/binding-wasm32-wasi/@napi-rs/wasm-runtime/@tybys/wasm-util": ["@tybys/wasm-util@0.10.2", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-RoBvJ2X0wuKlWFIjrwffGw1IqZHKQqzIchKaadZZfnNpsAYp2mM0h36JtPCjNDAHGgYez/15uMBpfGwchhiMgg=="], + + "@stll/fuzzy-search-wasm32-wasi/@napi-rs/wasm-runtime/@tybys/wasm-util": ["@tybys/wasm-util@0.10.2", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-RoBvJ2X0wuKlWFIjrwffGw1IqZHKQqzIchKaadZZfnNpsAYp2mM0h36JtPCjNDAHGgYez/15uMBpfGwchhiMgg=="], "bun-types/@types/node/undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="], @@ -515,9 +525,5 @@ "vite/rolldown/@rolldown/binding-win32-arm64-msvc": ["@rolldown/binding-win32-arm64-msvc@1.0.3", "", { "os": "win32", "cpu": "arm64" }, "sha512-gEdFFEN70A/jxb2svrWsN3aDL7OUtmvlOy+6fa2jxG8K0wQ1ZbdeLGnidov6Yu5/733dI5ySfzFlQ/cb0bSz1g=="], "vite/rolldown/@rolldown/binding-win32-x64-msvc": ["@rolldown/binding-win32-x64-msvc@1.0.3", "", { "os": "win32", "cpu": "x64" }, "sha512-eXB7CHuaQdqmJcc3koCNtNPmT/bj2gc999kUFgBxG8Ac0NdgXc4rkCHhqrgrhN3zddvvvrgzj1e90SuSfmyIXA=="], - - "ast-kit/@babel/parser/@babel/types/@babel/helper-string-parser": 
["@babel/helper-string-parser@8.0.0-rc.3", "", {}, "sha512-AmwWFx1m8G/a5cXkxLxTiWl+YEoWuoFLUCwqMlNuWO1tqAYITQAbCRPUkyBHv1VOFgfjVOqEj6L3u15J5ZCzTA=="], - - "ast-kit/@babel/parser/@babel/types/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@8.0.0-rc.3", "", {}, "sha512-8AWCJ2VJJyDFlGBep5GpaaQ9AAaE/FjAcrqI7jyssYhtL7WGV0DOKpJsQqM037xDbpRLHXsY8TwU7zDma7coOw=="], } } diff --git a/bunfig.toml b/bunfig.toml index d9275b36..85c08c2d 100644 --- a/bunfig.toml +++ b/bunfig.toml @@ -3,7 +3,7 @@ linker = "hoisted" # 5-day quarantine: reject package versions published # less than 5 days ago to mitigate supply chain attacks. minimumReleaseAge = 432_000 -# First-party Stella packages ship in coordinated release +# First-party stella packages ship in coordinated release # waves, so allow fresh internal versions immediately. minimumReleaseAgeExcludes = [ "@stll/typescript-config", diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 00000000..251b36f0 --- /dev/null +++ b/clippy.toml @@ -0,0 +1,26 @@ +# Portable stella Rust lint configuration. Keep repo-specific wrappers and +# exceptions in consuming repositories. 
+ +disallowed-macros = [ + { path = "std::dbg", reason = "remove debug output before commit" }, + { path = "std::print", reason = "library crates should return data or use structured logging" }, + { path = "std::println", reason = "library crates should return data or use structured logging" }, + { path = "std::eprint", reason = "library crates should return data or use structured logging" }, + { path = "std::eprintln", reason = "library crates should return data or use structured logging" }, +] + +disallowed-methods = [ + { path = "std::mem::forget", reason = "leaks ownership; use a narrow ownership type or ManuallyDrop with justification" }, + { path = "std::string::String::from_utf8_lossy", reason = "lossy decoding can corrupt offsets and user data; validate or keep bytes" }, +] + +disallowed-types = [ + { path = "std::collections::LinkedList", reason = "prefer Vec, VecDeque, or an explicit intrusive structure" }, +] + +pass-by-value-size-limit = 64 +stack-size-threshold = 131072 +enum-variant-size-threshold = 128 +too-large-for-stack = 4096 +avoid-breaking-exported-api = false +accept-comment-above-attributes = true diff --git a/crates/anonymize-adapter-contract/Cargo.toml b/crates/anonymize-adapter-contract/Cargo.toml new file mode 100644 index 00000000..9a1728e8 --- /dev/null +++ b/crates/anonymize-adapter-contract/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "stella-anonymize-adapter-contract" +version.workspace = true +edition.workspace = true +description = "Shared adapter contract for stella anonymization bindings" +license.workspace = true +publish.workspace = true +repository.workspace = true + +[dependencies] +bincode = { version = "2", features = ["serde"] } +blake3 = "1" +serde = { version = "1", features = ["derive"] } +stella-anonymize-core = { path = "../anonymize-core" } +zstd = "0.13" + +[dev-dependencies] +serde_json = "1" + +[lints] +workspace = true diff --git a/crates/anonymize-adapter-contract/examples/native_adapter_parity.rs 
b/crates/anonymize-adapter-contract/examples/native_adapter_parity.rs new file mode 100644 index 00000000..0959a767 --- /dev/null +++ b/crates/anonymize-adapter-contract/examples/native_adapter_parity.rs @@ -0,0 +1,59 @@ +#![allow(clippy::print_stdout)] + +use std::{env, fs, io::Write}; + +use serde::Deserialize; +use stella_anonymize_adapter_contract::{ + BindingOperatorConfig, BindingPreparedSearchConfig, + BindingStaticRedactionResult, operator_config_from_binding, + prepared_search_config_from_binding, + static_redaction_result_to_utf16_binding, +}; +use stella_anonymize_core::PreparedSearch; + +#[derive(Deserialize)] +struct Payload { + config_json: String, + cases: Vec, +} + +#[derive(Deserialize)] +struct Case { + text: String, + operators_json: Option, +} + +fn main() -> Result<(), Box> { + let payload_path = env::var("STELLA_ANONYMIZE_PARITY_PAYLOAD")?; + let payload = fs::read_to_string(payload_path)?; + let payload = serde_json::from_str::(&payload)?; + let config = + serde_json::from_str::(&payload.config_json)?; + let prepared = + PreparedSearch::new(prepared_search_config_from_binding(config)?)?; + let results = payload + .cases + .iter() + .map(|case| run_case(&prepared, case)) + .collect::, _>>()?; + + let mut stdout = std::io::stdout().lock(); + writeln!(stdout, "{}", serde_json::to_string(&results)?)?; + Ok(()) +} + +fn run_case( + prepared: &PreparedSearch, + case: &Case, +) -> Result> { + let operators = case + .operators_json + .as_deref() + .map(serde_json::from_str::) + .transpose()?; + let operators = operator_config_from_binding(operators)?; + let result = prepared.redact_static_entities(&case.text, &operators)?; + Ok(static_redaction_result_to_utf16_binding( + result, &case.text, + )?) 
+} diff --git a/crates/anonymize-adapter-contract/examples/native_adapter_perf.rs b/crates/anonymize-adapter-contract/examples/native_adapter_perf.rs new file mode 100644 index 00000000..528de76d --- /dev/null +++ b/crates/anonymize-adapter-contract/examples/native_adapter_perf.rs @@ -0,0 +1,78 @@ +#![allow(clippy::print_stdout)] + +use std::env; +use std::io::Write; +use std::time::Instant; + +use serde::Deserialize; +use serde_json::json; +use stella_anonymize_adapter_contract::{ + BindingOperatorConfig, BindingPreparedSearchConfig, + operator_config_from_binding, prepared_search_config_from_binding, +}; +use stella_anonymize_core::PreparedSearch; + +#[derive(Deserialize)] +struct Payload { + config_json: String, + iterations: usize, + cases: Vec, +} + +#[derive(Deserialize)] +struct Case { + text: String, + operators_json: Option, +} + +fn main() -> Result<(), Box> { + let payload = env::var("STELLA_ANONYMIZE_PERF_PAYLOAD")?; + let payload = serde_json::from_str::(&payload)?; + let config = + serde_json::from_str::(&payload.config_json)?; + + let prepare_start = Instant::now(); + let prepared = + PreparedSearch::new(prepared_search_config_from_binding(config)?)?; + let prepare_ms = elapsed_ms(prepare_start); + + let run_cases = payload + .cases + .iter() + .map(|item| -> Result<_, Box> { + let operators = item + .operators_json + .as_deref() + .map(serde_json::from_str::) + .transpose()?; + let operators = operator_config_from_binding(operators)?; + Ok((item.text.as_str(), operators)) + }) + .collect::, _>>()?; + + let run_start = Instant::now(); + let mut entity_count = 0_usize; + for _ in 0..payload.iterations { + for (text, operators) in &run_cases { + let result = prepared.redact_static_entities(text, operators)?; + entity_count = entity_count.saturating_add(result.redaction.entity_count); + } + } + let run_ms = elapsed_ms(run_start); + + let mut stdout = std::io::stdout().lock(); + writeln!( + stdout, + "{}", + json!({ + "prepareMs": prepare_ms, + "runMs": 
run_ms, + "entityCount": entity_count, + }) + )?; + Ok(()) +} + +fn elapsed_ms(start: Instant) -> f64 { + start.elapsed().as_secs_f64() * 1_000.0 +} diff --git a/crates/anonymize-adapter-contract/src/lib.rs b/crates/anonymize-adapter-contract/src/lib.rs new file mode 100644 index 00000000..feb5b2b7 --- /dev/null +++ b/crates/anonymize-adapter-contract/src/lib.rs @@ -0,0 +1,2613 @@ +use std::borrow::Cow; +use std::collections::{BTreeMap, BTreeSet}; + +use serde::{Deserialize, Serialize}; +use stella_anonymize_core::{ + AddressContextData, AddressSeedData, AmountWordsData, CoreferenceData, + CoreferencePatternData, CountryMatchData, CurrencyData, DateData, + DenyListFilterData, DenyListMatchData, DetectionSource, DiagnosticEvent, + DiagnosticEventKind, DiagnosticStage, FuzzySearchOptions, GazetteerMatchData, + HotwordRule, HotwordRuleData, LegalFormData, LiteralSearchOptions, + MagnitudeSuffixData, MonetaryData, NameCorpusData, OperatorConfig, + OperatorType, PatternSlice, PreparedSearchConfig, PreparedSearchSlices, + RegexMatchMeta, RegexSearchOptions, SearchEngine, SearchOptions, + SearchPattern, ShareQuantityTermData, SigningPlaceGuardData, SourceDetail, + StaticRedactionDiagnosticResult, StaticRedactionDiagnostics, + StaticRedactionResult, StringGroups, TriggerData, TriggerRule, + TriggerStrategy, TriggerValidation, WrittenAmountPatternData, ZoneData, + ZonePatternData, ZoneSigningClauseData, +}; + +pub type Result = std::result::Result; + +const PREPARED_SEARCH_PACKAGE_HEADER: [u8; 8] = *b"ANONPKG1"; +const PREPARED_SEARCH_PACKAGE_VERSION: u32 = 11; +const PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONPKZ1"; +const PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION: u32 = 9; +const PREPARED_SEARCH_CORE_PACKAGE_HEADER: [u8; 8] = *b"ANONCPK1"; +const PREPARED_SEARCH_CORE_PACKAGE_VERSION: u32 = 10; +const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER: [u8; 8] = *b"ANONCPZ1"; +const PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION: u32 = 10; +const 
PREPARED_SEARCH_PACKAGE_DIGEST_BYTES: usize = 32; +const PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL: i32 = 3; +const MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES: usize = 256 * 1024 * 1024; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ContractError { + CompactStringIndexOutOfRange { field: &'static str, index: u32 }, + FuzzyDistanceOutOfRange { distance: u32 }, + InvalidCompactStringGroups { field: &'static str, reason: String }, + InvalidBindingOffset { offset: u32 }, + InvalidPreparedSearchPackage { reason: String }, + MissingDenyListDataForLiteralPatterns, + UnsupportedOperator { value: String }, + UnsupportedSearchPatternKind { kind: String }, + UnsupportedSourceDetail { value: String }, +} + +impl std::fmt::Display for ContractError { + fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::CompactStringIndexOutOfRange { field, index } => { + write!( + formatter, + "Compact string index out of range in {field}: {index}" + ) + } + Self::FuzzyDistanceOutOfRange { distance } => { + write!(formatter, "Fuzzy distance exceeds u8 range: {distance}") + } + Self::InvalidCompactStringGroups { field, reason } => { + write!( + formatter, + "Compact string groups are invalid in {field}: {reason}" + ) + } + Self::InvalidBindingOffset { offset } => { + write!( + formatter, + "Byte offset is not on a character boundary: {offset}" + ) + } + Self::InvalidPreparedSearchPackage { reason } => { + write!(formatter, "Prepared search package is invalid: {reason}") + } + Self::MissingDenyListDataForLiteralPatterns => formatter.write_str( + "Deny-list data is required when literal patterns are derived from it", + ), + Self::UnsupportedOperator { value } => { + write!(formatter, "Unsupported anonymization operator: {value}") + } + Self::UnsupportedSearchPatternKind { kind } => { + write!(formatter, "Unsupported search pattern kind: {kind}") + } + Self::UnsupportedSourceDetail { value } => { + write!(formatter, "Unsupported source detail: 
{value}") + } + } + } +} + +impl std::error::Error for ContractError {} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingSearchPattern { + pub kind: String, + pub pattern: String, + pub distance: Option, + pub case_insensitive: Option, + pub whole_words: Option, + pub lazy: Option, + pub prefilter_any: Option>, + pub prefilter_case_insensitive: Option, + pub prefilter_regex: Option, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingSearchOptions { + pub literal_case_insensitive: Option, + pub literal_whole_words: Option, + pub regex_whole_words: Option, + pub regex_overlap_all: Option, + pub fuzzy_case_insensitive: Option, + pub fuzzy_whole_words: Option, + pub fuzzy_normalize_diacritics: Option, +} + +#[derive( + Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize, +)] +pub struct BindingPatternSlice { + pub start: u32, + pub end: u32, +} + +#[derive( + Clone, Copy, Debug, Default, Deserialize, Eq, PartialEq, Serialize, +)] +pub struct BindingPreparedSearchSlices { + pub regex: Option, + pub custom_regex: Option, + pub legal_forms: Option, + pub triggers: Option, + pub deny_list: Option, + pub street_types: Option, + pub gazetteer: Option, + pub countries: Option, + pub hotwords: Option, +} + +#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] +pub struct BindingRegexMatchMeta { + pub label: String, + pub score: f64, + pub source_detail: Option, + pub requires_validation: Option, + pub validator_id: Option, + pub validator_input: Option, + pub min_byte_length: Option, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingGazetteerMatchData { + pub labels: Vec, + pub is_fuzzy: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingCountryMatchData { + pub labels: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] +pub struct 
BindingHotwordRuleData { + #[serde(default)] + pub rules: Vec, + #[serde(default)] + pub pattern_rule_indices: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] +pub struct BindingHotwordRule { + #[serde(default)] + pub hotwords: Vec, + #[serde(default)] + pub target_labels: Vec, + pub score_adjustment: f64, + pub reclassify_to: Option, + pub proximity_before: u32, + pub proximity_after: u32, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingTriggerData { + pub rules: Vec, + #[serde(default)] + pub address_stop_keywords: Vec, + #[serde(default)] + pub party_position_terms: Vec, + #[serde(default)] + pub post_nominals: Vec, + #[serde(default)] + pub sentence_terminal_currency_terms: Vec, +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingTriggerRule { + pub trigger: String, + pub label: String, + pub strategy: BindingTriggerStrategy, + pub validations: Vec, + pub include_trigger: bool, +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +#[serde(tag = "type", rename_all = "kebab-case")] +pub enum BindingTriggerStrategy { + ToNextComma { + #[serde(default)] + stop_words: Vec, + max_length: Option, + }, + ToEndOfLine, + NWords { + count: u32, + }, + CompanyIdValue, + Address { + max_chars: Option, + }, + MatchPattern { + pattern: String, + flags: Option, + }, +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +#[serde(tag = "type", rename_all = "kebab-case")] +pub enum BindingTriggerValidation { + StartsUppercase, + MinLength { + min: u32, + }, + MaxLength { + max: u32, + }, + NoDigits, + HasDigits, + MatchesPattern { + pattern: String, + flags: Option, + }, + ValidId { + validator: String, + }, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingLegalFormData { + #[serde(default)] + pub suffixes: Vec, + #[serde(default)] + pub normalized_boundary_suffixes: Vec, + 
#[serde(default)] + pub normalized_in_name_words: Vec, + #[serde(default)] + pub normalized_suffix_words: Vec, + #[serde(default)] + pub role_heads: Vec, + #[serde(default)] + pub sentence_verb_indicators: Vec, + #[serde(default)] + pub clause_noun_heads: Vec, + #[serde(default)] + pub connector_prose_heads: Vec, + #[serde(default)] + pub structural_single_cap_prefixes: Vec, + #[serde(default)] + pub leading_clause_phrases: Vec, + #[serde(default)] + pub leading_clause_direct_prefixes: Vec, + #[serde(default)] + pub connector_words: Vec, + #[serde(default)] + pub and_connector_words: Vec, + #[serde(default)] + pub in_name_prepositions: Vec, + #[serde(default)] + pub company_suffix_words: Vec, + #[serde(default)] + pub comma_gated_direct_prefixes: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingDateData { + #[serde(default)] + pub month_names_by_language: BTreeMap>, + #[serde(default)] + pub year_words_by_language: BTreeMap>, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingMonetaryData { + #[serde(default)] + pub currencies: BindingCurrencyData, + #[serde(default)] + pub amount_words: BindingAmountWordsData, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingCurrencyData { + #[serde(default)] + pub codes: Vec, + #[serde(default)] + pub symbols: Vec, + #[serde(default)] + pub local_names: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingAmountWordsData { + #[serde(default)] + pub written_amount_patterns: Vec, + #[serde(default)] + pub magnitude_suffixes: Vec, + #[serde(default)] + pub share_quantity_terms: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingWrittenAmountPatternData { + #[serde(default)] + pub keywords: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct 
BindingMagnitudeSuffixData { + #[serde(default)] + pub words: Vec, + #[serde(default)] + pub abbreviations_case_insensitive: Vec, + #[serde(default)] + pub abbreviations_case_sensitive: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingShareQuantityTermData { + #[serde(default)] + pub modifiers: Vec, + #[serde(default)] + pub nouns: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingAddressSeedData { + #[serde(default)] + pub boundary_words: Vec, + #[serde(default)] + pub br_cep_cue_words: Vec, + #[serde(default)] + pub unit_abbreviations: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingAddressContextData { + #[serde(default)] + pub address_prepositions: Vec, + #[serde(default)] + pub temporal_prepositions: Vec, + #[serde(default)] + pub street_abbreviations: Vec, + #[serde(default)] + pub bare_house_stopwords: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingZoneData { + #[serde(default)] + pub section_heading_patterns: Vec, + #[serde(default)] + pub signing_clauses: Vec, +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingZonePatternData { + pub pattern: String, + #[serde(default)] + pub flags: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingZoneSigningClauseData { + #[serde(default)] + pub prefix: String, + #[serde(default)] + pub suffix: String, + #[serde(default)] + pub prepositions: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingCoreferenceData { + #[serde(default)] + pub definition_patterns: Vec, + #[serde(default)] + pub role_stop_terms: Vec, + #[serde(default)] + pub legal_form_aliases: Vec, + #[serde(default)] + pub organization_suffixes: Vec, + #[serde(default)] + pub organization_determiners: Vec, +} + 
+#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingCoreferencePatternData { + pub pattern: String, + #[serde(default)] + pub flags: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingNameCorpusData { + #[serde(default)] + pub first_names: Vec, + #[serde(default)] + pub surnames: Vec, + #[serde(default)] + pub title_tokens: Vec, + #[serde(default)] + pub title_abbreviations: Vec, + #[serde(default)] + pub excluded_words: Vec, + #[serde(default)] + pub common_words: Vec, + #[serde(default)] + pub non_western_names: Vec, + #[serde(default)] + pub excluded_all_caps: Vec, + #[serde(default)] + pub ja_suffixes: Vec, + #[serde(default)] + pub arabic_connectors: Vec, + #[serde(default)] + pub relation_connectors: Vec, + #[serde(default)] + pub hyphenated_prefixes: Vec, + #[serde(default)] + pub cjk_non_person_terms: Vec, + #[serde(default)] + pub cjk_surname_starters: Vec, + #[serde(default)] + pub organization_terms: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingDenyListMatchData { + #[serde(default)] + pub labels: Vec>, + #[serde(default)] + pub label_table: Vec, + #[serde(default)] + pub label_indices: Vec>, + #[serde(default)] + pub custom_labels: Vec>, + #[serde(default)] + pub custom_label_indices: Vec>, + pub originals: Vec, + #[serde(default)] + pub sources: Vec>, + #[serde(default)] + pub source_table: Vec, + #[serde(default)] + pub source_indices: Vec>, + pub filters: Option, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingDenyListFilterData { + pub stopwords: Vec, + pub allow_list: Vec, + pub person_stopwords: Vec, + #[serde(default)] + pub person_trailing_nouns: Vec, + pub address_stopwords: Vec, + #[serde(default)] + pub address_jurisdiction_prefixes: Vec, + pub street_types: Vec, + #[serde(default)] + pub address_component_terms: Vec, + #[serde(default)] + pub 
ambiguous_street_type_terms: Vec, + pub first_names: Vec, + pub generic_roles: Vec, + #[serde(default)] + pub number_abbrev_prefixes: Vec, + pub sentence_starters: Vec, + pub trailing_address_word_exclusions: Vec, + #[serde(default)] + pub document_heading_words: Vec, + #[serde(default)] + pub document_heading_ordinal_markers: Vec, + pub defined_term_cues: Vec, + #[serde(default)] + pub signing_place_guards: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] +pub struct BindingSigningPlaceGuardData { + #[serde(default)] + pub prefix_phrases: Vec, + #[serde(default)] + pub suffix_phrases: Vec, +} + +#[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] +pub struct BindingPreparedSearchConfig { + #[serde(default)] + pub regex_patterns: Vec, + #[serde(default)] + pub custom_regex_patterns: Vec, + #[serde(default)] + pub literal_patterns: Vec, + #[serde(default)] + pub regex_options: Option, + #[serde(default)] + pub custom_regex_options: Option, + #[serde(default)] + pub literal_options: Option, + #[serde(default)] + pub literal_patterns_from_deny_list_data: bool, + #[serde(default)] + pub allowed_labels: Vec, + #[serde(default)] + pub threshold: f64, + #[serde(default)] + pub confidence_boost: bool, + #[serde(default)] + pub slices: BindingPreparedSearchSlices, + #[serde(default)] + pub regex_meta: Vec, + #[serde(default)] + pub custom_regex_meta: Vec, + #[serde(default)] + pub deny_list_data: Option, + #[serde(default)] + pub false_positive_filters: Option, + #[serde(default)] + pub gazetteer_data: Option, + #[serde(default)] + pub country_data: Option, + #[serde(default)] + pub hotword_data: Option, + #[serde(default)] + pub trigger_data: Option, + #[serde(default)] + pub legal_form_data: Option, + #[serde(default)] + pub address_seed_data: Option, + #[serde(default)] + pub zone_data: Option, + #[serde(default)] + pub address_context_data: Option, + #[serde(default)] + pub coreference_data: Option, + #[serde(default)] + 
pub name_corpus_data: Option, + #[serde(default)] + pub date_data: Option, + #[serde(default)] + pub monetary_data: Option, +} + +#[derive(Deserialize)] +struct BinaryPreparedSearchPackageOwned { + config: BinaryPreparedSearchConfig, + artifacts: Vec, +} + +#[derive(Serialize)] +struct BinaryPreparedSearchPackageRef<'a> { + config: BinaryPreparedSearchConfig, + artifacts: &'a [u8], +} + +#[derive(Clone, Debug, PartialEq)] +pub struct BindingPreparedSearchPackage { + pub config: BindingPreparedSearchConfig, + pub artifacts: Vec, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct CorePreparedSearchPackage { + pub config: PreparedSearchConfig, + pub artifacts: Vec, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct CorePreparedSearchPackageView<'a> { + pub config: PreparedSearchConfig, + pub artifacts: Cow<'a, [u8]>, +} + +#[derive(Deserialize, Serialize)] +struct BinaryPreparedSearchConfig { + regex_patterns: Vec, + custom_regex_patterns: Vec, + literal_patterns: Vec, + regex_options: Option, + custom_regex_options: Option, + literal_options: Option, + literal_patterns_from_deny_list_data: bool, + allowed_labels: Vec, + threshold: f64, + confidence_boost: bool, + slices: BindingPreparedSearchSlices, + regex_meta: Vec, + custom_regex_meta: Vec, + deny_list_data: Option, + false_positive_filters: Option, + gazetteer_data: Option, + country_data: Option, + hotword_data: Option, + trigger_data: Option, + legal_form_data: Option, + address_seed_data: Option, + zone_data: Option, + address_context_data: Option, + coreference_data: Option, + name_corpus_data: Option, + date_data: Option, + monetary_data: Option, +} + +#[derive(Deserialize, Serialize)] +struct BinaryTriggerData { + rules: Vec, + address_stop_keywords: Vec, + party_position_terms: Vec, + #[serde(default)] + post_nominals: Vec, + sentence_terminal_currency_terms: Vec, +} + +#[derive(Deserialize, Serialize)] +struct BinaryTriggerRule { + trigger: String, + label: String, + strategy: BinaryTriggerStrategy, + 
validations: Vec, + include_trigger: bool, +} + +#[derive(Deserialize, Serialize)] +enum BinaryTriggerStrategy { + ToNextComma { + stop_words: Vec, + max_length: Option, + }, + ToEndOfLine, + NWords { + count: u32, + }, + CompanyIdValue, + Address { + max_chars: Option, + }, + MatchPattern { + pattern: String, + flags: Option, + }, +} + +#[derive(Deserialize, Serialize)] +enum BinaryTriggerValidation { + StartsUppercase, + MinLength { + min: u32, + }, + MaxLength { + max: u32, + }, + NoDigits, + HasDigits, + MatchesPattern { + pattern: String, + flags: Option, + }, + ValidId { + validator: String, + }, +} + +pub fn prepared_search_package_to_bytes( + config: &BindingPreparedSearchConfig, + artifacts: &[u8], +) -> Result> { + let payload = prepared_search_package_payload_to_bytes(config, artifacts)?; + Ok(prepared_search_package_raw_payload_to_bytes( + PREPARED_SEARCH_PACKAGE_HEADER, + PREPARED_SEARCH_PACKAGE_VERSION, + &payload, + )) +} + +pub fn prepared_search_package_to_compressed_bytes( + config: &BindingPreparedSearchConfig, + artifacts: &[u8], +) -> Result> { + let payload = prepared_search_package_payload_to_bytes(config, artifacts)?; + prepared_search_package_compress_payload( + PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER, + PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION, + &payload, + ) +} + +pub fn prepared_search_core_package_to_bytes( + config: &PreparedSearchConfig, + artifacts: &[u8], +) -> Result> { + let payload = + prepared_search_core_package_payload_to_bytes(config, artifacts)?; + Ok(prepared_search_package_raw_payload_to_bytes( + PREPARED_SEARCH_CORE_PACKAGE_HEADER, + PREPARED_SEARCH_CORE_PACKAGE_VERSION, + &payload, + )) +} + +pub fn prepared_search_core_package_to_compressed_bytes( + config: &PreparedSearchConfig, + artifacts: &[u8], +) -> Result> { + let payload = + prepared_search_core_package_payload_to_bytes(config, artifacts)?; + prepared_search_package_compress_payload( + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER, + 
PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION, + &payload, + ) +} + +#[must_use] +pub fn prepared_search_package_has_core_payload(bytes: &[u8]) -> bool { + bytes + .get(..PREPARED_SEARCH_CORE_PACKAGE_HEADER.len()) + .is_some_and(|header| { + header == PREPARED_SEARCH_CORE_PACKAGE_HEADER + || header == PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER + }) +} + +pub fn prepared_search_package_digest(bytes: &[u8]) -> Result<[u8; 32]> { + Ok(prepared_search_package_parts(bytes)?.digest()) +} + +pub fn prepared_search_package_from_bytes( + bytes: &[u8], +) -> Result { + let parts = prepared_search_package_parts(bytes)?; + if parts.is_core() { + return Err(invalid_prepared_search_package( + "package does not contain a binding payload", + )); + } + let digest = parts.digest(); + let payload = parts.into_payload()?; + verify_prepared_search_package_digest(digest, payload.as_ref())?; + let (package, read) = bincode::serde::decode_from_slice::< + BinaryPreparedSearchPackageOwned, + _, + >(payload.as_ref(), package_bincode_config()) + .map_err(|error| invalid_prepared_search_package(error.to_string()))?; + if read != payload.as_ref().len() { + return Err(invalid_prepared_search_package("trailing payload data")); + } + Ok(BindingPreparedSearchPackage { + config: BindingPreparedSearchConfig::from(package.config), + artifacts: package.artifacts, + }) +} + +pub fn prepared_search_core_package_from_bytes( + bytes: &[u8], +) -> Result { + let package = prepared_search_core_package_view_from_bytes(bytes)?; + Ok(CorePreparedSearchPackage { + config: package.config, + artifacts: package.artifacts.into_owned(), + }) +} + +pub fn prepared_search_core_package_view_from_bytes( + bytes: &[u8], +) -> Result> { + let parts = prepared_search_package_parts(bytes)?; + if !parts.is_core() { + return Err(invalid_prepared_search_package( + "package does not contain a core payload", + )); + } + let digest = parts.digest(); + let payload = parts.into_payload()?; + 
verify_prepared_search_package_digest(digest, payload.as_ref())?; + core_package_view_from_payload(payload) +} + +impl From for BinaryPreparedSearchConfig { + fn from(config: BindingPreparedSearchConfig) -> Self { + Self { + regex_patterns: config.regex_patterns, + custom_regex_patterns: config.custom_regex_patterns, + literal_patterns: config.literal_patterns, + regex_options: config.regex_options, + custom_regex_options: config.custom_regex_options, + literal_options: config.literal_options, + literal_patterns_from_deny_list_data: config + .literal_patterns_from_deny_list_data, + allowed_labels: config.allowed_labels, + threshold: config.threshold, + confidence_boost: config.confidence_boost, + slices: config.slices, + regex_meta: config.regex_meta, + custom_regex_meta: config.custom_regex_meta, + deny_list_data: config.deny_list_data, + false_positive_filters: config.false_positive_filters, + gazetteer_data: config.gazetteer_data, + country_data: config.country_data, + hotword_data: config.hotword_data, + trigger_data: config.trigger_data.map(BinaryTriggerData::from), + legal_form_data: config.legal_form_data, + address_seed_data: config.address_seed_data, + zone_data: config.zone_data, + address_context_data: config.address_context_data, + coreference_data: config.coreference_data, + name_corpus_data: config.name_corpus_data, + date_data: config.date_data, + monetary_data: config.monetary_data, + } + } +} + +impl From for BindingPreparedSearchConfig { + fn from(config: BinaryPreparedSearchConfig) -> Self { + Self { + regex_patterns: config.regex_patterns, + custom_regex_patterns: config.custom_regex_patterns, + literal_patterns: config.literal_patterns, + regex_options: config.regex_options, + custom_regex_options: config.custom_regex_options, + literal_options: config.literal_options, + literal_patterns_from_deny_list_data: config + .literal_patterns_from_deny_list_data, + allowed_labels: config.allowed_labels, + threshold: config.threshold, + 
confidence_boost: config.confidence_boost, + slices: config.slices, + regex_meta: config.regex_meta, + custom_regex_meta: config.custom_regex_meta, + deny_list_data: config.deny_list_data, + false_positive_filters: config.false_positive_filters, + gazetteer_data: config.gazetteer_data, + country_data: config.country_data, + hotword_data: config.hotword_data, + trigger_data: config.trigger_data.map(BindingTriggerData::from), + legal_form_data: config.legal_form_data, + address_seed_data: config.address_seed_data, + zone_data: config.zone_data, + address_context_data: config.address_context_data, + coreference_data: config.coreference_data, + name_corpus_data: config.name_corpus_data, + date_data: config.date_data, + monetary_data: config.monetary_data, + } + } +} + +impl From for BinaryTriggerData { + fn from(data: BindingTriggerData) -> Self { + Self { + rules: data + .rules + .into_iter() + .map(BinaryTriggerRule::from) + .collect(), + address_stop_keywords: data.address_stop_keywords, + party_position_terms: data.party_position_terms, + post_nominals: data.post_nominals, + sentence_terminal_currency_terms: data.sentence_terminal_currency_terms, + } + } +} + +impl From for BindingTriggerData { + fn from(data: BinaryTriggerData) -> Self { + Self { + rules: data + .rules + .into_iter() + .map(BindingTriggerRule::from) + .collect(), + address_stop_keywords: data.address_stop_keywords, + party_position_terms: data.party_position_terms, + post_nominals: data.post_nominals, + sentence_terminal_currency_terms: data.sentence_terminal_currency_terms, + } + } +} + +impl From for BinaryTriggerRule { + fn from(rule: BindingTriggerRule) -> Self { + Self { + trigger: rule.trigger, + label: rule.label, + strategy: BinaryTriggerStrategy::from(rule.strategy), + validations: rule + .validations + .into_iter() + .map(BinaryTriggerValidation::from) + .collect(), + include_trigger: rule.include_trigger, + } + } +} + +impl From for BindingTriggerRule { + fn from(rule: 
BinaryTriggerRule) -> Self { + Self { + trigger: rule.trigger, + label: rule.label, + strategy: BindingTriggerStrategy::from(rule.strategy), + validations: rule + .validations + .into_iter() + .map(BindingTriggerValidation::from) + .collect(), + include_trigger: rule.include_trigger, + } + } +} + +impl From for BinaryTriggerStrategy { + fn from(strategy: BindingTriggerStrategy) -> Self { + match strategy { + BindingTriggerStrategy::ToNextComma { + stop_words, + max_length, + } => Self::ToNextComma { + stop_words, + max_length, + }, + BindingTriggerStrategy::ToEndOfLine => Self::ToEndOfLine, + BindingTriggerStrategy::NWords { count } => Self::NWords { count }, + BindingTriggerStrategy::CompanyIdValue => Self::CompanyIdValue, + BindingTriggerStrategy::Address { max_chars } => { + Self::Address { max_chars } + } + BindingTriggerStrategy::MatchPattern { pattern, flags } => { + Self::MatchPattern { pattern, flags } + } + } + } +} + +impl From for BindingTriggerStrategy { + fn from(strategy: BinaryTriggerStrategy) -> Self { + match strategy { + BinaryTriggerStrategy::ToNextComma { + stop_words, + max_length, + } => Self::ToNextComma { + stop_words, + max_length, + }, + BinaryTriggerStrategy::ToEndOfLine => Self::ToEndOfLine, + BinaryTriggerStrategy::NWords { count } => Self::NWords { count }, + BinaryTriggerStrategy::CompanyIdValue => Self::CompanyIdValue, + BinaryTriggerStrategy::Address { max_chars } => { + Self::Address { max_chars } + } + BinaryTriggerStrategy::MatchPattern { pattern, flags } => { + Self::MatchPattern { pattern, flags } + } + } + } +} + +impl From for BinaryTriggerValidation { + fn from(validation: BindingTriggerValidation) -> Self { + match validation { + BindingTriggerValidation::StartsUppercase => Self::StartsUppercase, + BindingTriggerValidation::MinLength { min } => Self::MinLength { min }, + BindingTriggerValidation::MaxLength { max } => Self::MaxLength { max }, + BindingTriggerValidation::NoDigits => Self::NoDigits, + 
BindingTriggerValidation::HasDigits => Self::HasDigits, + BindingTriggerValidation::MatchesPattern { pattern, flags } => { + Self::MatchesPattern { pattern, flags } + } + BindingTriggerValidation::ValidId { validator } => { + Self::ValidId { validator } + } + } + } +} + +impl From for BindingTriggerValidation { + fn from(validation: BinaryTriggerValidation) -> Self { + match validation { + BinaryTriggerValidation::StartsUppercase => Self::StartsUppercase, + BinaryTriggerValidation::MinLength { min } => Self::MinLength { min }, + BinaryTriggerValidation::MaxLength { max } => Self::MaxLength { max }, + BinaryTriggerValidation::NoDigits => Self::NoDigits, + BinaryTriggerValidation::HasDigits => Self::HasDigits, + BinaryTriggerValidation::MatchesPattern { pattern, flags } => { + Self::MatchesPattern { pattern, flags } + } + BinaryTriggerValidation::ValidId { validator } => { + Self::ValidId { validator } + } + } + } +} + +fn prepared_search_package_payload_to_bytes( + config: &BindingPreparedSearchConfig, + artifacts: &[u8], +) -> Result> { + bincode::serde::encode_to_vec( + BinaryPreparedSearchPackageRef { + config: BinaryPreparedSearchConfig::from(config.clone()), + artifacts, + }, + package_bincode_config(), + ) + .map_err(|error| invalid_prepared_search_package(error.to_string())) +} + +fn prepared_search_core_package_payload_to_bytes( + config: &PreparedSearchConfig, + artifacts: &[u8], +) -> Result> { + let mut config = config.clone(); + if core_literal_patterns_are_identity_mapped(&config) { + config.literal_patterns.clear(); + } + let config_bytes = + bincode::serde::encode_to_vec(config, package_bincode_config()) + .map_err(|error| invalid_prepared_search_package(error.to_string()))?; + let config_len = u64::try_from(config_bytes.len()).map_err(|_| { + invalid_prepared_search_package("core config length overflow") + })?; + let mut bytes = Vec::with_capacity( + std::mem::size_of::() + .saturating_add(config_bytes.len()) + .saturating_add(artifacts.len()), + ); 
+ bytes.extend_from_slice(&config_len.to_le_bytes()); + bytes.extend_from_slice(&config_bytes); + bytes.extend_from_slice(artifacts); + Ok(bytes) +} + +fn core_package_view_from_payload( + payload: Cow<'_, [u8]>, +) -> Result> { + let len_end = std::mem::size_of::(); + let len_bytes = payload.as_ref().get(..len_end).ok_or_else(|| { + invalid_prepared_search_package("truncated config length") + })?; + let len_array = <[u8; 8]>::try_from(len_bytes) + .map_err(|_| invalid_prepared_search_package("malformed config length"))?; + let config_len = usize::try_from(u64::from_le_bytes(len_array)) + .map_err(|_| invalid_prepared_search_package("config length overflow"))?; + let config_end = len_end + .checked_add(config_len) + .ok_or_else(|| invalid_prepared_search_package("config length overflow"))?; + let config_bytes = payload + .as_ref() + .get(len_end..config_end) + .ok_or_else(|| invalid_prepared_search_package("truncated config"))?; + let (config, read) = bincode::serde::decode_from_slice::< + PreparedSearchConfig, + _, + >(config_bytes, package_bincode_config()) + .map_err(|error| invalid_prepared_search_package(error.to_string()))?; + if read != config_bytes.len() { + return Err(invalid_prepared_search_package("trailing config data")); + } + + let artifacts = match payload { + Cow::Borrowed(bytes) => Cow::Borrowed( + bytes + .get(config_end..) + .ok_or_else(|| invalid_prepared_search_package("missing artifacts"))?, + ), + Cow::Owned(bytes) => Cow::Owned( + bytes + .get(config_end..) + .ok_or_else(|| invalid_prepared_search_package("missing artifacts"))? 
+ .to_vec(), + ), + }; + + Ok(CorePreparedSearchPackageView { config, artifacts }) +} + +fn core_literal_patterns_are_identity_mapped( + config: &PreparedSearchConfig, +) -> bool { + !config.literal_patterns.is_empty() + && config + .literal_patterns + .iter() + .all(|pattern| matches!(pattern, SearchPattern::Literal(_))) +} + +fn prepared_search_package_raw_payload_to_bytes( + header: [u8; 8], + version: u32, + payload: &[u8], +) -> Vec { + let digest = blake3::hash(payload); + let mut bytes = Vec::with_capacity(raw_package_header_len(payload)); + write_package_header(&mut bytes, header, version, digest.as_bytes()); + bytes.extend_from_slice(payload); + bytes +} + +fn prepared_search_package_compress_payload( + header: [u8; 8], + version: u32, + payload: &[u8], +) -> Result> { + let compressed = + zstd::bulk::compress(payload, PREPARED_SEARCH_PACKAGE_ZSTD_LEVEL) + .map_err(|error| invalid_prepared_search_package(error.to_string()))?; + let digest = blake3::hash(payload); + let mut bytes = Vec::with_capacity( + raw_package_header_len(&compressed) + .saturating_add(std::mem::size_of::()), + ); + write_package_header(&mut bytes, header, version, digest.as_bytes()); + let payload_len = u64::try_from(payload.len()) + .map_err(|_| invalid_prepared_search_package("payload length overflow"))?; + bytes.extend_from_slice(&payload_len.to_le_bytes()); + bytes.extend_from_slice(&compressed); + Ok(bytes) +} + +const fn raw_package_header_len(payload: &[u8]) -> usize { + PREPARED_SEARCH_PACKAGE_HEADER + .len() + .saturating_add(std::mem::size_of::()) + .saturating_add(PREPARED_SEARCH_PACKAGE_DIGEST_BYTES) + .saturating_add(payload.len()) +} + +fn write_package_header( + bytes: &mut Vec, + header: [u8; 8], + version: u32, + digest: &[u8; PREPARED_SEARCH_PACKAGE_DIGEST_BYTES], +) { + bytes.extend_from_slice(&header); + bytes.extend_from_slice(&version.to_le_bytes()); + bytes.extend_from_slice(digest); +} + +#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)] 
+pub struct BindingOperatorConfig { + pub operators: Option>, + #[serde(default, alias = "redactString")] + pub redact_string: Option, +} + +#[derive(Clone, Debug, Eq, PartialEq, Serialize)] +pub struct BindingRedactionEntry { + pub placeholder: String, + pub original: String, +} + +#[derive(Clone, Debug, Eq, PartialEq, Serialize)] +pub struct BindingOperatorEntry { + pub placeholder: String, + pub operator: String, +} + +#[derive(Clone, Debug, Eq, PartialEq, Serialize)] +pub struct BindingRedactionResult { + pub redacted_text: String, + pub redaction_map: Vec, + pub operator_map: Vec, + pub entity_count: usize, +} + +#[derive(Clone, Debug, PartialEq, Serialize)] +pub struct BindingPipelineEntity { + pub start: u32, + pub end: u32, + pub label: String, + pub text: String, + pub score: f64, + pub source: String, + pub source_detail: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize)] +pub struct BindingStaticRedactionResult { + pub resolved_entities: Vec, + pub redaction: BindingRedactionResult, +} + +#[derive(Clone, Debug, PartialEq, Serialize)] +pub struct BindingDiagnosticEvent { + pub stage: String, + pub kind: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub engine: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub pattern: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub source: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub source_detail: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub label: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub start: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub end: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub text: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub score: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub span_valid: Option, + #[serde(skip_serializing_if = 
"Option::is_none")] + pub elapsed_us: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub input_bytes: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub reason: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize)] +pub struct BindingStaticRedactionDiagnostics { + pub events: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize)] +pub struct BindingStaticRedactionDiagnosticResult { + pub result: BindingStaticRedactionResult, + pub diagnostics: BindingStaticRedactionDiagnostics, +} + +pub fn prepared_search_config_from_binding( + config: BindingPreparedSearchConfig, +) -> Result { + let deny_list_data = config.deny_list_data; + let literal_patterns = literal_patterns_from_binding( + config.literal_patterns, + config.literal_patterns_from_deny_list_data, + deny_list_data.as_ref(), + )?; + let legal_form_data = config.legal_form_data.map(|data| LegalFormData { + suffixes: data.suffixes, + normalized_boundary_suffixes: data.normalized_boundary_suffixes, + normalized_in_name_words: data.normalized_in_name_words, + normalized_suffix_words: data.normalized_suffix_words, + role_heads: data.role_heads, + sentence_verb_indicators: data.sentence_verb_indicators, + clause_noun_heads: data.clause_noun_heads, + connector_prose_heads: data.connector_prose_heads, + structural_single_cap_prefixes: data.structural_single_cap_prefixes, + leading_clause_phrases: data.leading_clause_phrases, + leading_clause_direct_prefixes: data.leading_clause_direct_prefixes, + connector_words: data.connector_words, + and_connector_words: data.and_connector_words, + in_name_prepositions: data.in_name_prepositions, + company_suffix_words: data.company_suffix_words, + comma_gated_direct_prefixes: data.comma_gated_direct_prefixes, + }); + let legal_form_suffixes = legal_form_data + .as_ref() + .map_or_else(Vec::new, |data| data.suffixes.clone()); + Ok(PreparedSearchConfig { + regex_patterns: search_patterns_from_binding(config.regex_patterns)?, + 
custom_regex_patterns: search_patterns_from_binding( + config.custom_regex_patterns, + )?, + literal_patterns, + regex_options: search_options_from_binding(config.regex_options), + custom_regex_options: search_options_from_binding( + config.custom_regex_options, + ), + literal_options: search_options_from_binding(config.literal_options), + allowed_labels: config.allowed_labels, + threshold: config.threshold, + confidence_boost: config.confidence_boost, + slices: slices_from_binding(&config.slices), + regex_meta: regex_meta_from_binding(config.regex_meta)?, + custom_regex_meta: regex_meta_from_binding(config.custom_regex_meta)?, + deny_list_data: deny_list_data + .map(deny_list_data_from_binding) + .transpose()?, + false_positive_filters: config + .false_positive_filters + .map(deny_list_filters_from_binding), + gazetteer_data: config.gazetteer_data.map(|data| GazetteerMatchData { + labels: data.labels, + is_fuzzy: data.is_fuzzy, + }), + country_data: config.country_data.map(|data| CountryMatchData { + labels: data.labels, + }), + hotword_data: config.hotword_data.map(hotword_data_from_binding), + trigger_data: config + .trigger_data + .map(|data| trigger_data_from_binding(data, legal_form_suffixes)), + legal_form_data, + address_seed_data: config.address_seed_data.map(|data| AddressSeedData { + boundary_words: data.boundary_words, + br_cep_cue_words: data.br_cep_cue_words, + unit_abbreviations: data.unit_abbreviations, + }), + zone_data: config.zone_data.map(zone_data_from_binding), + address_context_data: config.address_context_data.map(|data| { + AddressContextData { + address_prepositions: data.address_prepositions, + temporal_prepositions: data.temporal_prepositions, + street_abbreviations: data.street_abbreviations, + bare_house_stopwords: data.bare_house_stopwords, + } + }), + coreference_data: config + .coreference_data + .map(coreference_data_from_binding), + name_corpus_data: config + .name_corpus_data + .map(name_corpus_data_from_binding), + date_data: 
config.date_data.map(|data| DateData { + month_names_by_language: data.month_names_by_language, + year_words_by_language: data.year_words_by_language, + }), + monetary_data: config.monetary_data.map(monetary_data_from_binding), + }) +} + +enum PreparedSearchPackageParts<'a> { + Raw { + core: bool, + digest: [u8; 32], + payload: &'a [u8], + }, + Compressed { + core: bool, + digest: [u8; 32], + uncompressed_len: usize, + payload: &'a [u8], + }, +} + +impl<'a> PreparedSearchPackageParts<'a> { + const fn digest(&self) -> [u8; 32] { + match self { + Self::Raw { digest, .. } | Self::Compressed { digest, .. } => *digest, + } + } + + const fn is_core(&self) -> bool { + match self { + Self::Raw { core, .. } | Self::Compressed { core, .. } => *core, + } + } + + fn into_payload(self) -> Result> { + match self { + Self::Raw { payload, .. } => Ok(Cow::Borrowed(payload)), + Self::Compressed { + uncompressed_len, + payload, + .. + } => { + if uncompressed_len > MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES { + return Err(invalid_prepared_search_package( + "uncompressed payload length exceeds limit", + )); + } + zstd::bulk::decompress(payload, uncompressed_len) + .map(Cow::Owned) + .map_err(|error| invalid_prepared_search_package(error.to_string())) + } + } + } +} + +struct RawPackageHeader<'a> { + digest: [u8; 32], + payload: &'a [u8], +} + +fn prepared_search_package_parts( + bytes: &[u8], +) -> Result> { + let header = bytes + .get(..PREPARED_SEARCH_PACKAGE_HEADER.len()) + .ok_or_else(|| invalid_prepared_search_package("truncated header"))?; + if header == PREPARED_SEARCH_PACKAGE_HEADER { + let raw = raw_package_header( + bytes, + PREPARED_SEARCH_PACKAGE_VERSION, + PREPARED_SEARCH_PACKAGE_HEADER.len(), + )?; + return Ok(PreparedSearchPackageParts::Raw { + core: false, + digest: raw.digest, + payload: raw.payload, + }); + } + if header == PREPARED_SEARCH_CORE_PACKAGE_HEADER { + let raw = raw_package_header( + bytes, + PREPARED_SEARCH_CORE_PACKAGE_VERSION, + 
PREPARED_SEARCH_CORE_PACKAGE_HEADER.len(), + )?; + return Ok(PreparedSearchPackageParts::Raw { + core: true, + digest: raw.digest, + payload: raw.payload, + }); + } + if header == PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER { + let raw = raw_package_header( + bytes, + PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION, + PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER.len(), + )?; + let len_end = std::mem::size_of::(); + let len_bytes = raw + .payload + .get(..len_end) + .ok_or_else(|| invalid_prepared_search_package("truncated length"))?; + let len_array = <[u8; 8]>::try_from(len_bytes) + .map_err(|_| invalid_prepared_search_package("malformed length"))?; + let uncompressed_len = usize::try_from(u64::from_le_bytes(len_array)) + .map_err(|_| invalid_prepared_search_package("length overflow"))?; + let payload = raw + .payload + .get(len_end..) + .ok_or_else(|| invalid_prepared_search_package("missing payload"))?; + return Ok(PreparedSearchPackageParts::Compressed { + core: false, + digest: raw.digest, + uncompressed_len, + payload, + }); + } + if header == PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER { + let raw = raw_package_header( + bytes, + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION, + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER.len(), + )?; + let len_end = std::mem::size_of::(); + let len_bytes = raw + .payload + .get(..len_end) + .ok_or_else(|| invalid_prepared_search_package("truncated length"))?; + let len_array = <[u8; 8]>::try_from(len_bytes) + .map_err(|_| invalid_prepared_search_package("malformed length"))?; + let uncompressed_len = usize::try_from(u64::from_le_bytes(len_array)) + .map_err(|_| invalid_prepared_search_package("length overflow"))?; + let payload = raw + .payload + .get(len_end..) 
+ .ok_or_else(|| invalid_prepared_search_package("missing payload"))?; + return Ok(PreparedSearchPackageParts::Compressed { + core: true, + digest: raw.digest, + uncompressed_len, + payload, + }); + } + Err(invalid_prepared_search_package("unexpected header")) +} + +fn raw_package_header( + bytes: &[u8], + expected_version: u32, + header_len: usize, +) -> Result> { + let version_start = header_len; + let version_end = version_start.saturating_add(std::mem::size_of::()); + let version_bytes = bytes + .get(version_start..version_end) + .ok_or_else(|| invalid_prepared_search_package("truncated version"))?; + let version_array = <[u8; 4]>::try_from(version_bytes) + .map_err(|_| invalid_prepared_search_package("malformed version"))?; + let version = u32::from_le_bytes(version_array); + if version != expected_version { + return Err(invalid_prepared_search_package("unsupported version")); + } + let digest_end = + version_end.saturating_add(PREPARED_SEARCH_PACKAGE_DIGEST_BYTES); + let digest_bytes = bytes + .get(version_end..digest_end) + .ok_or_else(|| invalid_prepared_search_package("truncated digest"))?; + let digest = + <[u8; PREPARED_SEARCH_PACKAGE_DIGEST_BYTES]>::try_from(digest_bytes) + .map_err(|_| invalid_prepared_search_package("malformed digest"))?; + let payload = bytes + .get(digest_end..) 
+ .ok_or_else(|| invalid_prepared_search_package("missing payload"))?; + Ok(RawPackageHeader { digest, payload }) +} + +fn verify_prepared_search_package_digest( + expected: [u8; 32], + payload: &[u8], +) -> Result<()> { + let actual = blake3::hash(payload); + if actual.as_bytes() != &expected { + return Err(invalid_prepared_search_package("digest mismatch")); + } + Ok(()) +} + +fn package_bincode_config() -> impl bincode::config::Config { + bincode::config::standard() + .with_little_endian() + .with_variable_int_encoding() +} + +fn invalid_prepared_search_package(reason: impl Into) -> ContractError { + ContractError::InvalidPreparedSearchPackage { + reason: reason.into(), + } +} + +fn deny_list_data_from_binding( + data: BindingDenyListMatchData, +) -> Result { + let pattern_count = data.originals.len(); + Ok(DenyListMatchData { + labels: string_groups_from_binding( + data.labels, + data.label_indices, + data.label_table.clone(), + pattern_count, + "deny_list.label_indices", + )?, + custom_labels: string_groups_from_binding( + data.custom_labels, + data.custom_label_indices, + data.label_table, + pattern_count, + "deny_list.custom_label_indices", + )?, + originals: data.originals, + sources: string_groups_from_binding( + data.sources, + data.source_indices, + data.source_table, + pattern_count, + "deny_list.source_indices", + )?, + filters: data.filters.map(deny_list_filters_from_binding), + }) +} + +fn string_groups_from_binding( + groups: Vec>, + indices: Vec>, + table: Vec, + pattern_count: usize, + field: &'static str, +) -> Result { + if !indices.is_empty() { + validate_compact_string_indices(&indices, &table, field)?; + return StringGroups::from_table_indices(table, indices, field).map_err( + |error| ContractError::InvalidCompactStringGroups { + field, + reason: error.to_string(), + }, + ); + } + + if !groups.is_empty() { + return Ok(StringGroups::from_groups(groups)); + } + + Ok(StringGroups::empty_groups(pattern_count)) +} + +fn 
validate_compact_string_indices( + groups: &[Vec], + table: &[String], + field: &'static str, +) -> Result<()> { + for group in groups { + for &index in group { + let Ok(index_usize) = usize::try_from(index) else { + return Err(ContractError::CompactStringIndexOutOfRange { + field, + index, + }); + }; + if index_usize >= table.len() { + return Err(ContractError::CompactStringIndexOutOfRange { + field, + index, + }); + } + } + } + + Ok(()) +} + +fn monetary_data_from_binding(data: BindingMonetaryData) -> MonetaryData { + MonetaryData { + currencies: CurrencyData { + codes: data.currencies.codes, + symbols: data.currencies.symbols, + local_names: data.currencies.local_names, + }, + amount_words: AmountWordsData { + written_amount_patterns: data + .amount_words + .written_amount_patterns + .into_iter() + .map(|entry| WrittenAmountPatternData { + keywords: entry.keywords, + }) + .collect(), + magnitude_suffixes: data + .amount_words + .magnitude_suffixes + .into_iter() + .map(|entry| MagnitudeSuffixData { + words: entry.words, + abbreviations_case_insensitive: entry.abbreviations_case_insensitive, + abbreviations_case_sensitive: entry.abbreviations_case_sensitive, + }) + .collect(), + share_quantity_terms: data + .amount_words + .share_quantity_terms + .into_iter() + .map(|entry| ShareQuantityTermData { + modifiers: entry.modifiers, + nouns: entry.nouns, + }) + .collect(), + }, + } +} + +fn hotword_data_from_binding(data: BindingHotwordRuleData) -> HotwordRuleData { + HotwordRuleData { + rules: data + .rules + .into_iter() + .map(|rule| HotwordRule { + hotwords: rule.hotwords, + target_labels: rule.target_labels, + score_adjustment: rule.score_adjustment, + reclassify_to: rule.reclassify_to, + proximity_before: rule.proximity_before, + proximity_after: rule.proximity_after, + }) + .collect(), + pattern_rule_indices: data.pattern_rule_indices, + } +} + +fn coreference_data_from_binding( + data: BindingCoreferenceData, +) -> CoreferenceData { + CoreferenceData { + 
definition_patterns: data + .definition_patterns + .into_iter() + .map(|pattern| CoreferencePatternData { + pattern: pattern.pattern, + flags: pattern.flags, + }) + .collect(), + role_stop_terms: data.role_stop_terms, + legal_form_aliases: data.legal_form_aliases, + organization_suffixes: data.organization_suffixes, + organization_determiners: data.organization_determiners, + } +} + +fn name_corpus_data_from_binding( + data: BindingNameCorpusData, +) -> NameCorpusData { + NameCorpusData { + first_names: data.first_names, + surnames: data.surnames, + title_tokens: data.title_tokens, + title_abbreviations: data.title_abbreviations, + excluded_words: data.excluded_words, + common_words: data.common_words, + non_western_names: data.non_western_names, + excluded_all_caps: data.excluded_all_caps, + ja_suffixes: data.ja_suffixes, + arabic_connectors: data.arabic_connectors, + relation_connectors: data.relation_connectors, + hyphenated_prefixes: data.hyphenated_prefixes, + cjk_non_person_terms: data.cjk_non_person_terms, + cjk_surname_starters: data.cjk_surname_starters, + organization_terms: data.organization_terms, + } +} + +fn zone_data_from_binding(data: BindingZoneData) -> ZoneData { + ZoneData { + section_heading_patterns: data + .section_heading_patterns + .into_iter() + .map(|pattern| ZonePatternData { + pattern: pattern.pattern, + flags: pattern.flags, + }) + .collect(), + signing_clauses: data + .signing_clauses + .into_iter() + .map(|clause| ZoneSigningClauseData { + prefix: clause.prefix, + suffix: clause.suffix, + prepositions: clause.prepositions, + }) + .collect(), + } +} + +pub fn operator_config_from_binding( + config: Option, +) -> Result { + let Some(config) = config else { + return Ok(OperatorConfig::default()); + }; + + let mut operators = BTreeMap::new(); + for (label, value) in config.operators.unwrap_or_default() { + operators.insert(label, operator_type_from_binding(&value)?); + } + + Ok(OperatorConfig { + operators, + redact_string: config + 
.redact_string + .unwrap_or_else(|| String::from("[REDACTED]")), + }) +} + +#[must_use] +pub fn static_redaction_result_to_binding( + result: StaticRedactionResult, +) -> BindingStaticRedactionResult { + BindingStaticRedactionResult { + resolved_entities: result + .resolved_entities + .into_iter() + .map(|entity| BindingPipelineEntity { + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: detection_source_name(entity.source), + source_detail: entity.source_detail.map(source_detail_name), + }) + .collect(), + redaction: BindingRedactionResult { + redacted_text: result.redaction.redacted_text, + redaction_map: result + .redaction + .redaction_map + .into_iter() + .map(|entry| BindingRedactionEntry { + placeholder: entry.placeholder, + original: entry.original, + }) + .collect(), + operator_map: result + .redaction + .operator_map + .into_iter() + .map(|entry| BindingOperatorEntry { + placeholder: entry.placeholder, + operator: operator_name(entry.operator), + }) + .collect(), + entity_count: result.redaction.entity_count, + }, + } +} + +pub fn static_redaction_result_to_utf16_binding( + result: StaticRedactionResult, + full_text: &str, +) -> Result { + let offsets = Utf16OffsetMap::new(full_text)?; + let mut result = static_redaction_result_to_binding(result); + convert_pipeline_entity_offsets(&mut result.resolved_entities, &offsets)?; + Ok(result) +} + +#[must_use] +pub fn static_redaction_diagnostic_result_to_binding( + result: StaticRedactionDiagnosticResult, +) -> BindingStaticRedactionDiagnosticResult { + BindingStaticRedactionDiagnosticResult { + result: static_redaction_result_to_binding(result.result), + diagnostics: static_redaction_diagnostics_to_binding(result.diagnostics), + } +} + +pub fn static_redaction_diagnostic_result_to_utf16_binding( + result: StaticRedactionDiagnosticResult, + full_text: &str, +) -> Result { + let offsets = Utf16OffsetMap::new(full_text)?; + let mut result = 
static_redaction_diagnostic_result_to_binding(result); + convert_pipeline_entity_offsets( + &mut result.result.resolved_entities, + &offsets, + )?; + convert_diagnostic_offsets(&mut result.diagnostics.events, &offsets)?; + Ok(result) +} + +#[must_use] +pub fn static_redaction_diagnostics_to_binding( + diagnostics: StaticRedactionDiagnostics, +) -> BindingStaticRedactionDiagnostics { + BindingStaticRedactionDiagnostics { + events: diagnostics + .events + .into_iter() + .map(diagnostic_event_to_binding) + .collect(), + } +} + +pub fn static_redaction_diagnostics_to_utf16_binding( + diagnostics: StaticRedactionDiagnostics, + full_text: &str, +) -> Result { + let offsets = Utf16OffsetMap::new(full_text)?; + let mut diagnostics = static_redaction_diagnostics_to_binding(diagnostics); + convert_diagnostic_offsets(&mut diagnostics.events, &offsets)?; + Ok(diagnostics) +} + +fn diagnostic_event_to_binding( + event: DiagnosticEvent, +) -> BindingDiagnosticEvent { + BindingDiagnosticEvent { + stage: diagnostic_stage_name(event.stage), + kind: diagnostic_event_kind_name(event.kind), + count: event.count, + engine: event.engine.map(search_engine_name), + pattern: event.pattern, + source: event.source.map(detection_source_name), + source_detail: event.source_detail.map(source_detail_name), + label: event.label, + start: event.start, + end: event.end, + text: event.text, + score: event.score, + span_valid: event.span_valid, + elapsed_us: event.elapsed_us, + input_bytes: event.input_bytes, + reason: event.reason, + } +} + +fn convert_pipeline_entity_offsets( + entities: &mut [BindingPipelineEntity], + offsets: &Utf16OffsetMap, +) -> Result<()> { + for entity in entities { + entity.start = offsets.convert(entity.start)?; + entity.end = offsets.convert(entity.end)?; + } + Ok(()) +} + +fn convert_diagnostic_offsets( + events: &mut [BindingDiagnosticEvent], + offsets: &Utf16OffsetMap, +) -> Result<()> { + for event in events { + if let Some(start) = event.start { + event.start = 
Some(offsets.convert(start)?); + } + if let Some(end) = event.end { + event.end = Some(offsets.convert(end)?); + } + } + Ok(()) +} + +struct Utf16OffsetMap { + boundaries: Vec<(u32, u32)>, +} + +impl Utf16OffsetMap { + fn new(text: &str) -> Result { + let mut boundaries = Vec::new(); + let mut utf16_offset = 0_u32; + boundaries.push((0, 0)); + + for (byte_start, ch) in text.char_indices() { + utf16_offset = utf16_offset + .checked_add(char_utf16_width(ch)) + .ok_or_else(|| ContractError::InvalidPreparedSearchPackage { + reason: String::from("UTF-16 offset exceeds u32 range"), + })?; + let byte_end = byte_start.saturating_add(ch.len_utf8()); + boundaries.push((u32_from_usize(byte_end)?, utf16_offset)); + } + + Ok(Self { boundaries }) + } + + fn convert(&self, offset: u32) -> Result { + self + .try_convert(offset) + .ok_or(ContractError::InvalidBindingOffset { offset }) + } + + fn try_convert(&self, offset: u32) -> Option { + let index = self + .boundaries + .binary_search_by_key(&offset, |(byte_offset, _)| *byte_offset) + .ok()?; + self + .boundaries + .get(index) + .map(|(_, utf16_offset)| *utf16_offset) + } +} + +const fn char_utf16_width(ch: char) -> u32 { + if ch.len_utf16() == 1 { 1 } else { 2 } +} + +fn u32_from_usize(value: usize) -> Result { + u32::try_from(value).map_err(|_| { + ContractError::InvalidPreparedSearchPackage { + reason: format!("Offset exceeds u32 range: {value}"), + } + }) +} + +fn deny_list_filters_from_binding( + filters: BindingDenyListFilterData, +) -> DenyListFilterData { + DenyListFilterData { + stopwords: lower_set(filters.stopwords), + allow_list: lower_set(filters.allow_list), + person_stopwords: lower_set(filters.person_stopwords), + person_trailing_nouns: lower_set(filters.person_trailing_nouns), + address_stopwords: lower_set(filters.address_stopwords), + address_jurisdiction_prefixes: lower_set( + filters.address_jurisdiction_prefixes, + ), + street_types: lower_set(filters.street_types), + address_component_terms: 
lower_set(filters.address_component_terms), + ambiguous_street_type_terms: lower_set(filters.ambiguous_street_type_terms), + first_names: lower_set(filters.first_names), + generic_roles: lower_set(filters.generic_roles), + number_abbrev_prefixes: lower_set(filters.number_abbrev_prefixes), + sentence_starters: lower_set(filters.sentence_starters), + trailing_address_word_exclusions: lower_set( + filters.trailing_address_word_exclusions, + ), + document_heading_words: lower_set(filters.document_heading_words), + document_heading_ordinal_markers: lower_set( + filters.document_heading_ordinal_markers, + ), + defined_term_cues: lower_set(filters.defined_term_cues), + signing_place_guards: filters + .signing_place_guards + .into_iter() + .map(|guard| SigningPlaceGuardData { + prefix_phrases: lower_set(guard.prefix_phrases), + suffix_phrases: lower_set(guard.suffix_phrases), + }) + .collect(), + } +} + +fn trigger_data_from_binding( + data: BindingTriggerData, + legal_form_suffixes: Vec, +) -> TriggerData { + TriggerData { + rules: data + .rules + .into_iter() + .map(trigger_rule_from_binding) + .collect(), + address_stop_keywords: data.address_stop_keywords, + party_position_terms: data.party_position_terms, + legal_form_suffixes, + post_nominals: data.post_nominals, + sentence_terminal_currency_terms: data.sentence_terminal_currency_terms, + } +} + +fn trigger_rule_from_binding(rule: BindingTriggerRule) -> TriggerRule { + TriggerRule { + trigger: rule.trigger, + label: rule.label, + strategy: trigger_strategy_from_binding(rule.strategy), + validations: rule + .validations + .into_iter() + .map(trigger_validation_from_binding) + .collect(), + include_trigger: rule.include_trigger, + } +} + +fn trigger_strategy_from_binding( + strategy: BindingTriggerStrategy, +) -> TriggerStrategy { + match strategy { + BindingTriggerStrategy::ToNextComma { + stop_words, + max_length, + } => TriggerStrategy::ToNextComma { + stop_words, + max_length, + }, + 
BindingTriggerStrategy::ToEndOfLine => TriggerStrategy::ToEndOfLine, + BindingTriggerStrategy::NWords { count } => { + TriggerStrategy::NWords { count } + } + BindingTriggerStrategy::CompanyIdValue => TriggerStrategy::CompanyIdValue, + BindingTriggerStrategy::Address { max_chars } => { + TriggerStrategy::Address { max_chars } + } + BindingTriggerStrategy::MatchPattern { pattern, flags } => { + TriggerStrategy::MatchPattern { pattern, flags } + } + } +} + +fn trigger_validation_from_binding( + validation: BindingTriggerValidation, +) -> TriggerValidation { + match validation { + BindingTriggerValidation::StartsUppercase => { + TriggerValidation::StartsUppercase + } + BindingTriggerValidation::MinLength { min } => { + TriggerValidation::MinLength(min) + } + BindingTriggerValidation::MaxLength { max } => { + TriggerValidation::MaxLength(max) + } + BindingTriggerValidation::NoDigits => TriggerValidation::NoDigits, + BindingTriggerValidation::HasDigits => TriggerValidation::HasDigits, + BindingTriggerValidation::MatchesPattern { pattern, flags } => { + TriggerValidation::MatchesPattern { pattern, flags } + } + BindingTriggerValidation::ValidId { validator } => { + TriggerValidation::ValidId { validator } + } + } +} + +fn lower_set(values: Vec) -> BTreeSet { + values + .into_iter() + .map(|value| value.to_lowercase()) + .collect() +} + +fn search_patterns_from_binding( + patterns: Vec, +) -> Result> { + patterns + .into_iter() + .map(search_pattern_from_binding) + .collect() +} + +fn literal_patterns_from_binding( + patterns: Vec, + from_deny_list_data: bool, + deny_list_data: Option<&BindingDenyListMatchData>, +) -> Result> { + let mut literal_patterns = search_patterns_from_binding(patterns)?; + if !from_deny_list_data { + return Ok(literal_patterns); + } + + let Some(data) = deny_list_data else { + return Err(ContractError::MissingDenyListDataForLiteralPatterns); + }; + let mut from_data = Vec::with_capacity( + 
data.originals.len().saturating_add(literal_patterns.len()), + ); + from_data.extend(data.originals.iter().cloned().map(SearchPattern::Literal)); + from_data.append(&mut literal_patterns); + Ok(from_data) +} + +fn search_pattern_from_binding( + pattern: BindingSearchPattern, +) -> Result { + match pattern.kind.as_str() { + "literal" => Ok(SearchPattern::Literal(pattern.pattern)), + "literal-with-options" => Ok(SearchPattern::LiteralWithOptions { + pattern: pattern.pattern, + case_insensitive: pattern.case_insensitive, + whole_words: pattern.whole_words, + }), + "regex" => { + if pattern.lazy.is_some() + || pattern.prefilter_any.is_some() + || pattern.prefilter_case_insensitive.is_some() + || pattern.prefilter_regex.is_some() + { + return Ok(SearchPattern::RegexWithOptions { + pattern: pattern.pattern, + lazy: pattern.lazy.unwrap_or(false), + prefilter_any: pattern.prefilter_any.unwrap_or_default(), + prefilter_case_insensitive: pattern.prefilter_case_insensitive, + prefilter_regex: pattern.prefilter_regex, + }); + } + Ok(SearchPattern::Regex(pattern.pattern)) + } + "fuzzy" => Ok(SearchPattern::Fuzzy { + pattern: pattern.pattern, + distance: pattern + .distance + .map(|distance| { + u8::try_from(distance) + .map_err(|_| ContractError::FuzzyDistanceOutOfRange { distance }) + }) + .transpose()?, + }), + _ => { + Err(ContractError::UnsupportedSearchPatternKind { kind: pattern.kind }) + } + } +} + +fn search_options_from_binding( + options: Option, +) -> SearchOptions { + let Some(options) = options else { + return SearchOptions::default(); + }; + + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: options.literal_case_insensitive.unwrap_or(false), + whole_words: options.literal_whole_words.unwrap_or(false), + }, + regex: RegexSearchOptions { + whole_words: options.regex_whole_words.unwrap_or(false), + overlap_all: options.regex_overlap_all.unwrap_or(false), + }, + fuzzy: FuzzySearchOptions { + case_insensitive: 
options.fuzzy_case_insensitive.unwrap_or(false), + whole_words: options.fuzzy_whole_words.unwrap_or(true), + normalize_diacritics: options.fuzzy_normalize_diacritics.unwrap_or(false), + }, + } +} + +fn slices_from_binding( + slices: &BindingPreparedSearchSlices, +) -> PreparedSearchSlices { + PreparedSearchSlices { + regex: slice_from_binding(slices.regex), + custom_regex: slice_from_binding(slices.custom_regex), + legal_forms: slice_from_binding(slices.legal_forms), + triggers: slice_from_binding(slices.triggers), + deny_list: slice_from_binding(slices.deny_list), + street_types: slice_from_binding(slices.street_types), + gazetteer: slice_from_binding(slices.gazetteer), + countries: slice_from_binding(slices.countries), + hotwords: slice_from_binding(slices.hotwords), + } +} + +fn slice_from_binding(slice: Option) -> PatternSlice { + slice.map_or_else(PatternSlice::default, |slice| PatternSlice { + start: slice.start, + end: slice.end, + }) +} + +fn regex_meta_from_binding( + meta: Vec, +) -> Result> { + meta + .into_iter() + .map(|entry| { + Ok(RegexMatchMeta { + label: entry.label, + score: entry.score, + source_detail: entry + .source_detail + .map(|value| source_detail_from_binding(&value)) + .transpose()?, + requires_validation: entry.requires_validation.unwrap_or(false), + validator_id: entry.validator_id, + validator_input: entry.validator_input, + min_byte_length: entry.min_byte_length, + }) + }) + .collect() +} + +fn source_detail_from_binding(value: &str) -> Result { + match value { + "custom-deny-list" => Ok(SourceDetail::CustomDenyList), + "custom-regex" => Ok(SourceDetail::CustomRegex), + "gazetteer-extension" => Ok(SourceDetail::GazetteerExtension), + "address-context" => Ok(SourceDetail::AddressContext), + _ => Err(ContractError::UnsupportedSourceDetail { + value: value.to_owned(), + }), + } +} + +fn operator_type_from_binding(value: &str) -> Result { + match value { + "replace" => Ok(OperatorType::Replace), + "redact" => Ok(OperatorType::Redact), + 
_ => Err(ContractError::UnsupportedOperator { + value: value.to_owned(), + }), + } +} + +fn detection_source_name(source: DetectionSource) -> String { + match source { + DetectionSource::Trigger => "trigger", + DetectionSource::Regex => "regex", + DetectionSource::DenyList => "deny-list", + DetectionSource::LegalForm => "legal-form", + DetectionSource::Gazetteer => "gazetteer", + DetectionSource::Country => "country", + DetectionSource::Ner => "ner", + DetectionSource::Coreference => "coreference", + } + .to_owned() +} + +fn source_detail_name(detail: SourceDetail) -> String { + match detail { + SourceDetail::CustomDenyList => "custom-deny-list", + SourceDetail::CustomRegex => "custom-regex", + SourceDetail::GazetteerExtension => "gazetteer-extension", + SourceDetail::AddressContext => "address-context", + } + .to_owned() +} + +fn search_engine_name(engine: SearchEngine) -> String { + match engine { + SearchEngine::Literal => "literal", + SearchEngine::Regex => "regex", + SearchEngine::Fuzzy => "fuzzy", + SearchEngine::Text => "text-search", + } + .to_owned() +} + +fn diagnostic_stage_name(stage: DiagnosticStage) -> String { + match stage { + DiagnosticStage::PrepareCacheHit => "prepare.cache.hit", + DiagnosticStage::PrepareCacheMiss => "prepare.cache.miss", + DiagnosticStage::PrepareBindingParse => "prepare.binding.parse", + DiagnosticStage::PreparePackageDecode => "prepare.package.decode", + DiagnosticStage::PrepareBindingConvert => "prepare.binding.convert", + DiagnosticStage::PrepareArtifactsDecode => "prepare.artifacts.decode", + DiagnosticStage::PrepareTotal => "prepare.total", + DiagnosticStage::PrepareRegex => "prepare.regex", + DiagnosticStage::PrepareCustomRegex => "prepare.custom-regex", + DiagnosticStage::PrepareAnchored => "prepare.anchored", + DiagnosticStage::PrepareLegalFormSearch => "prepare.legal-form-search", + DiagnosticStage::PrepareTriggerSearch => "prepare.trigger-search", + DiagnosticStage::PrepareLiteral => "prepare.literal", + 
DiagnosticStage::Normalize => "normalize", + DiagnosticStage::FindMatches => "find-matches", + DiagnosticStage::FindRegex => "find.regex", + DiagnosticStage::FindCustomRegex => "find.custom-regex", + DiagnosticStage::FindLiteral => "find.literal", + DiagnosticStage::SearchRegex => "search.regex", + DiagnosticStage::SearchCustomRegex => "search.custom-regex", + DiagnosticStage::SearchLegalForm => "search.legal-form", + DiagnosticStage::SearchTrigger => "search.trigger", + DiagnosticStage::SearchLiteral => "search.literal", + DiagnosticStage::EntityRegex => "entity.regex", + DiagnosticStage::EntityCustomRegex => "entity.custom-regex", + DiagnosticStage::EntityAnchored => "entity.anchored", + DiagnosticStage::EntityDenyList => "entity.deny-list", + DiagnosticStage::EntityGazetteer => "entity.gazetteer", + DiagnosticStage::EntityCountry => "entity.country", + DiagnosticStage::EntityTrigger => "entity.trigger", + DiagnosticStage::EntitySignature => "entity.signature", + DiagnosticStage::EntityLegalForm => "entity.legal-form", + DiagnosticStage::EntityAddressSeed => "entity.address-seed", + DiagnosticStage::EntityNameCorpus => "entity.name-corpus", + DiagnosticStage::EntityZoneAdjustment => "entity.zone-adjustment", + DiagnosticStage::EntityAddressContext => "entity.address-context", + DiagnosticStage::EntityCoreference => "entity.coreference", + DiagnosticStage::Merge => "resolution.merge", + DiagnosticStage::Boundary => "resolution.boundary", + DiagnosticStage::Sanitize => "resolution.sanitize", + DiagnosticStage::Redaction => "redaction", + } + .to_owned() +} + +fn diagnostic_event_kind_name(kind: DiagnosticEventKind) -> String { + match kind { + DiagnosticEventKind::StageSummary => "stage-summary", + DiagnosticEventKind::SearchMatch => "search-match", + DiagnosticEventKind::Entity => "entity", + DiagnosticEventKind::Rejection => "rejection", + } + .to_owned() +} + +fn operator_name(operator: OperatorType) -> String { + match operator { + OperatorType::Replace => 
"replace", + OperatorType::Redact => "redact", + } + .to_owned() +} + +#[cfg(test)] +mod tests { + #![allow(clippy::unwrap_used)] + + use super::{ + BindingOperatorConfig, BindingPreparedSearchConfig, BindingSearchOptions, + BindingSearchPattern, ContractError, + MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES, + PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER, + PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION, + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER, + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION, + PREPARED_SEARCH_PACKAGE_DIGEST_BYTES, operator_config_from_binding, + prepared_search_config_from_binding, + prepared_search_core_package_from_bytes, + prepared_search_core_package_to_bytes, + prepared_search_core_package_to_compressed_bytes, + prepared_search_package_from_bytes, + prepared_search_package_has_core_payload, prepared_search_package_to_bytes, + prepared_search_package_to_compressed_bytes, + static_redaction_diagnostics_to_utf16_binding, write_package_header, + }; + use stella_anonymize_core::{ + DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, + StaticRedactionDiagnostics, + }; + + #[test] + fn prepared_search_package_roundtrips_config_and_artifacts() { + let config = package_test_config(); + let artifacts = b"prepared-artifacts"; + + let bytes = prepared_search_package_to_bytes(&config, artifacts).unwrap(); + let package = prepared_search_package_from_bytes(&bytes).unwrap(); + + assert_eq!(package.config, config); + assert_eq!(package.artifacts, artifacts); + } + + #[test] + fn prepared_search_package_rejects_invalid_bytes() { + let error = prepared_search_package_from_bytes(b"not-valid").unwrap_err(); + + assert!( + matches!(error, ContractError::InvalidPreparedSearchPackage { .. 
}), + "invalid package bytes should fail before config construction" + ); + } + + #[test] + fn prepared_search_package_rejects_digest_mismatch() { + let config = BindingPreparedSearchConfig::default(); + let mut bytes = + prepared_search_package_to_bytes(&config, b"artifact").unwrap(); + let last = bytes.last_mut().unwrap(); + *last ^= 0x01; + + let error = prepared_search_package_from_bytes(&bytes).unwrap_err(); + + assert!( + matches!(error, ContractError::InvalidPreparedSearchPackage { .. }), + "corrupted package payload should fail digest verification" + ); + } + + #[test] + fn binding_operator_config_accepts_camel_case_redact_string() { + let config = serde_json::from_str::( + r#"{"operators":{"country":"redact"},"redactString":"***"}"#, + ) + .unwrap(); + let operators = operator_config_from_binding(Some(config)).unwrap(); + + assert_eq!(operators.redact_string, "***"); + } + + #[test] + fn binding_search_options_accept_regex_overlap_all() { + let config = BindingPreparedSearchConfig { + custom_regex_options: Some(BindingSearchOptions { + regex_overlap_all: Some(true), + ..BindingSearchOptions::default() + }), + ..BindingPreparedSearchConfig::default() + }; + let core = prepared_search_config_from_binding(config).unwrap(); + + assert!(core.custom_regex_options.regex.overlap_all); + } + + #[test] + fn utf16_diagnostics_reject_invalid_byte_offsets() { + let diagnostics = StaticRedactionDiagnostics { + events: vec![DiagnosticEvent { + stage: DiagnosticStage::EntityRegex, + kind: DiagnosticEventKind::Entity, + count: None, + engine: None, + pattern: None, + source: None, + source_detail: None, + label: None, + start: Some(1), + end: Some(2), + text: None, + score: None, + span_valid: None, + elapsed_us: None, + input_bytes: None, + reason: None, + }], + }; + + let error = static_redaction_diagnostics_to_utf16_binding(diagnostics, "á") + .unwrap_err(); + + assert!(matches!( + error, + ContractError::InvalidBindingOffset { offset: 1 } + )); + } + + #[test] + fn 
prepared_search_compressed_package_roundtrips_config_and_artifacts() { + let config = package_test_config(); + let artifacts = b"prepared-artifacts"; + + let bytes = + prepared_search_package_to_compressed_bytes(&config, artifacts).unwrap(); + let package = prepared_search_package_from_bytes(&bytes).unwrap(); + + assert_eq!(package.config, config); + assert_eq!(package.artifacts, artifacts); + } + + #[test] + fn prepared_search_compressed_package_rejects_digest_mismatch() { + let config = BindingPreparedSearchConfig::default(); + let mut bytes = + prepared_search_package_to_compressed_bytes(&config, b"artifact") + .unwrap(); + let last = bytes.last_mut().unwrap(); + *last ^= 0x01; + + let error = prepared_search_package_from_bytes(&bytes).unwrap_err(); + + assert!( + matches!(error, ContractError::InvalidPreparedSearchPackage { .. }), + "corrupted compressed package should fail digest verification" + ); + } + + #[test] + fn prepared_search_compressed_package_rejects_oversized_payload_len() { + let bytes = compressed_package_with_len( + PREPARED_SEARCH_COMPRESSED_PACKAGE_HEADER, + PREPARED_SEARCH_COMPRESSED_PACKAGE_VERSION, + oversized_payload_len(), + ); + let error = prepared_search_package_from_bytes(&bytes).unwrap_err(); + + assert_invalid_package_reason( + error, + "uncompressed payload length exceeds limit", + ); + } + + #[test] + fn prepared_search_core_compressed_package_rejects_oversized_payload_len() { + let bytes = compressed_package_with_len( + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_HEADER, + PREPARED_SEARCH_CORE_COMPRESSED_PACKAGE_VERSION, + oversized_payload_len(), + ); + let error = prepared_search_core_package_from_bytes(&bytes).unwrap_err(); + + assert_invalid_package_reason( + error, + "uncompressed payload length exceeds limit", + ); + } + + #[test] + fn prepared_search_core_package_roundtrips_config_and_artifacts() { + let config = + prepared_search_config_from_binding(package_test_config()).unwrap(); + let mut compact_config = config.clone(); + 
compact_config.literal_patterns.clear(); + let artifacts = b"prepared-artifacts"; + + let bytes = + prepared_search_core_package_to_bytes(&config, artifacts).unwrap(); + let package = prepared_search_core_package_from_bytes(&bytes).unwrap(); + let binding_error = prepared_search_package_from_bytes(&bytes).unwrap_err(); + + assert!(prepared_search_package_has_core_payload(&bytes)); + assert_eq!(package.config, compact_config); + assert_eq!(package.artifacts, artifacts); + assert!( + matches!( + binding_error, + ContractError::InvalidPreparedSearchPackage { .. } + ), + "binding package loader should reject core payloads" + ); + } + + #[test] + fn prepared_search_core_compressed_package_roundtrips_config_and_artifacts() { + let config = + prepared_search_config_from_binding(package_test_config()).unwrap(); + let mut compact_config = config.clone(); + compact_config.literal_patterns.clear(); + let artifacts = b"prepared-artifacts"; + + let bytes = + prepared_search_core_package_to_compressed_bytes(&config, artifacts) + .unwrap(); + let package = prepared_search_core_package_from_bytes(&bytes).unwrap(); + + assert!(prepared_search_package_has_core_payload(&bytes)); + assert_eq!(package.config, compact_config); + assert_eq!(package.artifacts, artifacts); + } + + fn package_test_config() -> BindingPreparedSearchConfig { + BindingPreparedSearchConfig { + literal_patterns: vec![BindingSearchPattern { + kind: String::from("literal"), + pattern: String::from("Acme"), + distance: None, + case_insensitive: None, + whole_words: None, + lazy: None, + prefilter_any: None, + prefilter_case_insensitive: None, + prefilter_regex: None, + }], + ..BindingPreparedSearchConfig::default() + } + } + + fn compressed_package_with_len( + header: [u8; 8], + version: u32, + uncompressed_len: u64, + ) -> Vec { + let digest = [0; PREPARED_SEARCH_PACKAGE_DIGEST_BYTES]; + let mut bytes = Vec::new(); + write_package_header(&mut bytes, header, version, &digest); + 
bytes.extend_from_slice(&uncompressed_len.to_le_bytes()); + bytes + } + + fn oversized_payload_len() -> u64 { + u64::try_from(MAX_PREPARED_SEARCH_PACKAGE_PAYLOAD_BYTES) + .unwrap() + .checked_add(1) + .unwrap() + } + + fn assert_invalid_package_reason(error: ContractError, expected: &str) { + assert!( + matches!( + error, + ContractError::InvalidPreparedSearchPackage { reason } + if reason == expected + ), + "expected invalid package reason: {expected}" + ); + } +} diff --git a/crates/anonymize-core/Cargo.toml b/crates/anonymize-core/Cargo.toml new file mode 100644 index 00000000..38a213c7 --- /dev/null +++ b/crates/anonymize-core/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "stella-anonymize-core" +version.workspace = true +edition.workspace = true +description = "Core anonymization semantics" +license.workspace = true +publish.workspace = true +repository.workspace = true +keywords = ["anonymization", "pii", "redaction", "text"] +categories = ["text-processing"] + +[dependencies] +bon = "3.9.3" +fancy-regex = "0.18" +regex = "1" +serde = { version = "1", features = ["derive"] } +stella-stdnum-core = { version = "2.1.1", git = "https://github.com/stella/stdnum", rev = "2f3c3f107e3976ac059cc438d77916a592595d59" } +stella-text-search-core = { version = "1.0.6", git = "https://github.com/stella/text-search", rev = "0e44094dbcd027218a767439ded062bf615015d0" } + +[dev-dependencies] +proptest = "1" + +[lints] +workspace = true diff --git a/crates/anonymize-core/data/address-final-abbrevs.txt b/crates/anonymize-core/data/address-final-abbrevs.txt new file mode 100644 index 00000000..83b517a7 --- /dev/null +++ b/crates/anonymize-core/data/address-final-abbrevs.txt @@ -0,0 +1,5 @@ +St. +Ave. +Rd. +Blvd. +Sq. diff --git a/crates/anonymize-core/data/legal-period-suffixes.txt b/crates/anonymize-core/data/legal-period-suffixes.txt new file mode 100644 index 00000000..81f05910 --- /dev/null +++ b/crates/anonymize-core/data/legal-period-suffixes.txt @@ -0,0 +1,13 @@ +Inc. 
+Ltd. +Corp. +N.A. +Kft. +S.A. +a.s. +a. s. +s.r.o. +spol. s r.o. +Pty Ltd. +Ltda. +S.a.s. diff --git a/crates/anonymize-core/src/address_context.rs b/crates/anonymize-core/src/address_context.rs new file mode 100644 index 00000000..b707e5a0 --- /dev/null +++ b/crates/anonymize-core/src/address_context.rs @@ -0,0 +1,738 @@ +use std::collections::BTreeSet; + +use regex::Regex; + +use crate::byte_offsets::ByteOffsets; +use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; +use crate::types::{Error, Result}; + +const HEADER_ZONE_PERCENT: usize = 15; +const STREET_CONTEXT_WINDOW: u32 = 200; +const BARE_HOUSE_CONTEXT_WINDOW: u32 = 50; +const MAX_BACKWARD_WORDS: usize = 5; + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct AddressContextData { + #[serde(default)] + pub address_prepositions: Vec, + #[serde(default)] + pub temporal_prepositions: Vec, + #[serde(default)] + pub street_abbreviations: Vec, + #[serde(default)] + pub bare_house_stopwords: Vec, +} + +pub(crate) struct PreparedAddressContextData { + address_prepositions: BTreeSet, + temporal_prepositions: BTreeSet, + street_abbreviations: BTreeSet, + bare_house_stopwords: BTreeSet, + slash_house_number: Regex, + bare_house_number: Regex, + orphan_street_line: Regex, +} + +struct WordBefore { + start: usize, + raw: String, + normalized: String, + has_dot: bool, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct ScanRange { + start: usize, + end: usize, +} + +impl PreparedAddressContextData { + pub(crate) fn new(data: AddressContextData) -> Result { + Ok(Self { + address_prepositions: lowercased_set(data.address_prepositions), + temporal_prepositions: lowercased_set(data.temporal_prepositions), + street_abbreviations: lowercased_set(data.street_abbreviations), + bare_house_stopwords: lowercased_set(data.bare_house_stopwords), + slash_house_number: compile_regex( + "address_context.slash_house_number", + 
r"(?u)\b(?:\d{1,4}/\d+[A-Za-z]\b|\d{3,4}/\d+\b|(?:1[3-9]|[2-9]\d)/\d{3,}\b)", + )?, + bare_house_number: compile_regex( + "address_context.bare_house_number", + r"(?u)(?:^|\s)(?P\p{Lu}\p{Ll}[\p{Ll}\p{Lu}]+\s+\d{1,3})\b", + )?, + orphan_street_line: compile_regex( + "address_context.orphan_street_line", + r"(?um)^[^\S\n]*(?P\p{Lu}[\p{Ll}\p{Lu}]+(?:[^\S\n]+[\p{Lu}\p{Ll}][\p{Ll}]+)*[^\S\n]+\d{2,4}[A-Za-z]?)[^\S\n]*$", + )?, + }) + } + + pub(crate) fn process( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + let mut results = self + .detect_street_patterns_near_addresses(full_text, existing_entities)?; + let mut orphan_context = + Vec::with_capacity(existing_entities.len().saturating_add(results.len())); + orphan_context.extend_from_slice(existing_entities); + orphan_context.extend(results.iter().cloned()); + results + .extend(self.detect_orphan_street_lines(full_text, &orphan_context)?); + Ok(results) + } + + fn detect_street_patterns_near_addresses( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + let mut results = Vec::new(); + let address_entities = existing_entities + .iter() + .filter(|entity| entity.label == "address") + .filter(|entity| !is_caller_owned_entity(entity)) + .collect::>(); + let header_end = header_end(full_text); + let offsets = ByteOffsets::new(full_text); + let scan_ranges = address_context_scan_ranges( + full_text, + &offsets, + header_end, + &address_entities, + )?; + + for range in scan_ranges { + let Some(segment) = full_text.get(range.start..range.end) else { + continue; + }; + for found in self.slash_house_number.find_iter(segment) { + let num_start_byte = range.start.saturating_add(found.start()); + let num_end_byte = range.start.saturating_add(found.end()); + if !self.full_slash_house_match_is_identical( + full_text, + num_start_byte, + num_end_byte, + ) { + continue; + } + let num_start = + usize_to_u32("address_context.num_start", num_start_byte)?; + let 
num_end = usize_to_u32("address_context.num_end", num_end_byte)?; + if covered_by(existing_entities, num_start, num_end) { + continue; + } + + let in_header = num_start < header_end; + let near_address = address_entities.iter().any(|entity| { + within_context_window(&offsets, entity, num_start, num_end) + }); + if !in_header && !near_address { + continue; + } + + let Some(scan_start) = skip_whitespace_back(full_text, num_start_byte) + else { + continue; + }; + let Some((street_start, has_temporal_prep)) = + self.scan_street_start(full_text, scan_start)? + else { + continue; + }; + let street_start_u32 = + usize_to_u32("address_context.street_start", street_start)?; + if has_temporal_prep { + continue; + } + if covered_by(existing_entities, street_start_u32, num_end) { + continue; + } + + let street_text = text_slice(full_text, street_start_u32, num_end)?; + if street_text.len() < 4 { + continue; + } + let score = address_context_score(full_text, street_start, in_header); + results.push(address_context_entity( + street_start_u32, + num_end, + "address", + street_text, + score, + DetectionSource::Regex, + )); + } + } + + self.detect_bare_house_numbers( + full_text, + existing_entities, + &mut results, + )?; + Ok(results) + } + + fn full_slash_house_match_is_identical( + &self, + full_text: &str, + start: usize, + end: usize, + ) -> bool { + self + .slash_house_number + .find_at(full_text, start) + .is_some_and(|found| found.start() == start && found.end() == end) + } + + fn scan_street_start( + &self, + full_text: &str, + mut scan_pos: usize, + ) -> Result> { + let mut has_temporal_prep = false; + let mut street_start = scan_pos; + let mut word_count = 0usize; + + while word_count < MAX_BACKWARD_WORDS { + let Some(word) = word_before(full_text, scan_pos)? 
else { + break; + }; + if word.normalized.is_empty() { + break; + } + + let is_street_abbrev = word.has_dot + && self.street_abbreviations.contains(&word.raw.to_lowercase()); + let lower_word = word.normalized.to_lowercase(); + let is_prep = self.address_prepositions.contains(&lower_word); + let is_upper = word + .normalized + .chars() + .next() + .is_some_and(char::is_uppercase); + let is_digit_token = is_short_ascii_digit_token(&word.normalized); + if !is_upper && !is_prep && !is_street_abbrev && !is_digit_token { + break; + } + if is_prep && self.temporal_prepositions.contains(&lower_word) { + has_temporal_prep = true; + } + + street_start = word.start; + word_count = word_count.saturating_add(1); + + let before_word = skip_whitespace_back(full_text, word.start); + let Some(next_scan_pos) = before_word else { + break; + }; + let Some((_, previous)) = previous_char(full_text, next_scan_pos) else { + break; + }; + if matches!(previous, '\n' | '\t' | ';' | ',') { + break; + } + scan_pos = next_scan_pos; + } + + if word_count == 0 { + return Ok(None); + } + Ok(Some((street_start, has_temporal_prep))) + } + + fn detect_bare_house_numbers( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + results: &mut Vec, + ) -> Result<()> { + let offsets = ByteOffsets::new(full_text); + let ranges = + bare_house_scan_ranges(full_text, &offsets, existing_entities, results)?; + for range in ranges { + let Some(segment) = full_text.get(range.start..range.end) else { + continue; + }; + for captures in self.bare_house_number.captures_iter(segment) { + let Some(full_match) = captures.get(0) else { + continue; + }; + let match_start = range.start.saturating_add(full_match.start()); + let match_end = range.start.saturating_add(full_match.end()); + if !self.full_bare_house_match_is_identical( + full_text, + match_start, + match_end, + ) { + continue; + } + let Some(captured) = captures.name("value") else { + continue; + }; + let start = usize_to_u32( + 
"address_context.bare_start", + range.start.saturating_add(captured.start()), + )?; + let end = usize_to_u32( + "address_context.bare_end", + range.start.saturating_add(captured.end()), + )?; + if !near_confirmed_address_same_line( + full_text, + existing_entities, + results, + start, + end, + )? { + continue; + } + + let word = captured + .as_str() + .split_whitespace() + .next() + .unwrap_or("") + .to_lowercase(); + if self.bare_house_stopwords.contains(&word) { + continue; + } + if overlaps_any(existing_entities, start, end) + || overlaps_any(results, start, end) + { + continue; + } + + results.push(address_context_entity( + start, + end, + "address", + captured.as_str(), + 0.75, + DetectionSource::Regex, + )); + } + } + Ok(()) + } + + fn detect_orphan_street_lines( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + let header_end = header_end(full_text); + let offsets = ByteOffsets::new(full_text); + let header_scan_end = header_scan_end(full_text, &offsets, header_end)?; + let header = + full_text + .get(..header_scan_end) + .ok_or_else(|| Error::InvalidSpan { + start: 0, + end: u32::try_from(header_scan_end).unwrap_or(u32::MAX), + })?; + let context_entities = existing_entities + .iter() + .filter(|entity| { + !(entity.label == "address" && is_caller_owned_entity(entity)) + }) + .collect::>(); + let mut results = Vec::new(); + + for captures in self.orphan_street_line.captures_iter(header) { + let Some(captured) = captures.name("value") else { + continue; + }; + let start = + usize_to_u32("address_context.orphan_start", captured.start())?; + let end = usize_to_u32("address_context.orphan_end", captured.end())?; + if start >= header_end || covered_by(existing_entities, start, end) { + continue; + } + let has_context = context_entities + .iter() + .any(|entity| within_context_window(&offsets, entity, start, end)); + if !has_context { + continue; + } + + results.push(address_context_entity( + start, + end, + "address", + 
captured.as_str(), + 0.85, + DetectionSource::Regex, + )); + } + Ok(results) + } + + fn full_bare_house_match_is_identical( + &self, + full_text: &str, + start: usize, + end: usize, + ) -> bool { + self + .bare_house_number + .find_at(full_text, start) + .is_some_and(|found| found.start() == start && found.end() == end) + } +} + +fn lowercased_set(values: Vec) -> BTreeSet { + values + .into_iter() + .map(|value| value.to_lowercase()) + .collect() +} + +fn address_context_scan_ranges( + full_text: &str, + offsets: &ByteOffsets<'_>, + header_end: u32, + address_entities: &[&PipelineEntity], +) -> Result> { + let mut ranges = Vec::new(); + let header_end = offsets.validate_offset(header_end)?; + if header_end > 0 { + ranges.push(ScanRange { + start: 0, + end: header_end, + }); + } + + for entity in address_entities { + let start = + offsets.offset_before_utf16_units(entity.start, STREET_CONTEXT_WINDOW)?; + let end = + offsets.offset_after_utf16_units(entity.end, STREET_CONTEXT_WINDOW)?; + push_scan_range(full_text, &mut ranges, start, end)?; + } + + Ok(merge_scan_ranges(ranges)) +} + +fn bare_house_scan_ranges( + full_text: &str, + offsets: &ByteOffsets<'_>, + existing_entities: &[PipelineEntity], + new_entities: &[PipelineEntity], +) -> Result> { + let mut ranges = Vec::new(); + for entity in existing_entities.iter().chain(new_entities.iter()) { + if entity.label != "address" || is_caller_owned_entity(entity) { + continue; + } + let start = offsets + .offset_before_utf16_units(entity.start, BARE_HOUSE_CONTEXT_WINDOW)?; + let end = offsets + .offset_after_utf16_units(entity.end, BARE_HOUSE_CONTEXT_WINDOW)?; + ranges.push(line_expanded_scan_range(full_text, offsets, start, end)?); + } + Ok(merge_scan_ranges(ranges)) +} + +fn push_scan_range( + full_text: &str, + ranges: &mut Vec, + start: u32, + end: u32, +) -> Result<()> { + if start >= end { + return Ok(()); + } + let start = usize::try_from(start) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: start })?; + 
let end = usize::try_from(end) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: end })?; + if start > full_text.len() || end > full_text.len() { + return Err(Error::ByteOffsetOutOfBounds { offset: u32::MAX }); + } + ranges.push(ScanRange { start, end }); + Ok(()) +} + +fn merge_scan_ranges(mut ranges: Vec) -> Vec { + ranges.sort_by_key(|range| (range.start, range.end)); + let mut merged = Vec::::new(); + for range in ranges { + let Some(last) = merged.last_mut() else { + merged.push(range); + continue; + }; + if range.start <= last.end { + last.end = last.end.max(range.end); + continue; + } + merged.push(range); + } + merged +} + +fn line_expanded_scan_range( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, + end: u32, +) -> Result { + let start = offsets.validate_offset(start)?; + let end = offsets.validate_offset(end)?; + let line_start = full_text + .get(..start) + .and_then(|prefix| prefix.rfind('\n').map(|index| index.saturating_add(1))) + .unwrap_or(0); + let line_end = full_text + .get(end..) 
+ .and_then(|suffix| suffix.find('\n').map(|index| end.saturating_add(index))) + .unwrap_or(full_text.len()); + Ok(ScanRange { + start: line_start, + end: line_end, + }) +} + +fn header_scan_end( + full_text: &str, + offsets: &ByteOffsets<'_>, + header_end: u32, +) -> Result { + let header_end = offsets.validate_offset(header_end)?; + let tail = full_text.get(header_end..).ok_or(Error::InvalidSpan { + start: u32::try_from(header_end).unwrap_or(u32::MAX), + end: offsets.len()?, + })?; + let Some(relative_newline) = tail.find('\n') else { + return Ok(full_text.len()); + }; + Ok(header_end.saturating_add(relative_newline)) +} + +fn compile_regex(field: &'static str, pattern: &str) -> Result { + Regex::new(pattern).map_err(|error| Error::InvalidStaticData { + field, + reason: error.to_string(), + }) +} + +fn header_end(full_text: &str) -> u32 { + let text_len = full_text.chars().map(char::len_utf16).sum::(); + let cutoff = text_len.saturating_mul(HEADER_ZONE_PERCENT).div_euclid(100); + let mut units = 0usize; + for (byte, ch) in full_text.char_indices() { + if units >= cutoff { + return u32::try_from(byte).unwrap_or(u32::MAX); + } + units = units.saturating_add(ch.len_utf16()); + } + u32::try_from(full_text.len()).unwrap_or(u32::MAX) +} + +const fn is_caller_owned_entity(entity: &PipelineEntity) -> bool { + matches!( + entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) +} + +fn covered_by(entities: &[PipelineEntity], start: u32, end: u32) -> bool { + entities + .iter() + .any(|entity| entity.start <= start && entity.end >= end) +} + +fn overlaps_any(entities: &[PipelineEntity], start: u32, end: u32) -> bool { + entities + .iter() + .any(|entity| entity.start < end && entity.end > start) +} + +fn address_context_entity( + start: u32, + end: u32, + label: impl Into, + text: impl Into, + score: f64, + source: DetectionSource, +) -> PipelineEntity { + let mut entity = + PipelineEntity::detected(start, end, label, text, score, 
/// Returns the character that ends immediately before byte offset `pos`,
/// together with the byte offset at which that character starts.
///
/// Yields `None` when `pos` is zero, lies past the end of the text, or does
/// not fall on a character boundary.
fn previous_char(full_text: &str, pos: usize) -> Option<(usize, char)> {
  let prefix = full_text.get(..pos)?;
  let last = prefix.chars().next_back()?;
  // The final character of the prefix starts its own width before the end.
  Some((prefix.len().saturating_sub(last.len_utf8()), last))
}
/// Reports whether `value` consists of exactly one or two ASCII digits.
///
/// ASCII digits are single-byte, so checking the byte length and every byte
/// is equivalent to counting characters; any non-ASCII character contributes
/// bytes that are not digits and is rejected.
fn is_short_ascii_digit_token(value: &str) -> bool {
  matches!(value.len(), 1 | 2)
    && value.bytes().all(|byte| byte.is_ascii_digit())
}
usize, + in_header: bool, +) -> f64 { + let before_start = street_start.saturating_sub(5); + let has_colon = full_text + .get(before_start..street_start) + .is_some_and(|before| before.contains(':')); + if has_colon { + return 0.95; + } + if in_header { + return 0.85; + } + 0.8 +} + +fn text_slice(full_text: &str, start: u32, end: u32) -> Result<&str> { + let start_usize = usize::try_from(start) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: start })?; + let end_usize = usize::try_from(end) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: end })?; + full_text + .get(start_usize..end_usize) + .ok_or(Error::InvalidSpan { start, end }) +} + +fn usize_to_u32(field: &'static str, value: usize) -> Result { + u32::try_from(value).map_err(|_| Error::InvalidStaticData { + field, + reason: "span offset exceeds u32 range".to_owned(), + }) +} diff --git a/crates/anonymize-core/src/address_seeds.rs b/crates/anonymize-core/src/address_seeds.rs new file mode 100644 index 00000000..7651ab9f --- /dev/null +++ b/crates/anonymize-core/src/address_seeds.rs @@ -0,0 +1,1355 @@ +use std::collections::BTreeSet; + +use regex::Regex; + +use crate::processors::PatternSlice; +use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; +use crate::search::{SearchIndex, SearchOptions, SearchPattern}; +use crate::types::{Error, Result, SearchEngine, SearchMatch}; + +const ADDRESS_SCORE_BASE: f64 = 0.5; +const ADDRESS_SCORE_MAX: f64 = 0.95; +const ADDRESS_CLUSTER_MAX_GAP: usize = 150; +const ADDRESS_RIGHT_EXPAND_LIMIT: usize = 200; +const BR_CEP_CONTEXT_WINDOW: usize = 200; +const PLAIN_POSTAL_CONTEXT_WINDOW: usize = 120; +const US_ZIP_CONTEXT_WINDOW: usize = 120; + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct AddressSeedData { + #[serde(default)] + pub boundary_words: Vec, + #[serde(default)] + pub br_cep_cue_words: Vec, + #[serde(default)] + pub unit_abbreviations: Vec, +} + +pub(crate) struct 
PreparedAddressSeedData { + boundary_search: Option, + br_cep_cue_search: Option, + unit_abbreviations: BTreeSet, + postal_code_re: Regex, + br_cep_shape_re: Regex, + us_zip_plus_four_shape_re: Regex, + us_state_before_zip_re: Regex, + house_number_before_street_re: Regex, + house_number_after_street_re: Regex, + italian_cap_re: Regex, + street_number_re: Regex, +} + +impl PreparedAddressSeedData { + pub(crate) fn new(data: AddressSeedData) -> Result { + Ok(Self { + boundary_search: literal_search(data.boundary_words)?, + br_cep_cue_search: literal_search(data.br_cep_cue_words)?, + unit_abbreviations: lowercased_set(data.unit_abbreviations), + postal_code_re: compile_regex( + r"(?u)(?:\d{5}[-‐‑‒–—―]\d{4}|\d{5}[-‐‑‒–—―]\d{3}|\d{3}\s\d{2}|\d{2}[-‐‑‒–—―]\d{3}|\d{5})", + )?, + br_cep_shape_re: compile_regex(r"(?u)^\d{5}[-‐‑‒–—―]\d{3}$")?, + us_zip_plus_four_shape_re: compile_regex(r"(?u)^\d{5}[-‐‑‒–—―]\d{4}$")?, + us_state_before_zip_re: compile_regex( + r"(?u)(?:^|[^A-Za-z0-9])(?PA[KLRZ]|C[AOT]|D[CE]|F[LM]|G[AU]|HI|I[ADLN]|K[SY]|LA|M[ADEHINOPST]|N[CDEHJMVY]|O[HKR]|P[AR]|RI|S[CD]|T[NX]|UT|V[AIT]|W[AIVY])\s*,?\s*$", + )?, + house_number_before_street_re: compile_regex( + r"(?u)\b\d{1,6}(?:[-/]\d{1,6})?\s+(?:\p{Lu}\p{L}+[^\S\n\t]+){0,4}$", + )?, + house_number_after_street_re: compile_regex( + r"(?u)^[^\S\n\t]+\d{1,6}(?:[-/]\d{1,6})?\b", + )?, + italian_cap_re: compile_regex(r"(?u)\b(?P\d{5})\s+\p{Lu}\p{L}+")?, + street_number_re: compile_regex( + r"(?u)\b(?P\p{Lu}\p{Ll}{2,})\s+(?P\d{1,5}(?:/\d{1,5})?)\s*[,\n]", + )?, + }) + } + + pub(crate) fn process( + &self, + matches: &[SearchMatch], + street_type_slice: PatternSlice, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + let seeds = self.collect_seeds( + matches, + street_type_slice, + full_text, + existing_entities, + )?; + let clusters = cluster_seeds(&seeds, full_text, existing_entities); + let mut results = Vec::new(); + + for cluster in clusters { + let score = score_cluster(&cluster); + 
if score < 0.6 { + continue; + } + let span = self.expand_cluster(full_text, &cluster, existing_entities); + let Some(raw_text) = full_text.get(span.start..span.end) else { + continue; + }; + let resolution = resolve_newline_boundary(span.start, raw_text, &cluster); + if resolution == NewlineBoundaryResolution::Drop { + continue; + } + let relative_end = match resolution { + NewlineBoundaryResolution::Keep => raw_text.len(), + NewlineBoundaryResolution::Drop => 0, + NewlineBoundaryResolution::Trim { relative_end } => relative_end, + }; + let effective_raw = raw_text.get(..relative_end).unwrap_or_default(); + let leading = effective_raw + .len() + .saturating_sub(effective_raw.trim_start().len()); + let start = span.start.saturating_add(leading); + let end = trim_address_tail( + full_text, + start, + span.start.saturating_add(effective_raw.len()), + ); + let effective_text = full_text.get(start..end).unwrap_or_default(); + let effective_len = text_units(effective_text); + if !(5..=300).contains(&effective_len) { + continue; + } + results.push(PipelineEntity::detected( + u32::try_from(start).unwrap_or(u32::MAX), + u32::try_from(end).unwrap_or(u32::MAX), + "address", + effective_text, + score, + DetectionSource::Regex, + )); + } + + Ok(results) + } + + fn collect_seeds( + &self, + matches: &[SearchMatch], + street_type_slice: PatternSlice, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + let mut seeds = + self.collect_street_type_seeds(matches, street_type_slice, full_text)?; + collect_existing_entity_seeds(&mut seeds, full_text, existing_entities); + self.collect_street_number_seeds(&mut seeds, full_text, existing_entities); + self.collect_postal_code_seeds(&mut seeds, full_text); + self.collect_italian_cap_seeds(&mut seeds, full_text); + seeds.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| left.end.cmp(&right.end)) + .then_with(|| left.kind.cmp(&right.kind)) + }); + Ok(seeds) + } + + fn 
collect_street_type_seeds( + &self, + matches: &[SearchMatch], + street_type_slice: PatternSlice, + full_text: &str, + ) -> Result> { + let mut seeds = Vec::new(); + for found in matches { + if street_type_slice.local_index(found.pattern()).is_none() { + continue; + } + let Some(seed) = seed_from_match(full_text, found, SeedType::StreetWord)? + else { + continue; + }; + if is_lowercase_street_word_in_prose(full_text, &seed, self) { + continue; + } + seeds.push(seed); + } + Ok(seeds) + } + + fn collect_postal_code_seeds(&self, seeds: &mut Vec, full_text: &str) { + for found in self.postal_code_re.find_iter(full_text) { + let start = found.start(); + let end = found.end(); + let text = found.as_str(); + if !postal_boundaries(full_text, start, end) { + continue; + } + let is_plain_five_digit = is_plain_five_digit_postal_code(text); + if seed_covered(seeds, start, end) && !is_plain_five_digit { + continue; + } + if is_plain_five_digit + && !self.has_plain_postal_context(full_text, start, end, seeds) + { + continue; + } + if self.br_cep_shape_re.is_match(text) + && !self.has_br_cue_nearby(full_text, start, end) + { + continue; + } + if self.us_zip_plus_four_shape_re.is_match(text) { + let context = self.us_zip_plus_four_context(full_text, start, seeds); + if !context.has_context { + continue; + } + if let Some(state_seed) = context.state_seed + && !seed_covered(seeds, state_seed.start, state_seed.end) + { + seeds.push(state_seed); + } + } + seeds.push(Seed { + kind: SeedType::PostalCode, + start, + end, + text: text.to_owned(), + }); + } + } + + fn has_plain_postal_context( + &self, + full_text: &str, + start: usize, + end: usize, + seeds: &[Seed], + ) -> bool { + seeds.iter().any(|seed| { + within_text_window( + full_text, + seed.start, + start, + PLAIN_POSTAL_CONTEXT_WINDOW, + ) && match seed.kind { + SeedType::AddressTrigger => true, + SeedType::City | SeedType::State => { + seed.end >= start && seed.start <= end.saturating_add(4) + || seed.end <= start + && 
full_text.get(seed.end..start).is_some_and(is_city_zip_gap) + } + SeedType::StreetWord => { + has_house_number_near_street_word(full_text, seed, self) + } + SeedType::PostalCode => false, + } + }) + } + + fn collect_italian_cap_seeds(&self, seeds: &mut Vec, full_text: &str) { + for captures in self.italian_cap_re.captures_iter(full_text) { + let Some(found) = captures.name("cap") else { + continue; + }; + let start = found.start(); + let end = found.end(); + if seed_covered(seeds, start, end) { + continue; + } + if !has_nearby_italian_cap_evidence(full_text, seeds, start) { + continue; + } + seeds.push(Seed { + kind: SeedType::PostalCode, + start, + end, + text: found.as_str().to_owned(), + }); + } + } + + fn collect_street_number_seeds( + &self, + seeds: &mut Vec, + full_text: &str, + existing_entities: &[PipelineEntity], + ) { + for captures in self.street_number_re.captures_iter(full_text) { + let Some(full) = captures.get(0) else { + continue; + }; + let Some(street) = captures.name("street") else { + continue; + }; + let Some(number) = captures.name("num") else { + continue; + }; + let start = full.start(); + let end = number.end(); + if range_overlaps_non_address(start, end, existing_entities) { + continue; + } + seeds.push(Seed { + kind: SeedType::StreetWord, + start, + end, + text: format!("{} {}", street.as_str(), number.as_str()), + }); + } + } + + fn has_br_cue_nearby( + &self, + full_text: &str, + start: usize, + end: usize, + ) -> bool { + let Some(search) = &self.br_cep_cue_search else { + return false; + }; + let window_start = + offset_before_text_units(full_text, start, BR_CEP_CONTEXT_WINDOW); + let window_end = + offset_after_text_units(full_text, end, BR_CEP_CONTEXT_WINDOW); + full_text + .get(window_start..window_end) + .is_some_and(|window| search.is_match(window).unwrap_or(false)) + } + + fn us_zip_plus_four_context( + &self, + full_text: &str, + start: usize, + seeds: &[Seed], + ) -> UsZipPlusFourContext { + if let Some(state_seed) = 
self.us_state_seed_before_zip(full_text, start) { + return UsZipPlusFourContext { + state_seed: Some(state_seed), + has_context: true, + }; + } + + let has_context = seeds.iter().any(|seed| { + within_text_window(full_text, seed.start, start, US_ZIP_CONTEXT_WINDOW) + && match seed.kind { + SeedType::AddressTrigger => true, + SeedType::City => { + seed.end <= start + && full_text.get(seed.end..start).is_some_and(is_city_zip_gap) + } + SeedType::StreetWord => { + has_house_number_near_street_word(full_text, seed, self) + } + SeedType::PostalCode | SeedType::State => false, + } + }); + + UsZipPlusFourContext { + state_seed: None, + has_context, + } + } + + fn us_state_seed_before_zip( + &self, + full_text: &str, + start: usize, + ) -> Option { + let window_start = floor_char_boundary(full_text, start.saturating_sub(24)); + let window = full_text.get(window_start..start)?; + let captures = self.us_state_before_zip_re.captures(window)?; + let state = captures.name("state")?; + Some(Seed { + kind: SeedType::State, + start: window_start.saturating_add(state.start()), + end: window_start.saturating_add(state.end()), + text: state.as_str().to_owned(), + }) + } + + fn expand_cluster( + &self, + full_text: &str, + cluster: &SeedCluster, + existing_entities: &[PipelineEntity], + ) -> Span { + let left_bound = nearest_left_non_address( + full_text, + cluster.start, + existing_entities, + cluster_starts_with_street_type_word(cluster), + ); + let left_pos = expand_left(full_text, cluster.start, left_bound); + if !cluster.has_expandable_address_context() { + return Span { + start: left_pos.min(cluster.start), + end: cluster.end, + }; + } + + let right_pos = self.expand_right(full_text, cluster, existing_entities); + Span { + start: left_pos.min(cluster.start), + end: right_pos.max(cluster.end), + } + } + + fn expand_right( + &self, + full_text: &str, + cluster: &SeedCluster, + existing_entities: &[PipelineEntity], + ) -> usize { + let right_pos = cluster.end; + let remaining = 
full_text.get(right_pos..).unwrap_or_default(); + let mut nearest_boundary = + utf16_cap_at_char_boundary(remaining, ADDRESS_RIGHT_EXPAND_LIMIT); + + if let Some(boundary) = self.nearest_boundary_word(full_text, right_pos) { + nearest_boundary = nearest_boundary.min(boundary); + } + if let Some(entity_boundary) = + nearest_right_non_address(right_pos, existing_entities) + { + nearest_boundary = nearest_boundary.min(entity_boundary); + } + if let Some(double_newline) = remaining.find("\n\n") { + nearest_boundary = nearest_boundary.min(double_newline); + } + if let Some(sentence_boundary) = + sentence_boundary(remaining, &self.unit_abbreviations) + { + nearest_boundary = nearest_boundary.min(sentence_boundary); + } + + let end = right_pos.saturating_add(nearest_boundary); + trim_address_tail(full_text, right_pos, end) + } + + fn nearest_boundary_word( + &self, + full_text: &str, + right_pos: usize, + ) -> Option { + let search = self.boundary_search.as_ref()?; + search + .find_iter(full_text) + .ok()? 
+ .into_iter() + .filter_map(|found| { + let start = usize::try_from(found.start()).ok()?; + (start >= right_pos).then_some(start.saturating_sub(right_pos)) + }) + .min() + } +} + +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] +enum SeedType { + StreetWord, + PostalCode, + City, + State, + AddressTrigger, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct Seed { + kind: SeedType, + start: usize, + end: usize, + text: String, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct SeedCluster { + seeds: Vec, + start: usize, + end: usize, +} + +impl SeedCluster { + fn has_expandable_address_context(&self) -> bool { + self.seeds.iter().any(|seed| { + matches!( + seed.kind, + SeedType::StreetWord | SeedType::PostalCode | SeedType::AddressTrigger + ) + }) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct Span { + start: usize, + end: usize, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct UsZipPlusFourContext { + state_seed: Option, + has_context: bool, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum NewlineBoundaryResolution { + Keep, + Drop, + Trim { relative_end: usize }, +} + +fn literal_search(patterns: Vec) -> Result> { + let patterns = patterns + .into_iter() + .filter(|pattern| !pattern.is_empty()) + .map(|pattern| SearchPattern::LiteralWithOptions { + pattern, + case_insensitive: Some(true), + whole_words: Some(true), + }) + .collect::>(); + if patterns.is_empty() { + return Ok(None); + } + Ok(Some(SearchIndex::new(patterns, SearchOptions::default())?)) +} + +fn lowercased_set(values: Vec) -> BTreeSet { + values + .into_iter() + .map(|value| value.to_lowercase()) + .collect() +} + +fn compile_regex(pattern: &str) -> Result { + Regex::new(pattern).map_err(|error| Error::Search { + engine: SearchEngine::Regex, + reason: error.to_string(), + }) +} + +fn seed_from_match( + full_text: &str, + found: &SearchMatch, + kind: SeedType, +) -> Result> { + let start = usize::try_from(found.start()).map_err(|_| { + 
Error::PatternIndexNotAddressable { + pattern: found.pattern(), + } + })?; + let end = usize::try_from(found.end()).map_err(|_| { + Error::PatternIndexNotAddressable { + pattern: found.pattern(), + } + })?; + let Some(text) = full_text.get(start..end) else { + return Ok(None); + }; + Ok(Some(Seed { + kind, + start, + end, + text: text.to_owned(), + })) +} + +fn collect_existing_entity_seeds( + seeds: &mut Vec, + full_text: &str, + existing_entities: &[PipelineEntity], +) { + for entity in existing_entities { + if entity.label != "address" { + continue; + } + if entity.source_detail == Some(SourceDetail::CustomDenyList) { + continue; + } + if overlaps_non_address(entity, existing_entities) { + continue; + } + let Some(kind) = kind_for_existing_entity(entity) else { + continue; + }; + if let Some(seed) = postal_seed_from_existing_address(full_text, entity) { + seeds.push(seed); + } + seeds.push(Seed { + kind, + start: usize::try_from(entity.start).unwrap_or(usize::MAX), + end: usize::try_from(entity.end).unwrap_or(usize::MAX), + text: entity.text.clone(), + }); + } +} + +fn postal_seed_from_existing_address( + full_text: &str, + entity: &PipelineEntity, +) -> Option { + if entity.source != DetectionSource::DenyList { + return None; + } + let mut start = usize::try_from(entity.start).ok()?; + let entity_end = usize::try_from(entity.end).ok()?; + while let Some((previous_start, ch)) = previous_char(full_text, start) { + if !ch.is_ascii_digit() { + break; + } + start = previous_start; + } + + let mut end = start; + while let Some((next_start, ch)) = next_char(full_text, end) { + if !ch.is_ascii_digit() { + break; + } + end = next_start.saturating_add(ch.len_utf8()); + } + if end > entity_end { + return None; + } + let text = full_text.get(start..end)?; + if !is_plain_five_digit_postal_code(text) { + return None; + } + Some(Seed { + kind: SeedType::PostalCode, + start, + end, + text: text.to_owned(), + }) +} + +fn kind_for_existing_entity(entity: &PipelineEntity) -> 
/// Reports whether `text` opens with at least one whitespace character and
/// the first non-whitespace character after it is lowercase.
///
/// Text that is empty, entirely whitespace, or that does not start with
/// whitespace yields `false`.
fn starts_with_whitespace_then_lowercase(text: &str) -> bool {
  let rest = text.trim_start();
  // A shorter remainder means leading whitespace was actually present.
  let had_leading_whitespace = rest.len() < text.len();
  had_leading_whitespace && rest.chars().next().is_some_and(char::is_lowercase)
}
/// Reports whether `text` is a non-empty separator made up solely of
/// whitespace and commas, as found between a city name and a postal code.
fn is_city_zip_gap(text: &str) -> bool {
  if text.is_empty() {
    return false;
  }
  // Reject as soon as any character is neither a comma nor whitespace.
  !text.chars().any(|ch| ch != ',' && !ch.is_whitespace())
}
/// Walks backwards from byte offset `end` over at most `max_units` UTF-16
/// code units and returns the byte offset where that window begins.
///
/// A character is included only if it fits in the remaining budget as a
/// whole. Returns `0` when the budget reaches the start of the text, or when
/// `end` is out of range or not on a character boundary.
fn offset_before_text_units(
  full_text: &str,
  end: usize,
  max_units: usize,
) -> usize {
  let Some(prefix) = full_text.get(..end) else {
    return 0;
  };
  let mut remaining = max_units;
  // Byte offset just after the character about to be examined.
  let mut boundary = prefix.len();
  for ch in prefix.chars().rev() {
    let Some(left) = remaining.checked_sub(ch.len_utf16()) else {
      return boundary;
    };
    remaining = left;
    boundary = boundary.saturating_sub(ch.len_utf8());
  }
  0
}
/// Reports whether `text` contains two newlines separated by nothing but
/// whitespace, i.e. a blank (or whitespace-only) line.
fn has_paragraph_break(text: &str) -> bool {
  // After dropping the piece before the first newline, every segment that
  // still has a successor is enclosed by a newline on both sides.
  let mut enclosed = text.split('\n').skip(1).peekable();
  while let Some(segment) = enclosed.next() {
    let closed_by_newline = enclosed.peek().is_some();
    if closed_by_newline && segment.chars().all(char::is_whitespace) {
      return true;
    }
  }
  false
}
/// Reports whether a date ending at `date_end` sits directly in front of a
/// street name starting at `street_start` on the same line.
///
/// That holds when the text between the two offsets is empty or consists
/// only of whitespace other than a newline. An inverted or unsliceable
/// range yields `false`.
fn date_can_prefix_street_name(
  full_text: &str,
  date_end: usize,
  street_start: usize,
) -> bool {
  if street_start < date_end {
    return false;
  }
  let Some(gap) = full_text.get(date_end..street_start) else {
    return false;
  };
  gap.chars().all(|ch| ch != '\n' && ch.is_whitespace())
}
left_bound: usize, +) -> Option<(usize, usize, &str)> { + let mut end = pos; + while end > left_bound { + let Some((prev_start, ch)) = previous_char(text, end) else { + break; + }; + if ch == ' ' || ch == ',' { + end = prev_start; + continue; + } + break; + } + if end <= left_bound { + return None; + } + + let mut start = end; + while start > left_bound { + let Some((prev_start, ch)) = previous_char(text, start) else { + break; + }; + if ch.is_whitespace() { + break; + } + start = prev_start; + } + let word = text.get(start..end)?; + Some((start, end, word)) +} + +fn starts_uppercase_or_digit(text: &str) -> bool { + text + .chars() + .next() + .is_some_and(|ch| ch.is_uppercase() || ch.is_ascii_digit()) +} + +fn is_left_address_label(text: &str) -> bool { + text.ends_with(':') +} + +fn trim_address_tail(full_text: &str, start: usize, mut end: usize) -> usize { + while end > start { + let Some((prev_start, ch)) = previous_char(full_text, end) else { + break; + }; + if is_address_trailing_trim(ch) { + end = prev_start; + continue; + } + break; + } + end +} + +fn sentence_boundary( + text: &str, + unit_abbreviations: &BTreeSet, +) -> Option { + let mut iter = text.char_indices().peekable(); + while let Some((index, ch)) = iter.next() { + if !matches!(ch, '.' | '!' | '?') { + continue; + } + if ch == '.' 
&& is_unit_abbreviation(text, index, unit_abbreviations) { + continue; + } + let mut saw_whitespace = false; + while let Some((_, next)) = iter.peek().copied() { + if !next.is_whitespace() { + break; + } + saw_whitespace = true; + iter.next(); + } + let Some((_, next)) = iter.peek().copied() else { + return Some(index); + }; + if saw_whitespace && (next.is_uppercase() || next.is_ascii_digit()) { + return Some(index); + } + } + None +} + +fn is_unit_abbreviation( + text: &str, + dot_index: usize, + unit_abbreviations: &BTreeSet, +) -> bool { + let mut start = dot_index; + while let Some((previous_start, ch)) = previous_char(text, start) { + if ch.is_alphanumeric() || ch == '.' { + start = previous_start; + continue; + } + break; + } + if start == dot_index { + return false; + } + text + .get(start..dot_index.saturating_add(1)) + .is_some_and(|token| unit_abbreviations.contains(&token.to_lowercase())) +} + +const fn is_address_trailing_trim(ch: char) -> bool { + ch.is_whitespace() + || matches!( + ch, + ',' + | ';' + | ':' + | '(' + | '[' + | '{' + | '"' + | '\'' + | '“' + | '”' + | '‘' + | '’' + | '′' + ) +} + +fn resolve_newline_boundary( + span_start: usize, + text: &str, + cluster: &SeedCluster, +) -> NewlineBoundaryResolution { + let mut newline_positions = text.match_indices('\n').map(|(index, _)| index); + let Some(relative_newline) = newline_positions.next() else { + return NewlineBoundaryResolution::Keep; + }; + if newline_positions.next().is_some() { + return NewlineBoundaryResolution::Drop; + } + + let newline_abs = span_start.saturating_add(relative_newline); + let mut street_above = false; + let mut street_below = false; + let mut destination_above = false; + let mut destination_below = false; + + for seed in &cluster.seeds { + let is_above = seed.end <= newline_abs; + let is_street = matches!(seed.kind, SeedType::StreetWord); + let is_destination = + matches!(seed.kind, SeedType::PostalCode | SeedType::City); + if is_street && is_above { + street_above 
= true; + } + if is_street && !is_above { + street_below = true; + } + if is_destination && is_above { + destination_above = true; + } + if is_destination && !is_above { + destination_below = true; + } + } + + if (street_above && destination_below) || (street_below && destination_above) + { + return NewlineBoundaryResolution::Keep; + } + if street_above && destination_above { + return NewlineBoundaryResolution::Trim { + relative_end: relative_newline, + }; + } + NewlineBoundaryResolution::Drop +} + +fn utf16_cap_at_char_boundary(text: &str, cap: usize) -> usize { + let mut units = 0usize; + for (index, ch) in text.char_indices() { + let width = ch.len_utf16(); + if units.saturating_add(width) > cap { + return index; + } + units = units.saturating_add(width); + } + text.len() +} + +fn floor_char_boundary(text: &str, mut byte: usize) -> usize { + byte = byte.min(text.len()); + while byte > 0 && !text.is_char_boundary(byte) { + byte = byte.saturating_sub(1); + } + byte +} + +fn ceil_char_boundary(text: &str, mut byte: usize) -> usize { + byte = byte.min(text.len()); + while byte < text.len() && !text.is_char_boundary(byte) { + byte = byte.saturating_add(1); + } + byte +} + +fn previous_char(text: &str, byte: usize) -> Option<(usize, char)> { + text.get(..byte)?.char_indices().next_back() +} + +fn next_char(text: &str, byte: usize) -> Option<(usize, char)> { + let suffix = text.get(byte..)?; + let (relative, ch) = suffix.char_indices().next()?; + Some((byte.saturating_add(relative), ch)) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn entity( + full_text: &str, + text: &str, + label: &str, + source: DetectionSource, + ) -> Result { + let Some(start) = full_text.find(text) else { + return Err(Error::InvalidStaticData { + field: "address_seed_test_fixture", + reason: String::from("fixture text should exist"), + }); + }; + let end = start.saturating_add(text.len()); + Ok(PipelineEntity::detected( + u32::try_from(start).map_err(|_| Error::InvalidStaticData { + 
field: "address_seed_test_fixture", + reason: String::from("fixture start should fit u32"), + })?, + u32::try_from(end).map_err(|_| Error::InvalidStaticData { + field: "address_seed_test_fixture", + reason: String::from("fixture end should fit u32"), + })?, + label, + text, + 0.9, + source, + )) + } + + #[test] + fn expands_compound_street_with_plain_postal_city() -> Result<()> { + let data = PreparedAddressSeedData::new(AddressSeedData { + boundary_words: vec![String::from("steuer-id")], + br_cep_cue_words: Vec::new(), + unit_abbreviations: Vec::new(), + })?; + let full_text = concat!( + "(2) Frau Karoline M. Brentano,\n", + " geboren am 09. Juli 1982,\n", + " wohnhaft Bismarckring 18, 65183 Wiesbaden,\n", + " Steuer-ID: 78 123 456 789", + ); + let existing = vec![ + entity( + full_text, + "Frau Karoline M. Brentano", + "person", + DetectionSource::DenyList, + )?, + entity( + full_text, + "09. Juli 1982", + "date of birth", + DetectionSource::Trigger, + )?, + entity( + full_text, + "5183 Wiesbaden", + "address", + DetectionSource::DenyList, + )?, + ]; + + let result = + data.process(&[], PatternSlice::default(), full_text, &existing)?; + + assert!( + result + .iter() + .any(|entity| entity.text == "Bismarckring 18, 65183 Wiesbaden"), + "address seed entities: {result:?}", + ); + Ok(()) + } +} diff --git a/crates/anonymize-core/src/anchored.rs b/crates/anonymize-core/src/anchored.rs new file mode 100644 index 00000000..b249b499 --- /dev/null +++ b/crates/anonymize-core/src/anchored.rs @@ -0,0 +1,162 @@ +use crate::resolution::PipelineEntity; +use crate::search::{SearchIndex, SearchOptions, SearchPattern}; +use crate::types::{Result, SearchMatch}; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) struct AnchorSpan { + pub start: usize, + pub end: usize, +} + +pub(crate) struct AnchorTerm { + text: String, + case_insensitive: bool, + whole_words: bool, +} + +impl AnchorTerm { + pub(crate) const fn new( + text: String, + case_insensitive: bool, + 
whole_words: bool, + ) -> Self { + Self { + text, + case_insensitive, + whole_words, + } + } + + pub(crate) const fn word_case_insensitive(text: String) -> Self { + Self { + text, + case_insensitive: true, + whole_words: true, + } + } + + pub(crate) const fn word_case_sensitive(text: String) -> Self { + Self { + text, + case_insensitive: false, + whole_words: true, + } + } + + pub(crate) const fn symbol(text: String) -> Self { + Self { + text, + case_insensitive: false, + whole_words: false, + } + } +} + +pub(crate) trait AnchoredRule { + fn anchor_terms(&self) -> Vec; + + fn extract( + &self, + full_text: &str, + anchor: AnchorSpan, + ) -> Result>; +} + +pub(crate) struct AnchoredExtractor { + search: SearchIndex, + rule: R, +} + +impl AnchoredExtractor { + pub(crate) fn new(rule: R) -> Result> { + let anchors = rule.anchor_terms(); + if anchors.is_empty() { + return Ok(None); + } + + Ok(Some(Self { + search: SearchIndex::new( + anchors + .into_iter() + .map(|anchor| SearchPattern::LiteralWithOptions { + pattern: anchor.text, + case_insensitive: Some(anchor.case_insensitive), + whole_words: Some(anchor.whole_words), + }) + .collect(), + SearchOptions::default(), + )?, + rule, + })) + } + + pub(crate) fn extract(&self, full_text: &str) -> Result> { + let mut entities = Vec::new(); + for found in self.search.find_iter(full_text)? 
{ + let anchor = anchor_span(&found); + entities.extend(self.rule.extract(full_text, anchor)?); + } + Ok(select_anchored_entities(entities)) + } + + pub(crate) const fn rule(&self) -> &R { + &self.rule + } +} + +fn anchor_span(found: &SearchMatch) -> AnchorSpan { + AnchorSpan { + start: usize::try_from(found.start()).unwrap_or(usize::MAX), + end: usize::try_from(found.end()).unwrap_or(usize::MAX), + } +} + +fn select_anchored_entities( + mut entities: Vec, +) -> Vec { + if entities.len() < 2 { + return entities; + } + + entities.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| right.end.cmp(&left.end)) + .then_with(|| left.label.cmp(&right.label)) + }); + + let mut selected = Vec::new(); + for entity in entities { + if selected.iter().any(|existing| { + same_bucket(existing, &entity) && contains(existing, &entity) + }) { + continue; + } + + selected.retain(|existing| { + !same_bucket(&entity, existing) || !contains(&entity, existing) + }); + selected.push(entity); + } + + selected.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| left.end.cmp(&right.end)) + .then_with(|| left.label.cmp(&right.label)) + }); + selected +} + +fn same_bucket(left: &PipelineEntity, right: &PipelineEntity) -> bool { + left.label == right.label + && left.source == right.source + && left.source_detail == right.source_detail + && left.kind == right.kind +} + +const fn contains(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { + outer.start <= inner.start && outer.end >= inner.end +} diff --git a/crates/anonymize-core/src/artifact_bytes.rs b/crates/anonymize-core/src/artifact_bytes.rs new file mode 100644 index 00000000..350702dd --- /dev/null +++ b/crates/anonymize-core/src/artifact_bytes.rs @@ -0,0 +1,122 @@ +use crate::types::{Error, Result}; + +pub(crate) struct ArtifactWriter { + bytes: Vec, +} + +impl ArtifactWriter { + pub(crate) fn new(header: [u8; 8], version: u32) -> Self { + let mut bytes = Vec::new(); + 
bytes.extend_from_slice(&header); + write_u32(&mut bytes, version); + Self { bytes } + } + + pub(crate) fn write_len( + &mut self, + len: usize, + field: &'static str, + ) -> Result<()> { + write_u32(&mut self.bytes, checked_len_u32(len, field)?); + Ok(()) + } + + pub(crate) fn write_len_prefixed_bytes( + &mut self, + field: &'static str, + bytes: &[u8], + ) -> Result<()> { + self.write_len(bytes.len(), field)?; + self.bytes.extend_from_slice(bytes); + Ok(()) + } + + pub(crate) fn into_bytes(self) -> Vec { + self.bytes + } +} + +pub(crate) struct ArtifactReader<'a> { + bytes: &'a [u8], + offset: usize, + field: &'static str, +} + +impl<'a> ArtifactReader<'a> { + pub(crate) fn new( + bytes: &'a [u8], + header: [u8; 8], + version: u32, + field: &'static str, + ) -> Result { + let mut reader = Self { + bytes, + offset: 0, + field, + }; + let actual_header = reader.read_bytes(header.len())?; + if actual_header != header { + return Err(invalid_artifact(field, "unexpected header")); + } + let actual_version = reader.read_u32()?; + if actual_version != version { + return Err(invalid_artifact(field, "unsupported version")); + } + Ok(reader) + } + + pub(crate) fn read_usize(&mut self) -> Result { + usize::try_from(self.read_u32()?) 
+ .map_err(|_| invalid_artifact(self.field, "length is not addressable")) + } + + pub(crate) fn read_len_prefixed_bytes(&mut self) -> Result<&'a [u8]> { + let len = self.read_usize()?; + self.read_bytes(len) + } + + pub(crate) fn finish(&self) -> Result<()> { + if self.offset == self.bytes.len() { + return Ok(()); + } + Err(invalid_artifact(self.field, "trailing data")) + } + + fn read_u32(&mut self) -> Result { + let bytes = self.read_bytes(4)?; + let array = <[u8; 4]>::try_from(bytes) + .map_err(|_| invalid_artifact(self.field, "malformed u32"))?; + Ok(u32::from_le_bytes(array)) + } + + fn read_bytes(&mut self, len: usize) -> Result<&'a [u8]> { + let end = self + .offset + .checked_add(len) + .ok_or_else(|| invalid_artifact(self.field, "length overflow"))?; + let bytes = self + .bytes + .get(self.offset..end) + .ok_or_else(|| invalid_artifact(self.field, "truncated data"))?; + self.offset = end; + Ok(bytes) + } +} + +fn write_u32(bytes: &mut Vec, value: u32) { + bytes.extend_from_slice(&value.to_le_bytes()); +} + +fn checked_len_u32(len: usize, field: &'static str) -> Result { + u32::try_from(len).map_err(|_| Error::InvalidStaticData { + field, + reason: format!("length exceeds u32: {len}"), + }) +} + +fn invalid_artifact(field: &'static str, reason: impl Into) -> Error { + Error::InvalidStaticData { + field, + reason: reason.into(), + } +} diff --git a/crates/anonymize-core/src/byte_offsets.rs b/crates/anonymize-core/src/byte_offsets.rs new file mode 100644 index 00000000..492c040e --- /dev/null +++ b/crates/anonymize-core/src/byte_offsets.rs @@ -0,0 +1,126 @@ +use crate::types::{Error, Result}; + +pub(crate) struct ByteOffsets<'a> { + text: &'a str, +} + +impl<'a> ByteOffsets<'a> { + pub(crate) const fn new(text: &'a str) -> Self { + Self { text } + } + + pub(crate) fn len(&self) -> Result { + u32::try_from(self.text.len()) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }) + } + + pub(crate) fn validate_offset(&self, offset: u32) -> Result { + 
let index = usize::try_from(offset) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset })?; + if index > self.text.len() { + return Err(Error::ByteOffsetOutOfBounds { offset }); + } + if !self.text.is_char_boundary(index) { + return Err(Error::ByteOffsetInsideCodepoint { offset }); + } + Ok(index) + } + + pub(crate) fn floor_offset(&self, offset: u32) -> Result { + let mut index = usize::try_from(offset) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset })?; + if index > self.text.len() { + index = self.text.len(); + } + while index > 0 && !self.text.is_char_boundary(index) { + index = index.saturating_sub(1); + } + u32::try_from(index) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }) + } + + pub(crate) fn slice(&self, start: u32, end: u32) -> Result { + if start > end { + return Err(Error::InvalidSpan { start, end }); + } + + let start_byte = self.validate_offset(start)?; + let end_byte = self.validate_offset(end)?; + + Ok( + self + .text + .get(start_byte..end_byte) + .ok_or(Error::InvalidSpan { start, end })? + .to_owned(), + ) + } + + pub(crate) fn utf16_units_between( + &self, + start: u32, + end: u32, + ) -> Result { + if start > end { + return Err(Error::InvalidSpan { start, end }); + } + + let start_byte = self.validate_offset(start)?; + let end_byte = self.validate_offset(end)?; + let units = self + .text + .get(start_byte..end_byte) + .ok_or(Error::InvalidSpan { start, end })? 
+ .chars() + .map(char::len_utf16) + .sum::(); + u32::try_from(units) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }) + } + + pub(crate) fn offset_after_utf16_units( + &self, + start: u32, + max_units: u32, + ) -> Result { + let start_byte = self.validate_offset(start)?; + let mut units = 0_u32; + let tail = self.text.get(start_byte..).ok_or(Error::InvalidSpan { + start, + end: self.len()?, + })?; + for (relative, ch) in tail.char_indices() { + let width = u32::try_from(ch.len_utf16()).unwrap_or(u32::MAX); + if units.saturating_add(width) > max_units { + let offset = start_byte.saturating_add(relative); + return u32::try_from(offset) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }); + } + units = units.saturating_add(width); + } + self.len() + } + + pub(crate) fn offset_before_utf16_units( + &self, + end: u32, + max_units: u32, + ) -> Result { + let end_byte = self.validate_offset(end)?; + let prefix = self + .text + .get(..end_byte) + .ok_or(Error::InvalidSpan { start: 0, end })?; + let mut units = 0_u32; + for (byte, ch) in prefix.char_indices().rev() { + let width = u32::try_from(ch.len_utf16()).unwrap_or(u32::MAX); + if units.saturating_add(width) > max_units { + let offset = byte.saturating_add(ch.len_utf8()); + return u32::try_from(offset) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }); + } + units = units.saturating_add(width); + } + Ok(0) + } +} diff --git a/crates/anonymize-core/src/coreference.rs b/crates/anonymize-core/src/coreference.rs new file mode 100644 index 00000000..0f4fb6d5 --- /dev/null +++ b/crates/anonymize-core/src/coreference.rs @@ -0,0 +1,595 @@ +use regex::{Regex, RegexBuilder}; +use std::collections::{BTreeMap, BTreeSet}; + +use crate::byte_offsets::ByteOffsets; +use crate::resolution::{PipelineEntity, SourceDetail}; +use crate::types::{Error, Result}; + +const SEARCH_WINDOW: u32 = 200; +const COREFERENCE_SCORE: f64 = 0.95; +const ORG_PROPAGATION_SCORE: f64 = 0.9; +const 
ORG_DETERMINER_LOOKBACK: usize = 40; + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct CoreferenceData { + #[serde(default)] + pub definition_patterns: Vec, + #[serde(default)] + pub role_stop_terms: Vec, + #[serde(default)] + pub legal_form_aliases: Vec, + #[serde(default)] + pub organization_suffixes: Vec, + #[serde(default)] + pub organization_determiners: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct CoreferencePatternData { + pub pattern: String, + #[serde(default)] + pub flags: String, +} + +pub(crate) struct PreparedCoreferenceData { + definition_patterns: Vec, + role_stop_terms: BTreeSet, + legal_form_aliases: BTreeSet, + legal_form_suffixes: Vec, + org_determiner: Option, +} + +struct DefinedTerm { + alias: String, + label: String, + source_text: String, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct OrgSeed { + base_name: String, + label: String, + source_text: String, +} + +impl PreparedCoreferenceData { + pub(crate) fn new(data: CoreferenceData) -> Result { + let mut definition_patterns = + Vec::with_capacity(data.definition_patterns.len()); + for pattern in &data.definition_patterns { + definition_patterns.push(compile_definition_pattern(pattern)?); + } + + let mut legal_form_suffixes = if data.organization_suffixes.is_empty() { + data.legal_form_aliases.clone() + } else { + data.organization_suffixes.clone() + }; + legal_form_suffixes.sort_by_key(|suffix| std::cmp::Reverse(suffix.len())); + + Ok(Self { + definition_patterns, + role_stop_terms: lower_set(data.role_stop_terms), + legal_form_aliases: data + .legal_form_aliases + .into_iter() + .filter_map(|alias| normalized_legal_form_alias(&alias)) + .collect(), + legal_form_suffixes, + org_determiner: compile_org_determiner(&data.organization_determiners)?, + }) + } + + pub(crate) fn process( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + threshold: f64, + ) -> 
Result> { + let mut results = self.propagate_organization_names( + full_text, + existing_entities, + threshold, + )?; + + if !self.definition_patterns.is_empty() { + let terms = if results.is_empty() { + self.extract_defined_terms(full_text, existing_entities)? + } else { + let mut definition_entities = existing_entities.to_vec(); + definition_entities.extend(results.iter().cloned()); + self.extract_defined_terms(full_text, &definition_entities)? + }; + results.extend(Self::find_alias_spans(full_text, &terms)?); + } + + Ok(results) + } + + fn extract_defined_terms( + &self, + full_text: &str, + entities: &[PipelineEntity], + ) -> Result> { + let offsets = ByteOffsets::new(full_text); + let mut sorted = entities + .iter() + .filter(|entity| !caller_owned(entity)) + .collect::>(); + sorted.sort_by_key(|entity| entity.start); + + let mut terms = Vec::new(); + let mut seen = BTreeSet::new(); + + for pattern in &self.definition_patterns { + for captures in pattern.captures_iter(full_text) { + let Some(alias_match) = captures.get(1) else { + continue; + }; + let alias = alias_match.as_str().trim(); + if alias.chars().count() < 2 { + continue; + } + if self.role_stop_terms.contains(&alias.to_lowercase()) { + continue; + } + if normalized_legal_form_alias(alias).is_some_and(|normalized| { + self.legal_form_aliases.contains(&normalized) + }) { + continue; + } + + let Some(full_match) = captures.get(0) else { + continue; + }; + let definition_start = + usize_to_u32("coreference.definition_start", full_match.start())?; + let Some(source) = + nearest_preceding_source(&sorted, &offsets, definition_start)? 
+ else { + continue; + }; + let gap = offsets.slice(source.end, definition_start)?; + if has_clause_boundary(&gap) { + continue; + } + if !has_entity_similarity(alias, &source.text) { + continue; + } + + let key = format!("{}::{}", alias.to_lowercase(), source.label); + if !seen.insert(key) { + continue; + } + + terms.push(DefinedTerm { + alias: alias.to_owned(), + label: source.label.clone(), + source_text: source.text.clone(), + }); + } + } + + Ok(terms) + } + + fn find_alias_spans( + full_text: &str, + terms: &[DefinedTerm], + ) -> Result> { + let mut results = Vec::new(); + + for term in terms { + let mut search_from = 0; + while search_from < full_text.len() { + let Some(relative) = full_text + .get(search_from..) + .and_then(|tail| tail.find(&term.alias)) + else { + break; + }; + let start = search_from.saturating_add(relative); + let end = start.saturating_add(term.alias.len()); + if !is_word_boundary(full_text, start, end) { + search_from = next_char_boundary(full_text, start); + continue; + } + + let start_u32 = usize_to_u32("coreference.alias_start", start)?; + let end_u32 = usize_to_u32("coreference.alias_end", end)?; + results.push(PipelineEntity::coreference( + start_u32, + end_u32, + term.label.clone(), + term.alias.clone(), + COREFERENCE_SCORE, + term.source_text.clone(), + )); + search_from = end; + } + } + + Ok(results) + } + + fn propagate_organization_names( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + threshold: f64, + ) -> Result> { + if threshold > ORG_PROPAGATION_SCORE || self.legal_form_suffixes.is_empty() + { + return Ok(Vec::new()); + } + + let seeds = self.organization_seeds(existing_entities); + if seeds.is_empty() { + return Ok(Vec::new()); + } + + let mut covered = existing_entities + .iter() + .map(|entity| (entity.start, entity.end)) + .collect::>(); + let mut results = Vec::new(); + + for seed in seeds { + let mut search_from = 0usize; + while search_from < full_text.len() { + let Some(relative) = full_text 
+ .get(search_from..) + .and_then(|tail| tail.find(&seed.base_name)) + else { + break; + }; + let start = search_from.saturating_add(relative); + let end = start.saturating_add(seed.base_name.len()); + if !is_word_boundary(full_text, start, end) { + search_from = next_char_boundary(full_text, start); + continue; + } + + let span_start = + self.determiner_start(full_text, start).unwrap_or(start); + let start_u32 = usize_to_u32("coreference.org_start", span_start)?; + let end_u32 = usize_to_u32("coreference.org_end", end)?; + if !span_overlaps(&covered, start_u32, end_u32) { + results.push(PipelineEntity::coreference( + start_u32, + end_u32, + seed.label.clone(), + full_text.get(span_start..end).unwrap_or_default(), + ORG_PROPAGATION_SCORE, + seed.source_text.clone(), + )); + covered.push((start_u32, end_u32)); + } + + search_from = end; + } + } + + Ok(results) + } + + fn organization_seeds( + &self, + existing_entities: &[PipelineEntity], + ) -> Vec { + let mut seed_by_base = BTreeMap::::new(); + + for entity in existing_entities { + if entity.label != "organization" || caller_owned(entity) { + continue; + } + let Some(base) = self.organization_base_name(&entity.text) else { + continue; + }; + let entry = seed_by_base.entry(base.clone()).or_insert_with(|| OrgSeed { + base_name: base.clone(), + label: entity.label.clone(), + source_text: entity.text.clone(), + }); + if entry.source_text != entity.text { + entry.source_text = base; + } + } + + seed_by_base.into_values().collect() + } + + fn organization_base_name(&self, text: &str) -> Option { + for suffix in &self.legal_form_suffixes { + let Some(base) = text.strip_suffix(suffix) else { + continue; + }; + let base = + base.trim_end_matches(|ch: char| ch == ',' || ch.is_whitespace()); + let base = base.trim(); + if text_units(base) >= 3 { + return Some(base.to_owned()); + } + } + None + } + + fn determiner_start( + &self, + full_text: &str, + match_start: usize, + ) -> Option { + let lookback_start = + 
offset_before_text_units(full_text, match_start, ORG_DETERMINER_LOOKBACK); + let lookback = full_text.get(lookback_start..match_start)?; + let captures = self.org_determiner.as_ref()?.captures(lookback)?; + let determiner = captures.get(1)?; + let start = lookback_start.saturating_add(determiner.start()); + previous_char(full_text, start) + .is_none_or(|ch| !is_word_char(ch)) + .then_some(start) + } +} + +fn compile_org_determiner(patterns: &[String]) -> Result> { + if patterns.is_empty() { + return Ok(None); + } + + let pattern = format!("({})\\s+$", patterns.join("|")); + RegexBuilder::new(&pattern) + .case_insensitive(true) + .unicode(true) + .build() + .map(Some) + .map_err(|error| Error::InvalidStaticData { + field: "coreference_data.org_determiner", + reason: error.to_string(), + }) +} + +fn compile_definition_pattern(data: &CoreferencePatternData) -> Result { + let mut builder = RegexBuilder::new(&data.pattern); + for flag in data.flags.chars() { + match flag { + 'g' | 'u' => {} + 'i' => { + builder.case_insensitive(true); + } + 'm' => { + builder.multi_line(true); + } + 's' => { + builder.dot_matches_new_line(true); + } + _ => { + return Err(Error::InvalidStaticData { + field: "coreference_data.definition_patterns", + reason: format!("unsupported regex flag '{flag}'"), + }); + } + } + } + builder.build().map_err(|error| Error::InvalidStaticData { + field: "coreference_data.definition_patterns", + reason: error.to_string(), + }) +} + +fn nearest_preceding_source<'a>( + sorted: &[&'a PipelineEntity], + offsets: &ByteOffsets<'_>, + definition_start: u32, +) -> Result> { + for entity in sorted.iter().rev() { + if entity.end > definition_start { + continue; + } + if offsets.utf16_units_between(entity.end, definition_start)? 
+ > SEARCH_WINDOW + { + break; + } + if matches!(entity.label.as_str(), "person" | "organization") { + return Ok(Some(*entity)); + } + } + Ok(None) +} + +fn has_clause_boundary(gap: &str) -> bool { + if gap.contains(';') { + return true; + } + + for (index, ch) in gap.char_indices() { + if ch != '.' { + continue; + } + let Some(after_dot) = gap.get(index.saturating_add(ch.len_utf8())..) else { + return true; + }; + let mut tail = after_dot.chars(); + let next = loop { + let Some(candidate) = tail.next() else { + return true; + }; + if candidate.is_whitespace() + || matches!(candidate, '"' | '\'' | '„' | '‚' | '(') + { + continue; + } + break candidate; + }; + if next.is_uppercase() { + return true; + } + } + + false +} + +fn has_entity_similarity(alias: &str, entity_text: &str) -> bool { + let alias_lower = alias.to_lowercase(); + let entity_lower = entity_text.to_lowercase(); + + if alias_lower.chars().count() >= 3 && entity_lower.contains(&alias_lower) { + return true; + } + if entity_lower.chars().count() >= 3 && alias_lower.contains(&entity_lower) { + return true; + } + + let alias_words = split_similarity_words(&alias_lower); + let entity_words = split_similarity_words(&entity_lower); + let entity_word_set = entity_words.iter().collect::>(); + if alias_words + .iter() + .any(|word| entity_word_set.contains(word)) + { + return true; + } + + if !is_all_uppercase(alias) || alias.chars().count() < 2 { + return false; + } + let alias_len = alias.chars().count(); + if alias_len > entity_words.len() { + return false; + } + for start in 0..=entity_words.len().saturating_sub(alias_len) { + let initials = entity_words + .iter() + .skip(start) + .take(alias_len) + .filter_map(|word| word.chars().next()) + .collect::(); + if initials == alias_lower { + return true; + } + } + + false +} + +fn split_similarity_words(text: &str) -> Vec { + text + .split(|ch: char| { + matches!( + ch, + ' ' + | '\t' + | '\n' + | '\r' + | '.' 
+ | ',' + | ';' + | ':' + | '\'' + | '"' + | '(' + | ')' + | '/' + | '-' + ) + }) + .filter(|word| word.chars().count() >= 2) + .map(ToOwned::to_owned) + .collect() +} + +fn is_all_uppercase(text: &str) -> bool { + text.chars().all(char::is_uppercase) +} + +fn normalized_legal_form_alias(alias: &str) -> Option { + let normalized = alias.split_whitespace().collect::().to_lowercase(); + (!normalized.is_empty()).then_some(normalized) +} + +fn is_word_boundary(full_text: &str, start: usize, end: usize) -> bool { + previous_char(full_text, start).is_none_or(|ch| !is_word_char(ch)) + && next_char(full_text, end).is_none_or(|ch| !is_word_char(ch)) +} + +fn previous_char(full_text: &str, index: usize) -> Option { + full_text.get(..index)?.chars().next_back() +} + +fn next_char(full_text: &str, index: usize) -> Option { + full_text.get(index..)?.chars().next() +} + +fn next_char_boundary(full_text: &str, index: usize) -> usize { + let Some(ch) = next_char(full_text, index) else { + return full_text.len(); + }; + index.saturating_add(ch.len_utf8()) +} + +fn is_word_char(ch: char) -> bool { + ch.is_alphanumeric() || is_combining_mark(ch) +} + +fn span_overlaps(covered: &[(u32, u32)], start: u32, end: u32) -> bool { + covered.iter().any(|(covered_start, covered_end)| { + start < *covered_end && end > *covered_start + }) +} + +fn text_units(text: &str) -> usize { + text.chars().map(char::len_utf16).sum() +} + +fn offset_before_text_units( + full_text: &str, + end: usize, + max_units: usize, +) -> usize { + let Some(prefix) = full_text.get(..end) else { + return 0; + }; + let mut units = 0usize; + for (index, ch) in prefix.char_indices().rev() { + let width = ch.len_utf16(); + if units.saturating_add(width) > max_units { + return index.saturating_add(ch.len_utf8()); + } + units = units.saturating_add(width); + } + 0 +} + +const fn is_combining_mark(ch: char) -> bool { + matches!( + ch, + '\u{0300}'..='\u{036f}' + | '\u{1ab0}'..='\u{1aff}' + | '\u{1dc0}'..='\u{1dff}' + | 
'\u{20d0}'..='\u{20ff}' + | '\u{fe20}'..='\u{fe2f}' + ) +} + +const fn caller_owned(entity: &PipelineEntity) -> bool { + matches!( + entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) +} + +fn lower_set(values: Vec) -> BTreeSet { + values + .into_iter() + .map(|value| value.to_lowercase()) + .collect() +} + +fn usize_to_u32(field: &'static str, value: usize) -> Result { + u32::try_from(value).map_err(|_| Error::InvalidStaticData { + field, + reason: String::from("offset exceeds u32 range"), + }) +} diff --git a/crates/anonymize-core/src/dates.rs b/crates/anonymize-core/src/dates.rs new file mode 100644 index 00000000..d961a3d7 --- /dev/null +++ b/crates/anonymize-core/src/dates.rs @@ -0,0 +1,473 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::anchored::{ + AnchorSpan, AnchorTerm, AnchoredExtractor, AnchoredRule, +}; +use crate::resolution::{DetectionSource, PipelineEntity}; +use crate::types::Result; + +const DATE_LABEL: &str = "date"; +const DATE_SCORE: f64 = 1.0; + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct DateData { + pub month_names_by_language: BTreeMap>, + pub year_words_by_language: BTreeMap>, +} + +pub(crate) struct PreparedDateData { + extractor: AnchoredExtractor, +} + +impl PreparedDateData { + pub(crate) fn new(data: &DateData) -> Result> { + AnchoredExtractor::new(DateRule::new(data)) + .map(|extractor| extractor.map(|extractor| Self { extractor })) + } + + pub(crate) fn process(&self, full_text: &str) -> Result> { + self.extractor.extract(full_text) + } +} + +struct DateRule { + month_names: BTreeSet, + year_words: BTreeSet, +} + +impl DateRule { + fn new(data: &DateData) -> Self { + Self { + month_names: unique_word_set(&data.month_names_by_language, 3), + year_words: unique_word_set(&data.year_words_by_language, 2), + } + } +} + +impl AnchoredRule for DateRule { + fn anchor_terms(&self) -> Vec { + self + .month_names + .iter() + 
.cloned() + .map(AnchorTerm::word_case_insensitive) + .chain( + self + .year_words + .iter() + .cloned() + .map(AnchorTerm::word_case_insensitive), + ) + .collect() + } + + fn extract( + &self, + full_text: &str, + anchor: AnchorSpan, + ) -> Result> { + let span = word_span(full_text, anchor); + let clean = str_slice(full_text, span.start, span.end) + .unwrap_or_default() + .trim_end_matches('.') + .to_lowercase(); + let mut spans = Vec::new(); + if self.month_names.contains(&clean) { + spans.extend( + date_spans_for_month(full_text, span.start, span.end) + .into_iter() + .map(|(start, end)| (start, end, DetectionSource::Regex)), + ); + } + if self.year_words.contains(&clean) + && let Some(year) = year_after_word_span(full_text, span.end) + { + spans.push((year.0, year.1, DetectionSource::Trigger)); + } + + Ok( + spans + .into_iter() + .filter_map(|(start, end, source)| { + date_entity(full_text, start, end, source) + }) + .collect(), + ) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct Span { + start: usize, + end: usize, +} + +fn unique_word_set( + values_by_language: &BTreeMap>, + min_chars: usize, +) -> BTreeSet { + let mut seen = BTreeSet::new(); + for names in values_by_language.values() { + for name in names { + let clean = name.trim().trim_end_matches('.').to_lowercase(); + if clean.chars().count() >= min_chars { + seen.insert(clean); + } + } + } + seen +} + +fn word_span(full_text: &str, anchor: AnchorSpan) -> Span { + let mut end = anchor.end.min(full_text.len()); + if starts_with_at(full_text, end, ".") { + end = end.saturating_add(1); + } + Span { + start: anchor.start, + end, + } +} + +fn year_after_word_span(text: &str, word_end: usize) -> Option<(usize, usize)> { + let after_word = skip_horizontal_ws(text, word_end); + parse_year_forward(text, after_word) +} + +fn date_spans_for_month( + full_text: &str, + month_start: usize, + month_end: usize, +) -> Vec<(usize, usize)> { + let mut spans = Vec::new(); + + if let Some(span) = 
day_month_year_span(full_text, month_start, month_end) { + spans.push(span); + } + if let Some(span) = ordinal_day_month_span(full_text, month_start, month_end) + { + spans.push(span); + } + if let Some(span) = de_day_month_year_span(full_text, month_start, month_end) + { + spans.push(span); + } + if let Some(span) = month_day_year_span(full_text, month_start, month_end) { + spans.push(span); + } + if let Some(span) = month_year_span(full_text, month_start, month_end) { + spans.push(span); + } + if let Some(span) = year_month_day_span(full_text, month_start, month_end) { + spans.push(span); + } + + spans +} + +fn date_entity( + full_text: &str, + start: usize, + end: usize, + source: DetectionSource, +) -> Option { + let start_u32 = u32::try_from(start).unwrap_or(u32::MAX); + let end_u32 = u32::try_from(end).unwrap_or(u32::MAX); + Some(PipelineEntity::detected( + start_u32, + end_u32, + DATE_LABEL, + str_slice(full_text, start, end)?.to_owned(), + DATE_SCORE, + source, + )) +} + +fn day_month_year_span( + text: &str, + month_start: usize, + month_end: usize, +) -> Option<(usize, usize)> { + let day = day_before_month(text, month_start)?; + let after_month = skip_horizontal_ws(text, month_end); + let year = parse_year_forward(text, after_month)?; + let end = parse_time_suffix(text, year.1).unwrap_or(year.1); + Some((day.0, end)) +} + +fn ordinal_day_month_span( + text: &str, + month_start: usize, + month_end: usize, +) -> Option<(usize, usize)> { + let day = ordinal_day_before_month(text, month_start)?; + let after_month = skip_horizontal_ws(text, month_end); + let end = parse_year_forward(text, after_month).map_or(month_end, |year| { + parse_time_suffix(text, year.1).unwrap_or(year.1) + }); + Some((day.0, end)) +} + +fn de_day_month_year_span( + text: &str, + month_start: usize, + month_end: usize, +) -> Option<(usize, usize)> { + let day = de_day_before_month(text, month_start)?; + let after_month = skip_horizontal_ws(text, month_end); + let after_de = 
parse_de_prefix(text, after_month).unwrap_or(after_month); + let year = parse_year_forward(text, after_de)?; + Some((day.0, year.1)) +} + +fn month_day_year_span( + text: &str, + month_start: usize, + month_end: usize, +) -> Option<(usize, usize)> { + let after_month = skip_horizontal_ws(text, month_end); + let day = parse_digits_forward(text, after_month, 1, 2)?; + let after_day = skip_date_year_separator(text, day.1); + if let Some(year) = parse_year_forward(text, after_day) { + return Some((month_start, year.1)); + } + right_date_boundary(text, day.1).then_some((month_start, day.1)) +} + +fn month_year_span( + text: &str, + month_start: usize, + month_end: usize, +) -> Option<(usize, usize)> { + let after_month = skip_horizontal_ws(text, month_end); + let year = parse_year_forward(text, after_month)?; + Some((month_start, year.1)) +} + +fn year_month_day_span( + text: &str, + month_start: usize, + month_end: usize, +) -> Option<(usize, usize)> { + let before_month = skip_horizontal_ws_backward(text, month_start); + if !ends_with_before(text, before_month, ".") { + return None; + } + let year_end = before_month.saturating_sub(1); + let year = parse_digits_backward(text, year_end, 4, 4)?; + if !left_date_boundary(text, year.0) { + return None; + } + + let after_month = skip_horizontal_ws(text, month_end); + let day = parse_digits_forward(text, after_month, 1, 2)?; + let end = if starts_with_at(text, day.1, ".") { + day.1.saturating_add(1) + } else { + day.1 + }; + Some((year.0, end)) +} + +fn day_before_month(text: &str, month_start: usize) -> Option<(usize, usize)> { + let mut end = skip_horizontal_ws_backward(text, month_start); + if end == month_start { + return None; + } + if ends_with_before(text, end, ".") { + end = end.saturating_sub(1); + } + let day = parse_digits_backward(text, end, 1, 2)?; + left_date_boundary(text, day.0).then_some(day) +} + +fn ordinal_day_before_month( + text: &str, + month_start: usize, +) -> Option<(usize, usize)> { + let end = 
skip_horizontal_ws_backward(text, month_start); + if end == month_start { + return None; + } + for suffix in ["st", "nd", "rd", "th"] { + if !ends_with_before_ascii_case_insensitive(text, end, suffix) { + continue; + } + let day_end = end.saturating_sub(suffix.len()); + let day = parse_digits_backward(text, day_end, 1, 2)?; + if left_date_boundary(text, day.0) { + return Some((day.0, end)); + } + } + None +} + +fn de_day_before_month( + text: &str, + month_start: usize, +) -> Option<(usize, usize)> { + let end = skip_horizontal_ws_backward(text, month_start); + let de_start = end.checked_sub(2)?; + if !str_slice(text, de_start, end)?.eq_ignore_ascii_case("de") { + return None; + } + let day_end = skip_horizontal_ws_backward(text, de_start); + let day = parse_digits_backward(text, day_end, 1, 2)?; + left_date_boundary(text, day.0).then_some((day.0, end)) +} + +fn parse_de_prefix(text: &str, index: usize) -> Option { + let end = index.saturating_add(2); + if !str_slice(text, index, end)?.eq_ignore_ascii_case("de") { + return None; + } + Some(skip_horizontal_ws(text, end)) +} + +fn parse_year_forward(text: &str, index: usize) -> Option<(usize, usize)> { + let year = parse_digits_forward(text, index, 4, 4)?; + right_date_boundary(text, year.1).then_some(year) +} + +fn parse_digits_forward( + text: &str, + index: usize, + min: usize, + max: usize, +) -> Option<(usize, usize)> { + let mut end = index; + let mut count = 0usize; + for ch in str_tail(text, index)?.chars() { + if !ch.is_ascii_digit() || count == max { + break; + } + end = end.saturating_add(ch.len_utf8()); + count = count.saturating_add(1); + } + (count >= min).then_some((index, end)) +} + +fn parse_digits_backward( + text: &str, + index: usize, + min: usize, + max: usize, +) -> Option<(usize, usize)> { + let mut start = index; + let mut count = 0usize; + for (char_start, ch) in str_head(text, index)?.char_indices().rev() { + if !ch.is_ascii_digit() || count == max { + break; + } + start = char_start; + 
count = count.saturating_add(1); + } + (count >= min).then_some((start, index)) +} + +fn parse_time_suffix(text: &str, index: usize) -> Option { + let start = skip_horizontal_ws(text, index); + let hour = parse_digits_forward(text, start, 1, 2)?; + if !starts_with_at(text, hour.1, ":") { + return None; + } + let minute = parse_digits_forward(text, hour.1.saturating_add(1), 2, 2)?; + if !starts_with_at(text, minute.1, ":") { + return Some(minute.1); + } + parse_digits_forward(text, minute.1.saturating_add(1), 2, 2) + .map(|second| second.1) +} + +fn skip_date_year_separator(text: &str, index: usize) -> usize { + if starts_with_at(text, index, ",") { + return skip_any_ws(text, index.saturating_add(1)); + } + skip_horizontal_ws(text, index) +} + +fn skip_any_ws(text: &str, mut index: usize) -> usize { + while let Some(ch) = + str_tail(text, index).and_then(|value| value.chars().next()) + { + if !ch.is_whitespace() { + break; + } + index = index.saturating_add(ch.len_utf8()); + } + index +} + +fn skip_horizontal_ws(text: &str, mut index: usize) -> usize { + while let Some(ch) = + str_tail(text, index).and_then(|value| value.chars().next()) + { + if ch == '\n' || ch == '\r' || !ch.is_whitespace() { + break; + } + index = index.saturating_add(ch.len_utf8()); + } + index +} + +fn skip_horizontal_ws_backward(text: &str, mut index: usize) -> usize { + while let Some((char_start, ch)) = + str_head(text, index).and_then(|value| value.char_indices().next_back()) + { + if ch == '\n' || ch == '\r' || !ch.is_whitespace() { + break; + } + index = char_start; + } + index +} + +fn left_date_boundary(text: &str, index: usize) -> bool { + str_head(text, index) + .and_then(|value| value.chars().next_back()) + .is_none_or(|ch| !is_identifier_char(ch)) +} + +fn right_date_boundary(text: &str, index: usize) -> bool { + str_tail(text, index) + .and_then(|value| value.chars().next()) + .is_none_or(|ch| ch.is_whitespace() || ".,;!?)]".contains(ch)) +} + +fn is_identifier_char(ch: char) -> 
bool { + ch == '_' || ch.is_alphanumeric() +} + +fn starts_with_at(text: &str, index: usize, needle: &str) -> bool { + str_tail(text, index).is_some_and(|value| value.starts_with(needle)) +} + +fn ends_with_before(text: &str, index: usize, needle: &str) -> bool { + str_head(text, index).is_some_and(|value| value.ends_with(needle)) +} + +fn ends_with_before_ascii_case_insensitive( + text: &str, + index: usize, + needle: &str, +) -> bool { + let Some(start) = index.checked_sub(needle.len()) else { + return false; + }; + str_slice(text, start, index) + .is_some_and(|value| value.eq_ignore_ascii_case(needle)) +} + +fn str_head(text: &str, index: usize) -> Option<&str> { + text.get(..index) +} + +fn str_tail(text: &str, index: usize) -> Option<&str> { + text.get(index..) +} + +fn str_slice(text: &str, start: usize, end: usize) -> Option<&str> { + text.get(start..end) +} diff --git a/crates/anonymize-core/src/diagnostics.rs b/crates/anonymize-core/src/diagnostics.rs new file mode 100644 index 00000000..36c91b79 --- /dev/null +++ b/crates/anonymize-core/src/diagnostics.rs @@ -0,0 +1,250 @@ +use crate::byte_offsets::ByteOffsets; +use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; +use crate::types::{RedactionResult, SearchEngine, SearchMatch}; + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum DiagnosticStage { + PrepareCacheHit, + PrepareCacheMiss, + PrepareBindingParse, + PreparePackageDecode, + PrepareBindingConvert, + PrepareArtifactsDecode, + PrepareTotal, + PrepareRegex, + PrepareCustomRegex, + PrepareAnchored, + PrepareLegalFormSearch, + PrepareTriggerSearch, + PrepareLiteral, + Normalize, + FindMatches, + FindRegex, + FindCustomRegex, + FindLiteral, + SearchRegex, + SearchCustomRegex, + SearchLegalForm, + SearchTrigger, + SearchLiteral, + EntityRegex, + EntityCustomRegex, + EntityAnchored, + EntityDenyList, + EntityGazetteer, + EntityCountry, + EntityTrigger, + EntitySignature, + EntityLegalForm, + EntityAddressSeed, + EntityNameCorpus, 
+ EntityZoneAdjustment, + EntityAddressContext, + EntityCoreference, + Merge, + Boundary, + Sanitize, + Redaction, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum DiagnosticEventKind { + StageSummary, + SearchMatch, + Entity, + Rejection, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct DiagnosticEvent { + pub stage: DiagnosticStage, + pub kind: DiagnosticEventKind, + pub count: Option, + pub engine: Option, + pub pattern: Option, + pub source: Option, + pub source_detail: Option, + pub label: Option, + pub start: Option, + pub end: Option, + pub text: Option, + pub score: Option, + pub span_valid: Option, + pub elapsed_us: Option, + pub input_bytes: Option, + pub reason: Option, +} + +#[derive(Clone, Debug, Default, PartialEq)] +pub struct StaticRedactionDiagnostics { + pub events: Vec, +} + +impl StaticRedactionDiagnostics { + pub(crate) fn record_search_matches( + &mut self, + stage: DiagnosticStage, + matches: &[SearchMatch], + full_text: &str, + elapsed_us: Option, + ) { + self.record_stage( + stage, + Some(matches.len()), + elapsed_us, + Some(full_text.len()), + ); + + let offsets = ByteOffsets::new(full_text); + for found in matches { + let span_valid = span_slices(&offsets, found.start(), found.end()); + self.events.push(DiagnosticEvent { + stage, + kind: DiagnosticEventKind::SearchMatch, + count: None, + engine: Some(found.engine()), + pattern: Some(found.pattern()), + source: None, + source_detail: None, + label: None, + start: Some(found.start()), + end: Some(found.end()), + text: None, + score: None, + span_valid: Some(span_valid), + elapsed_us: None, + input_bytes: None, + reason: None, + }); + } + } + + pub(crate) fn record_entities( + &mut self, + stage: DiagnosticStage, + entities: &[PipelineEntity], + full_text: &str, + elapsed_us: Option, + ) { + self.record_stage( + stage, + Some(entities.len()), + elapsed_us, + Some(full_text.len()), + ); + + let offsets = ByteOffsets::new(full_text); + for entity in entities { + 
self.events.push(DiagnosticEvent { + stage, + kind: DiagnosticEventKind::Entity, + count: None, + engine: None, + pattern: None, + source: Some(entity.source), + source_detail: entity.source_detail, + label: Some(entity.label.clone()), + start: Some(entity.start), + end: Some(entity.end), + text: None, + score: Some(entity.score), + span_valid: Some(span_slices(&offsets, entity.start, entity.end)), + elapsed_us: None, + input_bytes: None, + reason: None, + }); + } + } + + pub(crate) fn record_redaction( + &mut self, + result: &RedactionResult, + elapsed_us: Option, + input_bytes: usize, + ) { + self.events.push(DiagnosticEvent { + stage: DiagnosticStage::Redaction, + kind: DiagnosticEventKind::StageSummary, + count: Some(result.entity_count), + engine: None, + pattern: None, + source: None, + source_detail: None, + label: None, + start: None, + end: None, + text: None, + score: None, + span_valid: None, + elapsed_us, + input_bytes: Some(input_bytes), + reason: None, + }); + } + + pub(crate) fn record_rejection( + &mut self, + stage: DiagnosticStage, + pattern: Option, + label: Option<&str>, + start: Option, + end: Option, + reason: &'static str, + ) { + self.events.push(DiagnosticEvent { + stage, + kind: DiagnosticEventKind::Rejection, + count: None, + engine: None, + pattern, + source: None, + source_detail: None, + label: label.map(str::to_owned), + start, + end, + text: None, + score: None, + span_valid: None, + elapsed_us: None, + input_bytes: None, + reason: Some(String::from(reason)), + }); + } + + pub(crate) fn record_stage( + &mut self, + stage: DiagnosticStage, + count: Option, + elapsed_us: Option, + input_bytes: Option, + ) { + self.events.push(DiagnosticEvent { + stage, + kind: DiagnosticEventKind::StageSummary, + count, + engine: None, + pattern: None, + source: None, + source_detail: None, + label: None, + start: None, + end: None, + text: None, + score: None, + span_valid: None, + elapsed_us, + input_bytes, + reason: None, + }); + } + + pub fn 
extend(&mut self, other: Self) { + self.events.extend(other.events); + } +} + +fn span_slices(offsets: &ByteOffsets<'_>, start: u32, end: u32) -> bool { + start <= end + && offsets.validate_offset(start).is_ok() + && offsets.validate_offset(end).is_ok() +} diff --git a/crates/anonymize-core/src/false_positives.rs b/crates/anonymize-core/src/false_positives.rs new file mode 100644 index 00000000..02554372 --- /dev/null +++ b/crates/anonymize-core/src/false_positives.rs @@ -0,0 +1,1072 @@ +use std::sync::LazyLock; + +use regex::Regex; + +use crate::byte_offsets::ByteOffsets; +use crate::processors::DenyListFilterData; +use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; +use crate::types::{Error, Result}; + +const ADDRESS_LABEL: &str = "address"; +const ORGANIZATION_LABEL: &str = "organization"; +const PERSON_LABEL: &str = "person"; +const REGISTRATION_NUMBER_LABEL: &str = "registration number"; +const MAX_ORGANIZATION_LENGTH: usize = 80; +const MAX_PERSON_LENGTH: usize = 60; +const MAX_OPEN_ENDED_ORGANIZATION_WORDS: usize = 8; +const ALL_CAPS_LINE_LETTER_THRESHOLD: usize = 5; +const ALL_CAPS_LINE_RATIO: f64 = 0.95; +const ALL_CAPS_LINE_PROSE_EXTRA_LETTERS: usize = 20; +const ALL_CAPS_LINE_HEADING_WORD_LIMIT: usize = 5; + +static POSTAL_CODE_RE: LazyLock> = + LazyLock::new(|| Regex::new(r"\d{3}\s?\d{2}").ok()); +static SECTION_NUMBER_RE: LazyLock> = + LazyLock::new(|| Regex::new(r"^(?:§\s*)?\d{1,3}(?:\.\d{1,3}){0,4}\.?$").ok()); + +pub(crate) fn filter_entity_false_positives( + entities: Vec, + full_text: &str, + filters: Option<&DenyListFilterData>, +) -> Result> { + let offsets = ByteOffsets::new(full_text); + let mut filtered = Vec::with_capacity(entities.len()); + for entity in entities { + if is_caller_owned(&entity) { + filtered.push(entity); + continue; + } + + let Some(normalized) = normalize_entity(&entity, &offsets, filters)? else { + continue; + }; + if should_reject_entity(&normalized, full_text, &offsets, filters)? 
{ + continue; + } + filtered.push(normalized); + } + + Ok(filtered) +} + +fn normalize_entity( + entity: &PipelineEntity, + offsets: &ByteOffsets<'_>, + filters: Option<&DenyListFilterData>, +) -> Result> { + let raw_text = offsets.slice(entity.start, entity.end)?; + let mut start_byte = 0usize; + let mut end_byte = raw_text.len(); + + trim_leading_artifacts(&raw_text, &mut start_byte, end_byte); + trim_leading_whitespace(&raw_text, &mut start_byte, end_byte); + + if entity.label == ADDRESS_LABEL + && let Some(filters) = filters + { + if let Some(trimmed) = + address_role_prefix_len(slice(&raw_text, start_byte, end_byte)?, filters) + { + start_byte = start_byte.saturating_add(trimmed); + trim_leading_whitespace(&raw_text, &mut start_byte, end_byte); + } + + let address_text = slice(&raw_text, start_byte, end_byte)?; + if let Some(trimmed_end) = + trim_trailing_address_prose(address_text, filters) + { + end_byte = start_byte.saturating_add(trimmed_end); + } + } + + trim_trailing_separators(&raw_text, start_byte, &mut end_byte); + if start_byte >= end_byte { + return Ok(None); + } + + let cleaned_raw = slice(&raw_text, start_byte, end_byte)?; + if !cleaned_raw.chars().any(char::is_alphanumeric) { + return Ok(None); + } + + let mut normalized = entity.clone(); + normalized.start = entity + .start + .saturating_add(byte_len(raw_text.get(..start_byte).unwrap_or_default())); + normalized.end = normalized.start.saturating_add(byte_len(cleaned_raw)); + normalized.text = collapse_display_whitespace(cleaned_raw); + Ok(Some(normalized)) +} + +fn should_reject_entity( + entity: &PipelineEntity, + full_text: &str, + offsets: &ByteOffsets<'_>, + filters: Option<&DenyListFilterData>, +) -> Result { + let text = entity.text.trim(); + if is_template_placeholder(text) { + return Ok(true); + } + if exceeds_label_length(entity) { + return Ok(true); + } + if exceeds_open_ended_word_count(entity) { + return Ok(true); + } + if is_section_number(text) && entity.source != 
DetectionSource::Trigger { + return Ok(true); + } + if is_standalone_year(text) && entity.source != DetectionSource::Trigger { + return Ok(true); + } + if entity.source != DetectionSource::Trigger + && text.chars().next().is_some_and(|ch| ch.is_ascii_digit()) + && let Some(filters) = filters + && has_number_abbrev_prefix(full_text, offsets, entity, filters)? + { + return Ok(true); + } + if entity.label == REGISTRATION_NUMBER_LABEL && is_short_letter_run(text) { + return Ok(true); + } + if entity.label == PERSON_LABEL && text.chars().any(|ch| ch.is_ascii_digit()) + { + return Ok(true); + } + if let Some(filters) = filters { + if entity.label == PERSON_LABEL && is_single_person_stopword(text, filters) + { + return Ok(true); + } + if entity.label == PERSON_LABEL + && ends_in_person_trailing_noun(entity, filters) + { + return Ok(true); + } + if role_exact_match(entity, filters) { + return Ok(true); + } + } + if entity.label == ORGANIZATION_LABEL + && is_all_caps_candidate(text) + && is_all_caps_boilerplate_line(full_text, offsets, entity)? 
+ { + return Ok(true); + } + if entity.label == ORGANIZATION_LABEL + && filters + .is_some_and(|filters| is_document_structure_heading(text, filters)) + { + return Ok(true); + } + if entity.label == ADDRESS_LABEL && should_reject_address(entity, filters) { + return Ok(true); + } + + Ok(false) +} + +fn should_reject_address( + entity: &PipelineEntity, + filters: Option<&DenyListFilterData>, +) -> bool { + let text = entity.text.trim(); + if filters.is_some_and(|filters| is_signing_place_address(text, filters)) { + return true; + } + + let has_digits = text.chars().any(|ch| ch.is_ascii_digit()); + let has_component = + filters.is_some_and(|filters| has_address_component(text, filters)); + if filters.is_some_and(|filters| is_jurisdiction_address(text, filters)) { + return false; + } + if entity.source == DetectionSource::Trigger && !has_digits { + if filters.is_some_and(|filters| is_only_ambiguous_component(text, filters)) + { + return true; + } + if !has_component { + return true; + } + } + + text.chars().count() > 40 + && !has_digits + && !regex_is_match(&POSTAL_CODE_RE, text) + && !has_component +} + +fn exceeds_label_length(entity: &PipelineEntity) -> bool { + if entity.source == DetectionSource::LegalForm { + return false; + } + let max = match entity.label.as_str() { + ORGANIZATION_LABEL => MAX_ORGANIZATION_LENGTH, + PERSON_LABEL => MAX_PERSON_LENGTH, + _ => return false, + }; + entity.text.chars().count() > max +} + +fn exceeds_open_ended_word_count(entity: &PipelineEntity) -> bool { + entity.label == ORGANIZATION_LABEL + && matches!( + entity.source, + DetectionSource::Trigger | DetectionSource::Coreference + ) + && word_count(&entity.text) > MAX_OPEN_ENDED_ORGANIZATION_WORDS +} + +fn is_template_placeholder(text: &str) -> bool { + let trimmed = text.trim(); + if trimmed.len() >= 3 && trimmed.chars().all(|ch| ch == '.' 
|| ch == '_') { + return true; + } + let Some(inner) = bracketed_inner(trimmed, '[', ']') + .or_else(|| bracketed_inner(trimmed, '{', '}')) + else { + return false; + }; + !inner.is_empty() + && inner + .chars() + .all(|ch| ch == '_' || ch.is_alphanumeric() || ch.is_whitespace()) +} + +fn bracketed_inner(text: &str, open: char, close: char) -> Option<&str> { + let mut chars = text.chars(); + if chars.next()? != open || chars.next_back()? != close { + return None; + } + let start = open.len_utf8(); + let end = text.len().saturating_sub(close.len_utf8()); + text.get(start..end) +} + +fn is_section_number(text: &str) -> bool { + regex_is_match(&SECTION_NUMBER_RE, text.trim()) +} + +fn is_standalone_year(text: &str) -> bool { + let trimmed = text.trim(); + trimmed.len() == 4 + && trimmed.chars().all(|ch| ch.is_ascii_digit()) + && (trimmed.starts_with("19") || trimmed.starts_with("20")) +} + +fn has_number_abbrev_prefix( + full_text: &str, + offsets: &ByteOffsets<'_>, + entity: &PipelineEntity, + filters: &DenyListFilterData, +) -> Result { + let start = offsets.validate_offset(entity.start)?; + let before = full_text.get(..start).ok_or(Error::InvalidSpan { + start: entity.start, + end: entity.end, + })?; + Ok(ends_with_number_abbrev(before, filters)) +} + +fn ends_with_number_abbrev(text: &str, filters: &DenyListFilterData) -> bool { + let lower = text.trim_end().to_lowercase(); + filters.number_abbrev_prefixes.iter().any(|prefix| { + let Some(before_prefix) = lower.strip_suffix(prefix) else { + return false; + }; + before_prefix + .chars() + .next_back() + .is_none_or(|ch| ch.is_whitespace() || ch == '(') + }) +} + +fn is_document_structure_heading( + text: &str, + filters: &DenyListFilterData, +) -> bool { + let Some((word_end, word)) = first_word(text.trim_start()) else { + return false; + }; + if !filters + .document_heading_words + .contains(&word.to_lowercase()) + { + return false; + } + let Some(rest) = text.trim_start().get(word_end..) 
else { + return false; + }; + starts_with_ordinal_marker_digit(rest, filters) +} + +fn starts_with_ordinal_marker_digit( + text: &str, + filters: &DenyListFilterData, +) -> bool { + let trimmed = text.trim_start(); + let lower = trimmed.to_lowercase(); + filters + .document_heading_ordinal_markers + .iter() + .any(|marker| { + if marker.is_empty() { + return false; + } + if !lower.starts_with(marker) { + return false; + } + let Some(rest) = trimmed.get(marker.len()..) else { + return false; + }; + rest + .trim_start() + .chars() + .next() + .is_some_and(|ch| ch.is_ascii_digit()) + }) +} + +fn is_short_letter_run(text: &str) -> bool { + let letters = text.trim(); + (1..=2).contains(&letters.chars().count()) + && letters.chars().all(char::is_alphabetic) +} + +fn is_single_person_stopword(text: &str, filters: &DenyListFilterData) -> bool { + let token = trim_token_punctuation(text); + !token.is_empty() + && !token.chars().any(char::is_whitespace) + && filters.person_stopwords.contains(&token.to_lowercase()) +} + +fn ends_in_person_trailing_noun( + entity: &PipelineEntity, + filters: &DenyListFilterData, +) -> bool { + if matches!( + entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) { + return false; + } + + let mut words = entity + .text + .split(|ch: char| !ch.is_alphabetic()) + .filter(|word| !word.is_empty()); + if words.next().is_none() { + return false; + } + let Some(last) = words.next_back() else { + return false; + }; + filters.person_trailing_nouns.contains(&last.to_lowercase()) +} + +fn role_exact_match( + entity: &PipelineEntity, + filters: &DenyListFilterData, +) -> bool { + matches!(entity.label.as_str(), PERSON_LABEL | ORGANIZATION_LABEL) + && filters + .generic_roles + .contains(&entity.text.trim().to_lowercase()) +} + +fn is_all_caps_candidate(text: &str) -> bool { + let mut has_upper = false; + for ch in text.chars().filter(|ch| ch.is_alphabetic()) { + if ch.is_lowercase() { + return false; + } + has_upper |= 
ch.is_uppercase(); + } + has_upper +} + +fn is_all_caps_boilerplate_line( + full_text: &str, + offsets: &ByteOffsets<'_>, + entity: &PipelineEntity, +) -> Result { + let start = offsets.validate_offset(entity.start)?; + let end = offsets.validate_offset(entity.end)?; + let before = full_text.get(..start).ok_or(Error::InvalidSpan { + start: entity.start, + end: entity.end, + })?; + let line_start = before + .rfind('\n') + .map_or(0usize, |index| index.saturating_add('\n'.len_utf8())); + let after = full_text.get(end..).ok_or(Error::InvalidSpan { + start: entity.start, + end: entity.end, + })?; + let line_end = after + .find('\n') + .map_or(full_text.len(), |index| end.saturating_add(index)); + let line = full_text + .get(line_start..line_end) + .ok_or(Error::InvalidSpan { + start: entity.start, + end: entity.end, + })?; + let entity_rel_start = start.saturating_sub(line_start); + let entity_rel_end = end.saturating_sub(line_start); + + let mut letter_count = 0usize; + let mut upper_count = 0usize; + let mut outside_entity_letters = 0usize; + for (index, ch) in line.char_indices() { + if !ch.is_alphabetic() { + continue; + } + letter_count = letter_count.saturating_add(1); + if ch.is_uppercase() { + upper_count = upper_count.saturating_add(1); + } + if index < entity_rel_start || index >= entity_rel_end { + outside_entity_letters = outside_entity_letters.saturating_add(1); + } + } + + if letter_count <= ALL_CAPS_LINE_LETTER_THRESHOLD { + return Ok(false); + } + if !uppercase_ratio_at_least(upper_count, letter_count) { + return Ok(false); + } + if starts_with_section_heading_prefix(line) { + return Ok(true); + } + if outside_entity_letters >= ALL_CAPS_LINE_PROSE_EXTRA_LETTERS { + return Ok(true); + } + Ok( + word_count(&entity.text) > ALL_CAPS_LINE_HEADING_WORD_LIMIT + && !entity.text.contains(','), + ) +} + +fn starts_with_section_heading_prefix(line: &str) -> bool { + let mut chars = line.trim_start().chars().peekable(); + if chars.peek().is_some_and(|ch| *ch == 
'§') { + chars.next(); + while chars.peek().is_some_and(|ch| ch.is_whitespace()) { + chars.next(); + } + } + + let mut saw_digit = false; + let mut group_digits = 0usize; + while let Some(ch) = chars.peek().copied() { + if ch.is_ascii_digit() { + saw_digit = true; + group_digits = group_digits.saturating_add(1); + if group_digits > 3 { + return false; + } + chars.next(); + continue; + } + if ch == '.' && saw_digit { + group_digits = 0; + chars.next(); + continue; + } + break; + } + if !saw_digit { + return false; + } + while chars.peek().is_some_and(|ch| ch.is_whitespace()) { + chars.next(); + } + chars.next().is_some_and(char::is_uppercase) +} + +fn trim_leading_artifacts(text: &str, start: &mut usize, end: usize) { + while let Some(rest) = text.get(*start..end) { + if !rest.starts_with('.') { + break; + } + let after_dot_start = '.'.len_utf8(); + let Some(after_dot) = rest.get(after_dot_start..) else { + break; + }; + let whitespace = leading_whitespace_len(after_dot); + if whitespace == 0 { + break; + } + *start = + (*start).saturating_add(after_dot_start.saturating_add(whitespace)); + } +} + +fn trim_leading_whitespace(text: &str, start: &mut usize, end: usize) { + let Some(rest) = text.get(*start..end) else { + return; + }; + *start = (*start).saturating_add(leading_whitespace_len(rest)); +} + +fn trim_trailing_separators(text: &str, start: usize, end: &mut usize) { + while let Some(slice) = text.get(start..*end) { + let Some((index, ch)) = slice.char_indices().next_back() else { + break; + }; + if ch.is_whitespace() || ch == ',' { + *end = start.saturating_add(index); + continue; + } + break; + } +} + +fn address_role_prefix_len( + text: &str, + filters: &DenyListFilterData, +) -> Option { + let (word_end, word) = first_word(text)?; + if !filters.generic_roles.contains(&word.to_lowercase()) { + return None; + } + let rest = text.get(word_end..)?; + let whitespace = leading_whitespace_len(rest); + if whitespace == 0 { + return None; + } + let candidate = 
rest.get(whitespace..)?; + if looks_like_address_start(candidate, filters) { + return Some(word_end.saturating_add(whitespace)); + } + None +} + +fn looks_like_address_start(text: &str, filters: &DenyListFilterData) -> bool { + let trimmed = text.trim_start(); + trimmed.chars().next().is_some_and(|ch| { + ch.is_ascii_digit() + || ch.is_uppercase() + || has_address_component(trimmed, filters) + }) +} + +fn trim_trailing_address_prose( + text: &str, + filters: &DenyListFilterData, +) -> Option { + for (index, ch) in text.char_indices() { + if ch != '.' { + continue; + } + let before = text.get(..index)?; + if !before.chars().any(|candidate| candidate.is_ascii_digit()) { + continue; + } + if text_ends_with_address_component(before.trim_end(), filters) { + continue; + } + let after = text + .get(index.saturating_add('.'.len_utf8())..)? + .trim_start(); + if after.len() < 5 || has_address_component(after, filters) { + continue; + } + if after.chars().next().is_some_and(char::is_uppercase) { + return Some(before.trim_end().len()); + } + } + None +} + +fn has_address_component(text: &str, filters: &DenyListFilterData) -> bool { + let lower = text.to_lowercase(); + filters + .street_types + .iter() + .any(|component| contains_component(&lower, component)) + || filters + .address_component_terms + .iter() + .any(|component| contains_component(&lower, component)) +} + +fn is_only_ambiguous_component( + text: &str, + filters: &DenyListFilterData, +) -> bool { + filters + .ambiguous_street_type_terms + .iter() + .any(|term| is_only_ambiguous_component_term(text, filters, term)) +} + +fn is_only_ambiguous_component_term( + text: &str, + filters: &DenyListFilterData, + term: &str, +) -> bool { + if term.is_empty() { + return false; + } + let Some((start, end)) = find_ambiguous_component_occurrence(text, term) + else { + return false; + }; + if text + .get(end..) 
+ .is_some_and(starts_with_capitalized_token_after_space) + { + return false; + } + let mut stripped = String::with_capacity(text.len()); + stripped.push_str(text.get(..start).unwrap_or_default()); + stripped.push(' '); + stripped.push_str(text.get(end..).unwrap_or_default()); + !has_address_component(&stripped, filters) +} + +fn find_ambiguous_component_occurrence( + text: &str, + term: &str, +) -> Option<(usize, usize)> { + text.char_indices().find_map(|(start, _)| { + let match_len = case_insensitive_prefix_len(text.get(start..)?, term)?; + let end = start.saturating_add(match_len); + let left_ok = text + .get(..start) + .and_then(|prefix| prefix.chars().next_back()) + .is_none_or(is_left_component_boundary); + let right_ok = text + .get(end..) + .and_then(|suffix| suffix.chars().next()) + .is_none_or(is_right_component_boundary); + (left_ok && right_ok).then_some((start, end)) + }) +} + +fn case_insensitive_prefix_len(text: &str, prefix: &str) -> Option { + let mut consumed = 0usize; + for expected in prefix.chars() { + let actual = text.get(consumed..)?.chars().next()?; + if !actual.eq_ignore_ascii_case(&expected) { + return None; + } + consumed = consumed.saturating_add(actual.len_utf8()); + } + Some(consumed) +} + +fn starts_with_capitalized_token_after_space(text: &str) -> bool { + let leading = leading_whitespace_len(text); + if leading == 0 { + return false; + } + text + .get(leading..) 
+ .and_then(|tail| tail.chars().next()) + .is_some_and(char::is_uppercase) +} + +fn is_jurisdiction_address(text: &str, filters: &DenyListFilterData) -> bool { + let lower = text.to_lowercase(); + filters.address_jurisdiction_prefixes.iter().any(|prefix| { + let Some(rest) = lower.strip_prefix(prefix) else { + return false; + }; + rest.chars().next().is_some_and(char::is_whitespace) + && rest.chars().any(char::is_alphabetic) + }) +} + +fn text_ends_with_address_component( + text: &str, + filters: &DenyListFilterData, +) -> bool { + let lower = text.to_lowercase(); + filters.street_types.iter().any(|component| { + if component.is_empty() || !lower.ends_with(component) { + return false; + } + let prefix_len = lower.len().saturating_sub(component.len()); + lower + .get(..prefix_len) + .and_then(|prefix| prefix.chars().next_back()) + .is_none_or(is_left_component_boundary) + }) +} + +fn contains_component(text: &str, component: &str) -> bool { + if component.is_empty() { + return false; + } + text.match_indices(component).any(|(start, _)| { + let end = start.saturating_add(component.len()); + let left_ok = text + .get(..start) + .and_then(|prefix| prefix.chars().next_back()) + .is_none_or(is_left_component_boundary); + let right_ok = text + .get(end..) + .and_then(|suffix| suffix.chars().next()) + .is_none_or(is_right_component_boundary); + left_ok && right_ok + }) +} + +const fn is_left_component_boundary(ch: char) -> bool { + ch.is_whitespace() || ch == ',' || ch == '(' || ch == '[' +} + +const fn is_right_component_boundary(ch: char) -> bool { + ch.is_whitespace() || matches!(ch, ',' | '.' 
| '/' | ')' | ']') +} + +fn is_signing_place_address(text: &str, filters: &DenyListFilterData) -> bool { + let lower = text.to_lowercase(); + filters.signing_place_guards.iter().any(|guard| { + guard.prefix_phrases.iter().any(|prefix| { + !prefix.is_empty() + && lower.starts_with(prefix) + && guard + .suffix_phrases + .iter() + .any(|suffix| !suffix.is_empty() && lower.ends_with(suffix)) + }) + }) +} + +fn first_word(text: &str) -> Option<(usize, &str)> { + let mut end = 0usize; + for (index, ch) in text.char_indices() { + if !ch.is_alphabetic() { + break; + } + end = index.saturating_add(ch.len_utf8()); + } + if end == 0 { + return None; + } + text.get(..end).map(|word| (end, word)) +} + +fn word_count(text: &str) -> usize { + let mut count = 0usize; + let mut in_word = false; + for ch in text.chars() { + let word_char = + ch.is_alphanumeric() || matches!(ch, '\'' | '’' | '-' | '.'); + if word_char && !in_word { + count = count.saturating_add(1); + } + in_word = word_char; + } + count +} + +fn trim_token_punctuation(text: &str) -> &str { + text + .trim() + .trim_matches(|ch: char| matches!(ch, '.' | ',' | ';' | ':' | '!' 
| '?')) +} + +fn leading_whitespace_len(text: &str) -> usize { + let mut len = 0usize; + for ch in text.chars() { + if !ch.is_whitespace() { + break; + } + len = len.saturating_add(ch.len_utf8()); + } + len +} + +fn slice(text: &str, start: usize, end: usize) -> Result<&str> { + text.get(start..end).ok_or_else(|| Error::InvalidSpan { + start: u32::try_from(start).unwrap_or(u32::MAX), + end: u32::try_from(end).unwrap_or(u32::MAX), + }) +} + +fn collapse_display_whitespace(text: &str) -> String { + let mut out = String::new(); + let mut whitespace = String::new(); + + for ch in text.chars() { + if ch.is_whitespace() { + whitespace.push(ch); + continue; + } + + flush_whitespace(&mut out, &mut whitespace); + out.push(ch); + } + + flush_whitespace(&mut out, &mut whitespace); + out +} + +fn flush_whitespace(output: &mut String, whitespace: &mut String) { + if whitespace.is_empty() { + return; + } + + if whitespace.chars().any(|ch| matches!(ch, '\n' | '\r')) + || whitespace.chars().count() >= 2 + { + output.push(' '); + } else if let Some(ch) = whitespace.chars().next() { + output.push(ch); + } + + whitespace.clear(); +} + +fn byte_len(text: &str) -> u32 { + u32::try_from(text.len()).unwrap_or(u32::MAX) +} + +fn regex_is_match(regex: &LazyLock>, text: &str) -> bool { + regex + .as_ref() + .is_some_and(|compiled| compiled.is_match(text)) +} + +fn uppercase_ratio_at_least(upper_count: usize, letter_count: usize) -> bool { + let Some(upper) = u32::try_from(upper_count).ok().map(f64::from) else { + return true; + }; + let Some(total) = u32::try_from(letter_count).ok().map(f64::from) else { + return true; + }; + upper / total >= ALL_CAPS_LINE_RATIO +} + +const fn is_caller_owned(entity: &PipelineEntity) -> bool { + matches!( + entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) +} + +#[cfg(test)] +mod tests { + #![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] + + use std::collections::BTreeSet; + + use 
super::*; + + #[test] + fn rejects_template_placeholders() { + let entities = filter_entity_false_positives( + vec![entity( + "[NAME]", + "[NAME]", + PERSON_LABEL, + DetectionSource::Regex, + )], + "[NAME]", + Some(&DenyListFilterData::default()), + ) + .unwrap(); + + assert!(entities.is_empty()); + } + + #[test] + fn rejects_generic_false_positives_without_deny_list_filters() { + let text = "[NAME]\n17. NO ASSIGNMENT.\n"; + let heading_start = text.find("NO ASSIGNMENT").unwrap(); + let heading_end = heading_start.saturating_add("NO ASSIGNMENT".len()); + let entities = filter_entity_false_positives( + vec![ + entity("[NAME]", "[NAME]", PERSON_LABEL, DetectionSource::Regex), + PipelineEntity::detected( + u32::try_from(heading_start).unwrap(), + u32::try_from(heading_end).unwrap(), + ORGANIZATION_LABEL, + "NO ASSIGNMENT", + 0.8, + DetectionSource::Regex, + ), + ], + text, + None, + ) + .unwrap(); + + assert!(entities.is_empty()); + } + + #[test] + fn trims_address_role_prefix_from_shared_role_data() { + let text = "sídlo prodávajícího Na Květnici 1"; + let start = text.find("prodávajícího").unwrap(); + let filters = DenyListFilterData { + generic_roles: set(["prodávajícího"]), + ..DenyListFilterData::default() + }; + + let entities = filter_entity_false_positives( + vec![PipelineEntity::detected( + u32::try_from(start).unwrap(), + u32::try_from(text.len()).unwrap(), + ADDRESS_LABEL, + "prodávajícího Na Květnici 1", + 0.8, + DetectionSource::Trigger, + )], + text, + Some(&filters), + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Na Květnici 1"); + assert_eq!( + entities[0].start, + u32::try_from("sídlo prodávajícího ".len()).unwrap() + ); + } + + #[test] + fn preserves_single_non_breaking_space_in_entity_text() { + let text = "Městským soudem v\u{00a0}Praze"; + let entities = filter_entity_false_positives( + vec![entity( + text, + text, + ORGANIZATION_LABEL, + DetectionSource::Trigger, + )], + text, + 
Some(&DenyListFilterData::default()), + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, text); + } + + #[test] + fn rejects_trigger_address_without_digits_or_street_component() { + let entities = filter_entity_false_positives( + vec![entity( + "Nejsme plátci DPH", + "Nejsme plátci DPH", + ADDRESS_LABEL, + DetectionSource::Trigger, + )], + "Nejsme plátci DPH", + Some(&DenyListFilterData::default()), + ) + .unwrap(); + + assert!(entities.is_empty()); + } + + #[test] + fn keeps_trigger_address_with_street_component() { + let filters = DenyListFilterData { + street_types: set(["street"]), + ..DenyListFilterData::default() + }; + let entities = filter_entity_false_positives( + vec![entity( + "West Street", + "West Street", + ADDRESS_LABEL, + DetectionSource::Trigger, + )], + "West Street", + Some(&filters), + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + } + + #[test] + fn keeps_configured_jurisdiction_addresses_without_digits() { + let filters = DenyListFilterData { + address_jurisdiction_prefixes: set(["state of"]), + ..DenyListFilterData::default() + }; + let entities = filter_entity_false_positives( + vec![entity( + "State of Delaware", + "State of Delaware", + ADDRESS_LABEL, + DetectionSource::Trigger, + )], + "State of Delaware", + Some(&filters), + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + } + + #[test] + fn rejects_person_stopwords() { + let filters = DenyListFilterData { + person_stopwords: set(["tato"]), + ..DenyListFilterData::default() + }; + let entities = filter_entity_false_positives( + vec![entity("Tato", "Tato", PERSON_LABEL, DetectionSource::Regex)], + "Tato", + Some(&filters), + ) + .unwrap(); + + assert!(entities.is_empty()); + } + + #[test] + fn rejects_all_caps_section_heading_organizations() { + let text = "17. 
NO ASSIGNMENT.\n"; + let start = text.find("NO ASSIGNMENT").unwrap(); + let end = start.saturating_add("NO ASSIGNMENT".len()); + let entities = filter_entity_false_positives( + vec![PipelineEntity::detected( + u32::try_from(start).unwrap(), + u32::try_from(end).unwrap(), + ORGANIZATION_LABEL, + "NO ASSIGNMENT", + 0.8, + DetectionSource::Regex, + )], + text, + Some(&DenyListFilterData::default()), + ) + .unwrap(); + + assert!(entities.is_empty()); + } + + fn entity( + full_text: &str, + text: &str, + label: &str, + source: DetectionSource, + ) -> PipelineEntity { + PipelineEntity::detected( + 0, + u32::try_from(full_text.len()).expect("fixture length fits u32"), + label, + text, + 0.8, + source, + ) + } + + fn set(values: [&str; N]) -> BTreeSet { + values.into_iter().map(String::from).collect() + } +} diff --git a/crates/anonymize-core/src/hotwords.rs b/crates/anonymize-core/src/hotwords.rs new file mode 100644 index 00000000..eceb5e29 --- /dev/null +++ b/crates/anonymize-core/src/hotwords.rs @@ -0,0 +1,249 @@ +use crate::byte_offsets::ByteOffsets; +use crate::resolution::{PipelineEntity, SourceDetail}; +use crate::search::{ + LiteralSearchOptions, SearchIndex, SearchOptions, SearchPattern, +}; +use crate::types::{Error, Result, SearchMatch}; + +#[derive(Clone, Debug, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct HotwordRuleData { + pub rules: Vec, + #[serde(default)] + pub pattern_rule_indices: Vec, +} + +#[derive(Clone, Debug, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct HotwordRule { + #[serde(default)] + pub hotwords: Vec, + pub target_labels: Vec, + pub score_adjustment: f64, + pub reclassify_to: Option, + pub proximity_before: u32, + pub proximity_after: u32, +} + +pub(crate) struct PreparedHotwordData { + rules: Vec, + pattern_rule_indices: Vec, + search: SearchIndex, +} + +impl PreparedHotwordData { + pub(crate) fn new(data: HotwordRuleData) -> Result { + let mut patterns = Vec::new(); + let mut pattern_rule_indices = 
Vec::new(); + + for (rule_index, rule) in data.rules.iter().enumerate() { + let rule_index = + u32::try_from(rule_index).map_err(|_| Error::InvalidStaticData { + field: "hotword_data.rules", + reason: String::from("rule index exceeds u32 range"), + })?; + for hotword in &rule.hotwords { + if hotword.is_empty() { + return Err(Error::InvalidStaticData { + field: "hotword_data.rules.hotwords", + reason: String::from("hotword must not be empty"), + }); + } + patterns.push(SearchPattern::LiteralWithOptions { + pattern: hotword.clone(), + case_insensitive: Some(true), + whole_words: Some(true), + }); + pattern_rule_indices.push(rule_index); + } + } + + let search = SearchIndex::new( + patterns, + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: true, + }, + ..SearchOptions::default() + }, + )?; + + Ok(Self { + rules: data.rules, + pattern_rule_indices, + search, + }) + } +} + +pub(crate) fn apply_hotword_rules( + entities: Vec, + full_text: &str, + data: &PreparedHotwordData, + allowed_labels: &[String], +) -> Result> { + let hits_by_rule = collect_hits_by_rule(full_text, data)?; + let offsets = ByteOffsets::new(full_text); + let mut result = Vec::with_capacity(entities.len()); + + for entity in entities { + if caller_owned(&entity) { + result.push(entity); + continue; + } + + let adjusted = apply_entity_rules(entity, &offsets, data, &hits_by_rule)?; + if label_allowed(&adjusted.label, allowed_labels) { + result.push(adjusted); + } + } + + Ok(result) +} + +fn collect_hits_by_rule( + full_text: &str, + data: &PreparedHotwordData, +) -> Result>> { + let mut hits_by_rule = vec![Vec::new(); data.rules.len()]; + + for found in data.search.find_iter(full_text)? 
{ + let Ok(local_index) = usize::try_from(found.pattern()) else { + return Err(Error::InvalidStaticData { + field: "hotword_data.pattern_rule_indices", + reason: String::from("pattern index exceeds usize range"), + }); + }; + let Some(rule_index) = data.pattern_rule_indices.get(local_index) else { + continue; + }; + let Ok(rule_index) = usize::try_from(*rule_index) else { + return Err(Error::InvalidStaticData { + field: "hotword_data.pattern_rule_indices", + reason: String::from("rule index exceeds usize range"), + }); + }; + let Some(bucket) = hits_by_rule.get_mut(rule_index) else { + return Err(Error::InvalidStaticData { + field: "hotword_data.pattern_rule_indices", + reason: String::from("rule index out of range"), + }); + }; + bucket.push(found); + } + + Ok(hits_by_rule) +} + +fn apply_entity_rules( + mut entity: PipelineEntity, + offsets: &ByteOffsets<'_>, + data: &PreparedHotwordData, + hits_by_rule: &[Vec], +) -> Result { + let mut best = None::; + + for (rule_index, rule) in data.rules.iter().enumerate() { + if !rule + .target_labels + .iter() + .any(|label| label == &entity.label) + { + continue; + } + let Some(rule_hits) = hits_by_rule.get(rule_index) else { + continue; + }; + for hit in rule_hits { + let Some((distance, max_distance)) = + hotword_distance(offsets, &entity, hit, rule)? 
+ else { + continue; + }; + let decay = if max_distance == 0 { + 1.0 + } else { + 1.0 - (f64::from(distance) / f64::from(max_distance)) + }; + let adjustment = rule.score_adjustment * decay; + if adjustment.abs() <= f64::EPSILON { + continue; + } + if best + .as_ref() + .is_some_and(|best| adjustment.abs() <= best.score.abs()) + { + continue; + } + + best = Some(HotwordAdjustment { + score: adjustment, + reclassify_to: if adjustment.is_sign_positive() { + rule.reclassify_to.clone() + } else { + None + }, + }); + } + } + + let Some(best) = best else { + return Ok(entity); + }; + + entity.score = (entity.score + best.score).clamp(0.0, 1.0); + if let Some(label) = best.reclassify_to { + entity.label = label; + } + Ok(entity) +} + +fn hotword_distance( + offsets: &ByteOffsets<'_>, + entity: &PipelineEntity, + hit: &SearchMatch, + rule: &HotwordRule, +) -> Result> { + let (distance, max_distance) = if hit.end() <= entity.start { + ( + text_distance(offsets, hit.end(), entity.start)?, + rule.proximity_before, + ) + } else if hit.start() >= entity.end { + ( + text_distance(offsets, entity.end, hit.start())?, + rule.proximity_after, + ) + } else { + (0, u32::max(rule.proximity_before, rule.proximity_after)) + }; + + if distance > max_distance { + return Ok(None); + } + Ok(Some((distance, max_distance))) +} + +fn text_distance( + offsets: &ByteOffsets<'_>, + start: u32, + end: u32, +) -> Result { + offsets.utf16_units_between(start, end) +} + +const fn caller_owned(entity: &PipelineEntity) -> bool { + matches!( + entity.source_detail, + Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex) + ) +} + +fn label_allowed(label: &str, allowed_labels: &[String]) -> bool { + allowed_labels.is_empty() + || allowed_labels.iter().any(|allowed| allowed == label) +} + +struct HotwordAdjustment { + score: f64, + reclassify_to: Option, +} diff --git a/crates/anonymize-core/src/legal_forms.rs b/crates/anonymize-core/src/legal_forms.rs new file mode 100644 index 00000000..2b3cf66e 
--- /dev/null +++ b/crates/anonymize-core/src/legal_forms.rs @@ -0,0 +1,1614 @@ +use std::collections::BTreeSet; + +use crate::byte_offsets::ByteOffsets; +use crate::processors::PatternSlice; +use crate::resolution::{DetectionSource, PipelineEntity}; +use crate::types::{Result, SearchMatch}; + +const LEGAL_FORM_SCORE: f64 = 0.95; +const HEAD_TOKEN_CAP: usize = 20; +const MAX_LOWER_BRIDGE: usize = 4; +const MAX_NAME_LOOKBACK: usize = 32; + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct LegalFormData { + pub suffixes: Vec, + pub normalized_boundary_suffixes: Vec, + pub normalized_in_name_words: Vec, + pub normalized_suffix_words: Vec, + pub role_heads: Vec, + pub sentence_verb_indicators: Vec, + pub clause_noun_heads: Vec, + pub connector_prose_heads: Vec, + pub structural_single_cap_prefixes: Vec, + pub leading_clause_phrases: Vec, + pub leading_clause_direct_prefixes: Vec, + pub connector_words: Vec, + pub and_connector_words: Vec, + pub in_name_prepositions: Vec, + pub company_suffix_words: Vec, + pub comma_gated_direct_prefixes: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct PreparedLegalFormData { + suffixes: Vec, + normalized_boundary_suffixes: BTreeSet, + normalized_in_name_words: BTreeSet, + normalized_suffix_words: BTreeSet, + role_heads: BTreeSet, + sentence_verb_indicators: BTreeSet, + clause_noun_heads: BTreeSet, + connector_prose_heads: BTreeSet, + structural_single_cap_prefixes: BTreeSet, + leading_clause_phrases: Vec, + leading_clause_direct_prefixes: Vec, + connector_words: BTreeSet, + and_connector_words: BTreeSet, + in_name_prepositions: BTreeSet, + company_suffix_words: BTreeSet, + comma_gated_direct_prefixes: BTreeSet, +} + +impl PreparedLegalFormData { + pub(crate) fn new(data: LegalFormData) -> Self { + Self { + suffixes: data.suffixes, + normalized_boundary_suffixes: lower_set( + data.normalized_boundary_suffixes, + ), + normalized_in_name_words: 
lower_set(data.normalized_in_name_words), + normalized_suffix_words: lower_set(data.normalized_suffix_words), + role_heads: lower_set(data.role_heads), + sentence_verb_indicators: lower_set(data.sentence_verb_indicators), + clause_noun_heads: lower_set(data.clause_noun_heads), + connector_prose_heads: lower_set(data.connector_prose_heads), + structural_single_cap_prefixes: lower_set( + data.structural_single_cap_prefixes, + ), + leading_clause_phrases: lower_vec(data.leading_clause_phrases), + leading_clause_direct_prefixes: lower_vec( + data.leading_clause_direct_prefixes, + ), + connector_words: lower_set(data.connector_words), + and_connector_words: lower_set(data.and_connector_words), + in_name_prepositions: lower_set(data.in_name_prepositions), + company_suffix_words: lower_set(data.company_suffix_words), + comma_gated_direct_prefixes: lower_set(data.comma_gated_direct_prefixes), + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct Candidate { + start: usize, + suffix_start: usize, + end: usize, + trimmed: bool, +} + +pub(crate) fn process_legal_form_matches( + matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + data: &PreparedLegalFormData, +) -> Result> { + if data.suffixes.is_empty() { + return Ok(Vec::new()); + } + + let offsets = ByteOffsets::new(full_text); + let mut candidates = Vec::new(); + for found in matches { + if slice.local_index(found.pattern()).is_none() { + continue; + } + + let suffix_start = offsets.validate_offset(found.start())?; + let suffix_end = offsets.validate_offset(found.end())?; + let effective_suffix_start = + effective_line_wrapped_suffix_start(full_text, suffix_start); + if !is_leading_separator(full_text, suffix_start) + || !is_trailing_boundary(full_text, suffix_end) + { + continue; + } + + let Some(walker_start) = + walk_backward(full_text, effective_suffix_start, data) + else { + continue; + }; + if walker_start >= effective_suffix_start { + continue; + } + if crosses_sentence_end(full_text, 
walker_start, effective_suffix_start) { + continue; + } + + let candidate_start = trim_to_first_cap_after_verb( + full_text, + walker_start, + effective_suffix_start, + data, + ); + if candidate_start >= effective_suffix_start { + continue; + } + + candidates.push(Candidate { + start: candidate_start, + suffix_start: effective_suffix_start, + end: suffix_end, + trimmed: candidate_start != walker_start, + }); + } + + let candidates = drop_overlapping(candidates); + let mut entities = Vec::new(); + for candidate in candidates { + process_candidate(&mut entities, full_text, &candidate, data); + } + + Ok(entities) +} + +fn effective_line_wrapped_suffix_start( + text: &str, + suffix_start: usize, +) -> usize { + let mut scan = suffix_start; + while let Some((prev_start, ch)) = previous_char(text, scan) { + if matches!(ch, ' ' | '\t') { + scan = prev_start; + continue; + } + break; + } + + let Some((newline_start, '\n')) = previous_char(text, scan) else { + return suffix_start; + }; + let mut before = newline_start; + while let Some((prev_start, ch)) = previous_char(text, before) { + if ch == ' ' { + before = prev_start; + continue; + } + return if ch == '.' { before } else { suffix_start }; + } + + suffix_start +} + +fn is_trailing_boundary(text: &str, end: usize) -> bool { + text + .get(end..) + .and_then(|suffix| suffix.chars().next()) + .is_none_or(|ch| !ch.is_alphanumeric()) +} + +fn is_leading_separator(text: &str, suffix_start: usize) -> bool { + let Some((prev_start, prev)) = previous_char(text, suffix_start) else { + return true; + }; + if prev.is_alphanumeric() { + return false; + } + if prev != '.' 
{ + return true; + } + previous_char(text, prev_start).is_none_or(|(_, ch)| !ch.is_alphabetic()) +} + +fn walk_backward( + text: &str, + suffix_start: usize, + data: &PreparedLegalFormData, +) -> Option { + let mut pos = suffix_start; + let mut steps = 0; + let mut leftmost_cap = None::; + let mut lower_bridge_run = 0_usize; + + while steps < HEAD_TOKEN_CAP { + let Some(token) = token_before(text, pos) else { + break; + }; + if !is_acceptable_token(token.text, data) { + break; + } + + if starts_lower(token.text) && leftmost_cap.is_some() { + let after_token = text.get(token.end..pos).unwrap_or_default(); + if starts_with_list_separator(after_token) + && is_legal_form_suffix_word(token.text, data) + { + break; + } + } + + if data.connector_words.contains(&token.text.to_lowercase()) { + let previous = token_before(text, token.start); + if previous + .as_ref() + .is_some_and(|found| is_legal_form_suffix_word(found.text, data)) + { + break; + } + if data + .and_connector_words + .contains(&token.text.to_lowercase()) + { + let upper_before = count_upper_before(text, token.start); + if upper_before <= 2 || has_middle_initial_before(text, token.start) { + break; + } + } + } + + if starts_upper(token.text) { + leftmost_cap = Some(token.start); + lower_bridge_run = 0; + } else if starts_lower(token.text) { + if leftmost_cap.is_some() { + lower_bridge_run = lower_bridge_run.saturating_add(1); + if lower_bridge_run > MAX_LOWER_BRIDGE { + break; + } + } + } else { + lower_bridge_run = 0; + } + + pos = token.start; + steps = steps.saturating_add(1); + } + + leftmost_cap +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct Token<'a> { + start: usize, + end: usize, + text: &'a str, +} + +fn token_before(text: &str, pos: usize) -> Option> { + let mut end = pos; + while let Some((prev_start, ch)) = previous_char(text, end) { + if ch == '\n' { + return None; + } + if is_inter_token_space(ch) || matches!(ch, ',' | ';') { + end = prev_start; + continue; + } + break; + } + if end 
== 0 { + return None; + } + + let mut start = end; + while let Some((prev_start, ch)) = previous_char(text, start) { + if ch == '\n' || !is_token_char(ch) { + break; + } + start = prev_start; + } + + (start < end).then(|| Token { + start, + end, + text: text.get(start..end).unwrap_or_default(), + }) +} + +const fn is_inter_token_space(ch: char) -> bool { + matches!(ch, ' ' | '\t' | '\u{00a0}' | '\u{202f}') +} + +fn is_token_char(ch: char) -> bool { + ch.is_alphanumeric() || matches!(ch, '\'' | '’' | '.' | '&' | '-') +} + +fn is_acceptable_token(token: &str, data: &PreparedLegalFormData) -> bool { + token.chars().next().is_some_and(|ch| { + ch.is_uppercase() || ch.is_lowercase() || ch.is_ascii_digit() + }) || data.connector_words.contains(&token.to_lowercase()) +} + +fn starts_upper(text: &str) -> bool { + text.chars().next().is_some_and(char::is_uppercase) +} + +fn starts_lower(text: &str) -> bool { + text.chars().next().is_some_and(char::is_lowercase) +} + +fn starts_with_list_separator(text: &str) -> bool { + text + .chars() + .next() + .is_some_and(|ch| matches!(ch, ',' | ';')) +} + +fn normalize_suffix_token(text: &str) -> String { + text + .chars() + .filter(|ch| { + !matches!(ch, '.' 
| ',' | ' ' | '\t' | '\u{00a0}' | '\u{202f}') + }) + .collect::() + .to_lowercase() +} + +fn is_legal_form_suffix_word(word: &str, data: &PreparedLegalFormData) -> bool { + let normalized = normalize_suffix_token(word); + !normalized.is_empty() && data.normalized_suffix_words.contains(&normalized) +} + +fn is_known_boundary_suffix(word: &str, data: &PreparedLegalFormData) -> bool { + let normalized = normalize_suffix_token(word); + !normalized.is_empty() + && data.normalized_boundary_suffixes.contains(&normalized) +} + +fn is_in_name_legal_form_word( + word: &str, + data: &PreparedLegalFormData, +) -> bool { + let normalized = normalize_suffix_token(word); + !normalized.is_empty() && data.normalized_in_name_words.contains(&normalized) +} + +fn count_upper_before(text: &str, pos: usize) -> usize { + let mut scan = pos; + let mut count = 0_usize; + while let Some(token) = token_before(text, scan) { + if !starts_upper(token.text) { + break; + } + count = count.saturating_add(1); + scan = token.start; + } + count +} + +fn has_middle_initial_before(text: &str, pos: usize) -> bool { + let start = pos.saturating_sub(MAX_NAME_LOOKBACK); + let Some(slice) = text.get(start..pos) else { + return false; + }; + let trimmed = slice.trim_end_matches(is_inter_token_space); + let Some(last_word) = trailing_word(trimmed) else { + return false; + }; + let before_word = trimmed.get(..last_word.start).unwrap_or_default(); + let before_word = before_word.trim_end_matches(is_inter_token_space); + let Some((dot_start, '.')) = previous_char(before_word, before_word.len()) + else { + return false; + }; + previous_char(before_word, dot_start).is_some_and(|(_, ch)| ch.is_uppercase()) +} + +fn trailing_word(text: &str) -> Option> { + let mut end = text.len(); + while let Some((prev_start, ch)) = previous_char(text, end) { + if ch.is_alphabetic() || matches!(ch, '\'' | '’') { + break; + } + end = prev_start; + } + let mut start = end; + while let Some((prev_start, ch)) = previous_char(text, 
start) { + if !(ch.is_alphabetic() || matches!(ch, '\'' | '’')) { + break; + } + start = prev_start; + } + (start < end).then(|| Token { + start, + end, + text: text.get(start..end).unwrap_or_default(), + }) +} + +fn crosses_sentence_end(text: &str, start: usize, suffix_start: usize) -> bool { + let Some(slice) = text.get(start..suffix_start) else { + return false; + }; + let mut previous = None::; + let mut lowercase_run = 0_usize; + let mut uppercase_run = 0_usize; + + for ch in slice.chars() { + if ch.is_uppercase() { + uppercase_run = uppercase_run.saturating_add(1); + lowercase_run = 0; + previous = Some(ch); + continue; + } + if ch.is_lowercase() { + if previous.is_some_and(char::is_uppercase) || lowercase_run > 0 { + lowercase_run = lowercase_run.saturating_add(1); + } + uppercase_run = 0; + previous = Some(ch); + continue; + } + if ch == '.' { + previous = Some(ch); + continue; + } + if ch.is_whitespace() && previous == Some('.') { + if lowercase_run >= 2 || uppercase_run >= 2 { + return true; + } + lowercase_run = 0; + uppercase_run = 0; + } + previous = Some(ch); + } + + false +} + +fn trim_to_first_cap_after_verb( + text: &str, + candidate_start: usize, + suffix_start: usize, + data: &PreparedLegalFormData, +) -> usize { + if candidate_start >= suffix_start { + return candidate_start; + } + let mut last_verb_end = None::; + for token in word_tokens(text, candidate_start, suffix_start) { + if starts_lower(token.text) + && data + .sentence_verb_indicators + .contains(&token.text.to_lowercase()) + { + last_verb_end = Some(token.end); + } + } + + let Some(scan_start) = last_verb_end else { + return candidate_start; + }; + for token in word_tokens(text, scan_start, suffix_start) { + if !starts_upper(token.text) { + continue; + } + let lower = token.text.to_lowercase(); + if data.role_heads.contains(&lower) + || data.clause_noun_heads.contains(&lower) + { + continue; + } + return token.start; + } + + suffix_start +} + +fn word_tokens(text: &str, start: usize, 
end: usize) -> Vec> { + let mut tokens = Vec::new(); + let mut cursor = start; + while cursor < end { + let Some((ch_start, ch)) = next_char(text, cursor) else { + break; + }; + if !is_word_token_char(ch) { + cursor = ch_start.saturating_add(ch.len_utf8()); + continue; + } + + let token_start = ch_start; + let mut token_end = ch_start.saturating_add(ch.len_utf8()); + while token_end < end { + let Some((next_start, next)) = next_char(text, token_end) else { + break; + }; + if !is_word_token_char(next) { + break; + } + token_end = next_start.saturating_add(next.len_utf8()); + } + if let Some(token_text) = text.get(token_start..token_end) { + tokens.push(Token { + start: token_start, + end: token_end, + text: token_text, + }); + } + cursor = token_end; + } + tokens +} + +fn is_word_token_char(ch: char) -> bool { + ch.is_alphanumeric() || matches!(ch, '\'' | '’' | '-') +} + +fn drop_overlapping(candidates: Vec) -> Vec { + let mut sorted = candidates; + sorted.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| right.end.cmp(&left.end)) + }); + + let mut out = Vec::::new(); + for candidate in sorted { + if out.last().is_some_and(|last| { + candidate.start >= last.start && candidate.end <= last.end + }) { + continue; + } + out.push(candidate); + } + out +} + +fn process_candidate( + results: &mut Vec, + full_text: &str, + candidate: &Candidate, + data: &PreparedLegalFormData, +) { + let Some(raw_text) = full_text.get(candidate.start..candidate.end) else { + return; + }; + let processed_end = candidate.start.saturating_add(trim_end_byte(raw_text)); + if processed_end <= candidate.start { + return; + } + let Some(text) = full_text.get(candidate.start..processed_end) else { + return; + }; + if text.len() < 5 { + return; + } + + let mut processed_start = candidate.start; + let mut processed_text = text; + if is_structural_single_cap_match(processed_text, data) + || is_bare_single_cap_structural_inner_match( + full_text, + candidate.start, + 
processed_text, + data, + ) + { + return; + } + + let role_trimmed = if let Some(trimmed) = + trim_role_head(full_text, processed_start, processed_text, data) + { + let Some(next_text) = full_text.get(trimmed.start..processed_end) else { + return; + }; + processed_start = trimmed.start; + processed_text = next_text; + true + } else { + false + }; + + if processed_text.contains('\n') && has_disallowed_line_break(processed_text) + { + return; + } + + let (entity_start, entity_text) = candidate_entity_span( + full_text, + candidate, + processed_start, + processed_end, + processed_text, + role_trimmed, + data, + ); + emit_candidate_segments( + results, + candidate, + text, + entity_start, + entity_text, + data, + ); +} + +fn candidate_entity_span<'a>( + full_text: &'a str, + candidate: &Candidate, + processed_start: usize, + processed_end: usize, + processed_text: &'a str, + role_trimmed: bool, + data: &PreparedLegalFormData, +) -> (usize, &'a str) { + if candidate.trimmed + || role_trimmed + || is_bare_single_cap_legal_form(processed_text) + { + return (processed_start, processed_text); + } + + let extended = extend_backward(full_text, processed_start, data, false); + if extended < processed_start + && let Some(extended_text) = full_text.get(extended..processed_end) + { + return (extended, extended_text.trim_end()); + } + + (processed_start, processed_text) +} + +fn emit_candidate_segments( + results: &mut Vec, + candidate: &Candidate, + original_text: &str, + entity_start: usize, + entity_text: &str, + data: &PreparedLegalFormData, +) { + for segment in split_embedded_legal_form_list(entity_start, entity_text, data) + { + let (mut segment_start, mut segment_text) = + trim_embedded_legal_form_list_prefix(segment.start, segment.text, data); + let leading = trim_leading_clause(segment_text, data); + if leading.offset > 0 + && let Some(trimmed) = segment_text.get(leading.offset..) 
+ { + segment_start = segment_start.saturating_add(leading.offset); + segment_text = trimmed.trim_start(); + segment_start = segment_start.saturating_add(leading_ws_len(trimmed)); + } + + if segment_text.contains('\n') && has_disallowed_line_break(segment_text) { + continue; + } + + let mut emit_start = segment_start; + let mut emit_text = segment_text; + let prefix = prefix_info(emit_text); + let all_caps_match = + prefix.part.len() > 2 && prefix.part == prefix.part.to_uppercase(); + if all_caps_match { + let word_count = if prefix.end > 0 { + emit_text + .get(..prefix.end) + .unwrap_or_default() + .split_whitespace() + .count() + } else { + emit_text.split_whitespace().count() + }; + if word_count > 3 { + emit_start = candidate.start; + emit_text = original_text; + } + } + + if has_roman_numeral_suffix(emit_text) { + continue; + } + if short_ascii_suffix_collides_with_non_ascii_prefix(emit_text) { + continue; + } + + let end = emit_start.saturating_add(emit_text.len()); + results.push(PipelineEntity::detected( + u32::try_from(emit_start).unwrap_or(u32::MAX), + u32::try_from(end).unwrap_or(u32::MAX), + "organization", + emit_text, + LEGAL_FORM_SCORE, + DetectionSource::LegalForm, + )); + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct TrimmedStart { + start: usize, +} + +fn trim_role_head( + full_text: &str, + match_start: usize, + text: &str, + data: &PreparedLegalFormData, +) -> Option { + let first = first_role_word(text)?; + let first_lower = first.text.to_lowercase(); + let first_leading = first + .text + .split('-') + .next() + .unwrap_or_default() + .to_lowercase(); + if !data.role_heads.contains(&first_lower) + && !data.role_heads.contains(&first_leading) + { + return None; + } + + let suffix_offset = suffix_offset_in_text(text, data)?; + let mid_start = first.end; + if mid_start >= suffix_offset { + return None; + } + let mid = text.get(mid_start..suffix_offset).unwrap_or_default(); + let mut last_verb_end = None::; + for token in 
word_tokens(text, mid_start, suffix_offset) { + if data + .sentence_verb_indicators + .contains(&token.text.to_lowercase()) + { + last_verb_end = Some(token.end); + } + } + let digit_after_role = mid + .trim_start() + .chars() + .next() + .is_some_and(|ch| ch.is_ascii_digit()); + let appositive_role_head = !digit_after_role + && last_verb_end.is_none() + && preceding_word_is_sentence_verb(full_text, match_start, data); + + if last_verb_end.is_none() && !digit_after_role && !appositive_role_head { + return None; + } + + let scan_start = last_verb_end.unwrap_or(mid_start); + for token in word_tokens(text, scan_start, suffix_offset) { + if !starts_upper(token.text) { + continue; + } + let lower = token.text.to_lowercase(); + if data.role_heads.contains(&lower) + || data.clause_noun_heads.contains(&lower) + { + continue; + } + return Some(TrimmedStart { + start: match_start.saturating_add(token.start), + }); + } + + Some(TrimmedStart { + start: match_start.saturating_add(suffix_offset), + }) +} + +fn first_role_word(text: &str) -> Option> { + let mut end = 0_usize; + let mut saw = false; + let mut previous_was_hyphen = false; + while let Some((start, ch)) = next_char(text, end) { + if ch.is_alphabetic() { + saw = true; + previous_was_hyphen = false; + end = start.saturating_add(ch.len_utf8()); + continue; + } + if ch == '-' && saw { + previous_was_hyphen = true; + end = start.saturating_add(ch.len_utf8()); + continue; + } + if previous_was_hyphen { + end = start.saturating_sub('-'.len_utf8()); + } + break; + } + (saw && end > 0).then(|| Token { + start: 0, + end, + text: text.get(..end).unwrap_or_default(), + }) +} + +fn suffix_offset_in_text( + text: &str, + data: &PreparedLegalFormData, +) -> Option { + for suffix in &data.suffixes { + let Some(offset) = text.rfind(suffix) else { + continue; + }; + if offset.saturating_add(suffix.len()) >= text.len().saturating_sub(1) { + return Some(offset); + } + } + None +} + +fn preceding_word_is_sentence_verb( + full_text: &str, 
+ match_start: usize, + data: &PreparedLegalFormData, +) -> bool { + let window_start = match_start.saturating_sub(40); + let Some(before) = full_text.get(window_start..match_start) else { + return false; + }; + trailing_word(before).is_some_and(|word| { + data + .sentence_verb_indicators + .contains(&word.text.to_lowercase()) + }) +} + +fn is_structural_single_cap_match( + text: &str, + data: &PreparedLegalFormData, +) -> bool { + let mut tokens = text.split_whitespace(); + let Some(first) = tokens.next() else { + return false; + }; + let Some(second) = tokens.next() else { + return false; + }; + data + .structural_single_cap_prefixes + .contains(&first.to_lowercase()) + && is_single_cap_token(second.trim_matches(',')) +} + +fn is_bare_single_cap_structural_inner_match( + full_text: &str, + match_start: usize, + text: &str, + data: &PreparedLegalFormData, +) -> bool { + if !is_bare_single_cap_legal_form(text) { + return false; + } + token_before(full_text, match_start).is_some_and(|token| { + data + .structural_single_cap_prefixes + .contains(&token.text.to_lowercase()) + }) +} + +fn is_bare_single_cap_legal_form(text: &str) -> bool { + let Some(first) = text.chars().next() else { + return false; + }; + if !first.is_uppercase() { + return false; + } + let after_first = text.get(first.len_utf8()..).unwrap_or_default(); + after_first + .chars() + .next() + .is_some_and(|ch| is_inter_token_space(ch) || ch == ',') +} + +fn is_single_cap_token(text: &str) -> bool { + let mut chars = text.chars(); + let Some(first) = chars.next() else { + return false; + }; + first.is_uppercase() && chars.next().is_none() +} + +fn has_disallowed_line_break(text: &str) -> bool { + let mut search_start = 0_usize; + while let Some(relative) = + text.get(search_start..).and_then(|tail| tail.find('\n')) + { + let index = search_start.saturating_add(relative); + let before = text.get(..index).unwrap_or_default(); + let after = text.get(index.saturating_add(1)..).unwrap_or_default(); + let 
dotted_designator_before = + before.trim_end_matches(is_inter_token_space).ends_with('.'); + let after_trimmed = after.trim_matches(is_inter_token_space); + let legal_suffix_after = is_dotted_upper_suffix(after_trimmed); + let all_caps_suffix_after = after_trimmed + .trim_end_matches('.') + .chars() + .all(char::is_uppercase) + && after_trimmed.chars().any(char::is_uppercase); + if !dotted_designator_before + || (!legal_suffix_after && !all_caps_suffix_after) + { + return true; + } + search_start = index.saturating_add(1); + } + false +} + +fn is_dotted_upper_suffix(text: &str) -> bool { + let mut saw_upper = false; + for part in text.split('.') { + if part.is_empty() { + continue; + } + if !part.chars().all(char::is_uppercase) { + return false; + } + saw_upper = true; + } + saw_upper +} + +fn extend_backward( + full_text: &str, + match_start: usize, + data: &PreparedLegalFormData, + force_suffix_mode: bool, +) -> usize { + let head_word = leading_entity_word(full_text, match_start); + let suffix_mode = force_suffix_mode + || head_word.as_ref().is_some_and(|word| { + data.company_suffix_words.contains(&word.to_lowercase()) + }); + let mut pos = match_start; + + while let Some(found) = simple_word_before(full_text, pos) { + let word = found.text; + let lower = word.to_lowercase(); + let is_upper = starts_upper(word); + let is_connector = data.connector_words.contains(&lower); + let is_in_name_prep = + suffix_mode && data.in_name_prepositions.contains(&lower); + + if is_upper { + pos = found.start; + continue; + } + + if is_connector { + let Some(previous) = simple_word_before(full_text, found.start) else { + break; + }; + if !starts_upper(previous.text) + || is_known_boundary_suffix(previous.text, data) + { + break; + } + if data.and_connector_words.contains(&lower) { + let upper_before = + count_upper_words_before(full_text, found.start, suffix_mode, data); + let middle_initial = has_middle_initial_before(full_text, found.start); + if upper_before <= 1 + && (data + 
.clause_noun_heads + .contains(&previous.text.to_lowercase()) + || data + .connector_prose_heads + .contains(&previous.text.to_lowercase())) + { + break; + } + let person_name_boundary = if suffix_mode { + middle_initial && has_single_cap_prefix_before(full_text, match_start) + } else { + (upper_before == 2 + && !is_in_name_legal_form_word(previous.text, data)) + || middle_initial + }; + if person_name_boundary { + break; + } + } + pos = previous.start; + continue; + } + + if is_in_name_prep { + let Some(previous) = simple_word_before(full_text, found.start) else { + break; + }; + if !starts_upper(previous.text) { + break; + } + pos = previous.start; + continue; + } + + break; + } + + skip_initials_backward(full_text, pos) +} + +fn simple_word_before(text: &str, pos: usize) -> Option> { + let mut end = pos; + while let Some((prev_start, ch)) = previous_char(text, end) { + if ch == '\n' { + return None; + } + if ch.is_whitespace() { + end = prev_start; + continue; + } + break; + } + + let mut start = end; + while let Some((prev_start, ch)) = previous_char(text, start) { + if !(ch.is_alphabetic() || ch == '&') { + break; + } + start = prev_start; + } + + (start < end).then(|| Token { + start, + end, + text: text.get(start..end).unwrap_or_default(), + }) +} + +fn leading_entity_word(text: &str, start: usize) -> Option { + let mut end = start; + while let Some((ch_start, ch)) = next_char(text, end) { + if !(ch.is_alphabetic() || ch == '&') { + break; + } + end = ch_start.saturating_add(ch.len_utf8()); + } + (end > start).then(|| text.get(start..end).unwrap_or_default().to_owned()) +} + +fn count_upper_words_before( + full_text: &str, + pos: usize, + cross_in_name_preps: bool, + data: &PreparedLegalFormData, +) -> usize { + let mut count = 0_usize; + let mut scan = pos; + while scan > 0 { + let Some(found) = simple_word_before(full_text, scan) else { + break; + }; + if starts_upper(found.text) { + count = count.saturating_add(1); + scan = found.start; + continue; + } + 
if cross_in_name_preps + && data + .in_name_prepositions + .contains(&found.text.to_lowercase()) + { + let Some(previous) = simple_word_before(full_text, found.start) else { + break; + }; + if !starts_upper(previous.text) { + break; + } + scan = found.start; + continue; + } + break; + } + count +} + +fn has_single_cap_prefix_before(full_text: &str, match_start: usize) -> bool { + simple_word_before(full_text, match_start) + .is_some_and(|word| is_single_cap_token(word.text)) +} + +fn skip_initials_backward(full_text: &str, pos: usize) -> usize { + let mut scan = pos; + while let Some((prev_start, ch)) = previous_char(full_text, scan) { + if ch == '\n' || !ch.is_whitespace() { + break; + } + scan = prev_start; + } + let Some((dot_start, '.')) = previous_char(full_text, scan) else { + return pos; + }; + + let mut cursor = dot_start; + let mut start = dot_start; + let mut saw_two = false; + while let Some((letter_start, letter)) = previous_char(full_text, cursor) { + if !letter.is_uppercase() { + break; + } + start = letter_start; + let before_letter = previous_char(full_text, letter_start); + match before_letter { + Some((space_start, ch)) if is_inter_token_space(ch) => { + cursor = space_start; + } + Some((prev_dot_start, '.')) => { + saw_two = true; + cursor = prev_dot_start; + } + _ => break, + } + } + + if saw_two + && previous_char(full_text, start) + .is_none_or(|(_, ch)| !ch.is_alphanumeric()) + { + return start; + } + pos +} + +#[derive(Clone, Copy, Debug)] +struct Segment<'a> { + start: usize, + text: &'a str, +} + +fn split_embedded_legal_form_list<'a>( + entity_start: usize, + entity_text: &'a str, + data: &PreparedLegalFormData, +) -> Vec> { + let mut cuts = vec![0_usize]; + for suffix in &data.suffixes { + if is_roman_numeral(&clean_suffix(suffix)) { + continue; + } + let mut search_from = 0_usize; + while let Some(relative) = entity_text + .get(search_from..) 
+ .and_then(|tail| tail.find(suffix)) + { + let suffix_start = search_from.saturating_add(relative); + let suffix_end = suffix_start.saturating_add(suffix.len()); + search_from = suffix_end; + if suffix_end >= entity_text.len().saturating_sub(1) { + continue; + } + let Some(after) = entity_text.get(suffix_end..) else { + continue; + }; + let boundary_len = legal_list_boundary_len(after); + if boundary_len > 0 { + cuts.push(suffix_end.saturating_add(boundary_len)); + } + } + } + + cuts.sort_unstable(); + cuts.dedup(); + if cuts.len() == 1 { + return vec![Segment { + start: entity_start, + text: entity_text, + }]; + } + + let mut segments = Vec::new(); + for (index, start) in cuts.iter().enumerate() { + let end = cuts + .get(index.saturating_add(1)) + .copied() + .unwrap_or(entity_text.len()); + if *start >= end { + continue; + } + let Some(segment) = entity_text.get(*start..end) else { + continue; + }; + let trimmed = segment.trim_end_matches(|ch: char| { + ch.is_whitespace() || matches!(ch, ',' | ';') + }); + if trimmed.is_empty() || !ends_with_legal_suffix(trimmed, data) { + continue; + } + segments.push(Segment { + start: entity_start.saturating_add(*start), + text: trimmed, + }); + } + + segments +} + +fn legal_list_boundary_len(text: &str) -> usize { + let mut chars = text.char_indices(); + let Some((_, first)) = chars.next() else { + return 0; + }; + if !matches!(first, ',' | ';') { + return 0; + } + let mut end = first.len_utf8(); + let mut saw_space = false; + for (index, ch) in chars { + if ch.is_whitespace() { + saw_space = true; + end = index.saturating_add(ch.len_utf8()); + continue; + } + if saw_space && (ch.is_uppercase() || ch == '.') { + return end; + } + return 0; + } + 0 +} + +fn ends_with_legal_suffix(text: &str, data: &PreparedLegalFormData) -> bool { + data.suffixes.iter().any(|suffix| text.ends_with(suffix)) +} + +fn trim_embedded_legal_form_list_prefix<'a>( + entity_start: usize, + entity_text: &'a str, + data: &PreparedLegalFormData, +) -> 
(usize, &'a str) { + let mut cut = 0_usize; + for suffix in &data.suffixes { + if is_roman_numeral(&clean_suffix(suffix)) { + continue; + } + let mut search_from = 0_usize; + while let Some(relative) = entity_text + .get(search_from..) + .and_then(|tail| tail.find(suffix)) + { + let suffix_start = search_from.saturating_add(relative); + let suffix_end = suffix_start.saturating_add(suffix.len()); + search_from = suffix_end; + if suffix_end >= entity_text.len().saturating_sub(1) { + continue; + } + let Some(after) = entity_text.get(suffix_end..) else { + continue; + }; + let boundary_len = comma_upper_boundary_len(after); + if boundary_len == 0 { + continue; + } + let next_start = suffix_end.saturating_add(boundary_len); + if entity_text + .get(next_start..) + .is_some_and(|remainder| ends_with_legal_suffix(remainder, data)) + { + cut = cut.max(next_start); + } + } + } + + if cut == 0 { + return (entity_start, entity_text); + } + ( + entity_start.saturating_add(cut), + entity_text.get(cut..).unwrap_or_default(), + ) +} + +fn comma_upper_boundary_len(text: &str) -> usize { + let Some(stripped) = text.strip_prefix(',') else { + return 0; + }; + let ws_len = leading_ws_len(stripped); + if ws_len == 0 { + return 0; + } + let after_ws = stripped.get(ws_len..).unwrap_or_default(); + if after_ws.chars().next().is_some_and(char::is_uppercase) { + return ','.len_utf8().saturating_add(ws_len); + } + 0 +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct LeadingTrim { + offset: usize, +} + +fn trim_leading_clause( + text: &str, + data: &PreparedLegalFormData, +) -> LeadingTrim { + let lower = text.to_lowercase(); + let mut cut = 0_usize; + + for phrase in &data.leading_clause_phrases { + let mut search_from = 0_usize; + while let Some(relative) = + lower.get(search_from..).and_then(|tail| tail.find(phrase)) + { + let start = search_from.saturating_add(relative); + let end = start.saturating_add(phrase.len()); + search_from = end; + let before_ok = start == 0 + || lower + 
.get(..start) + .and_then(|prefix| prefix.chars().next_back()) + .is_some_and(char::is_whitespace); + let after_ws = lower.get(end..).map(leading_ws_len).unwrap_or_default(); + if before_ok && after_ws > 0 { + cut = cut.max(end.saturating_add(after_ws)); + } + } + } + + for prefix in &data.leading_clause_direct_prefixes { + let mut search_from = 0_usize; + while let Some(relative) = lower + .get(search_from..) + .and_then(|tail| find_word_at_boundary(tail, prefix)) + { + let start = search_from.saturating_add(relative); + let end = start.saturating_add(prefix.len()); + search_from = end; + let after_ws = lower.get(end..).map(leading_ws_len).unwrap_or_default(); + let after = lower + .get(end.saturating_add(after_ws)..) + .and_then(|suffix| suffix.chars().next()); + if after_ws == 0 || !after.is_some_and(char::is_uppercase) { + continue; + } + + let before = text.get(..start).unwrap_or_default(); + let prefix_lower = prefix.to_lowercase(); + if data.comma_gated_direct_prefixes.contains(&prefix_lower) { + let has_comma = before.trim_end().ends_with(','); + let has_sentence_verb = + word_tokens(before, 0, before.len()).iter().any(|word| { + starts_lower(word.text) + && data + .sentence_verb_indicators + .contains(&word.text.to_lowercase()) + }); + if !has_comma && !has_sentence_verb { + continue; + } + } + + let words = word_tokens(before, 0, before.len()); + let has_prose_prefix = + words.len() >= 3 && words.iter().any(|word| starts_lower(word.text)); + if has_prose_prefix { + cut = cut.max(end.saturating_add(after_ws)); + } + } + } + + for (comma, _) in text.match_indices(',') { + let before = text.get(..comma).unwrap_or_default(); + if !before.chars().any(|ch| ch.is_ascii_digit()) { + continue; + } + let after = text.get(comma.saturating_add(1)..).unwrap_or_default(); + let ws = leading_ws_len(after); + let candidate = after.get(ws..).unwrap_or_default(); + let upper_words = word_tokens(candidate, 0, candidate.len()) + .into_iter() + .filter(|word| 
starts_upper(word.text)) + .count(); + if upper_words >= 3 { + cut = cut.max(comma.saturating_add(1).saturating_add(ws)); + } + } + + LeadingTrim { offset: cut } +} + +fn find_word_at_boundary(haystack: &str, needle: &str) -> Option { + let mut from = 0_usize; + while let Some(relative) = haystack.get(from..)?.find(needle) { + let start = from.saturating_add(relative); + let end = start.saturating_add(needle.len()); + let left_ok = previous_char(haystack, start) + .is_none_or(|(_, ch)| !ch.is_alphanumeric()); + let right_ok = haystack + .get(end..) + .and_then(|suffix| suffix.chars().next()) + .is_none_or(|ch| !ch.is_alphanumeric()); + if left_ok && right_ok { + return Some(start); + } + from = end; + } + None +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct PrefixInfo { + end: usize, + part: String, +} + +fn prefix_info(text: &str) -> PrefixInfo { + let end = text.rfind(',').or_else(|| text.rfind(' ')).unwrap_or(0); + let source = if end > 0 { + text.get(..end).unwrap_or_default() + } else { + text + }; + PrefixInfo { + end, + part: source.chars().filter(|ch| ch.is_alphabetic()).collect(), + } +} + +fn has_roman_numeral_suffix(text: &str) -> bool { + let separator = last_suffix_separator(text); + let raw_suffix = separator + .and_then(|index| text.get(index.saturating_add(1)..)) + .unwrap_or_default(); + let suffix = clean_suffix(raw_suffix); + !suffix.is_empty() && is_roman_numeral(&suffix) +} + +fn short_ascii_suffix_collides_with_non_ascii_prefix(text: &str) -> bool { + let separator = last_suffix_separator(text); + let raw_suffix = separator + .and_then(|index| text.get(index.saturating_add(1)..)) + .unwrap_or_default(); + let suffix = clean_suffix(raw_suffix); + if suffix.len() > 2 || raw_suffix.contains('.') { + return false; + } + let prefix = separator + .and_then(|index| text.get(..index)) + .unwrap_or(text) + .chars() + .filter(|ch| !matches!(ch, '\u{00a0}' | '\u{202f}')) + .collect::(); + !prefix.is_ascii() +} + +fn last_suffix_separator(text: &str) 
-> Option { + text + .char_indices() + .filter_map(|(index, ch)| { + matches!(ch, ' ' | '\t' | '\u{00a0}' | '\u{202f}' | ',').then_some(index) + }) + .next_back() +} + +fn clean_suffix(text: &str) -> String { + text.chars().filter(|ch| !matches!(ch, '.' | ',')).collect() +} + +fn is_roman_numeral(text: &str) -> bool { + if text.is_empty() + || !text.chars().next().is_some_and(|ch| { + ch == 'I' + || ch == 'V' + || ch == 'X' + || ch == 'L' + || ch == 'C' + || ch == 'D' + || ch == 'M' + }) + { + return false; + } + + let bytes = text.as_bytes(); + let mut index = 0_usize; + + let _ = take_repeated(bytes, &mut index, b'M', 3); + + if consume_pair(bytes, &mut index, b'C', b'M') + || consume_pair(bytes, &mut index, b'C', b'D') + { + } else { + let _ = consume(bytes, &mut index, b'D'); + let _ = take_repeated(bytes, &mut index, b'C', 3); + } + + if consume_pair(bytes, &mut index, b'X', b'C') + || consume_pair(bytes, &mut index, b'X', b'L') + { + } else { + let _ = consume(bytes, &mut index, b'L'); + let _ = take_repeated(bytes, &mut index, b'X', 3); + } + + if consume_pair(bytes, &mut index, b'I', b'X') + || consume_pair(bytes, &mut index, b'I', b'V') + { + } else { + let _ = consume(bytes, &mut index, b'V'); + let _ = take_repeated(bytes, &mut index, b'I', 3); + } + + index == bytes.len() +} + +fn take_repeated( + bytes: &[u8], + index: &mut usize, + target: u8, + max: usize, +) -> usize { + let mut count = 0_usize; + while count < max && bytes.get(*index) == Some(&target) { + *index = index.saturating_add(1); + count = count.saturating_add(1); + } + count +} + +fn consume_pair( + bytes: &[u8], + index: &mut usize, + first: u8, + second: u8, +) -> bool { + if bytes.get(*index) != Some(&first) + || bytes.get(index.saturating_add(1)) != Some(&second) + { + return false; + } + *index = index.saturating_add(2); + true +} + +fn consume(bytes: &[u8], index: &mut usize, target: u8) -> bool { + if bytes.get(*index) != Some(&target) { + return false; + } + *index = 
index.saturating_add(1); + true +} + +fn trim_end_byte(text: &str) -> usize { + text.trim_end().len() +} + +fn leading_ws_len(text: &str) -> usize { + let mut len = 0_usize; + for ch in text.chars() { + if !ch.is_whitespace() { + break; + } + len = len.saturating_add(ch.len_utf8()); + } + len +} + +fn previous_char(text: &str, pos: usize) -> Option<(usize, char)> { + text.get(..pos)?.char_indices().next_back() +} + +fn next_char(text: &str, pos: usize) -> Option<(usize, char)> { + text + .get(pos..)? + .char_indices() + .next() + .map(|(relative, ch)| (pos.saturating_add(relative), ch)) +} + +fn lower_set(values: Vec) -> BTreeSet { + values + .into_iter() + .filter(|value| !value.is_empty()) + .map(|value| value.to_lowercase()) + .collect() +} + +fn lower_vec(values: Vec) -> Vec { + values + .into_iter() + .filter(|value| !value.is_empty()) + .map(|value| value.to_lowercase()) + .collect() +} diff --git a/crates/anonymize-core/src/lib.rs b/crates/anonymize-core/src/lib.rs new file mode 100644 index 00000000..6bd01df2 --- /dev/null +++ b/crates/anonymize-core/src/lib.rs @@ -0,0 +1,77 @@ +#![allow(clippy::redundant_pub_crate)] + +//! Core anonymization contracts shared by host-language bindings. 
+ +mod address_context; +mod address_seeds; +mod anchored; +mod artifact_bytes; +pub(crate) mod byte_offsets; +mod coreference; +mod dates; +mod diagnostics; +mod false_positives; +mod hotwords; +mod legal_forms; +mod money; +mod name_corpus; +pub(crate) mod normalize; +mod placeholders; +mod prepared; +mod processors; +mod redact; +mod resolution; +mod search; +mod signatures; +mod triggers; +mod types; +mod validators; +mod zones; + +pub use address_context::AddressContextData; +pub use address_seeds::AddressSeedData; +pub use coreference::{CoreferenceData, CoreferencePatternData}; +pub use dates::DateData; +pub use diagnostics::{ + DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, + StaticRedactionDiagnostics, +}; +pub use hotwords::{HotwordRule, HotwordRuleData}; +pub use legal_forms::LegalFormData; +pub use money::{ + AmountWordsData, CurrencyData, MagnitudeSuffixData, MonetaryData, + ShareQuantityTermData, WrittenAmountPatternData, +}; +pub use name_corpus::{NameCorpusData, PreparedNameCorpusData}; +pub use normalize::normalize_for_search; +pub use placeholders::build_placeholder_map; +pub use prepared::{ + PreparedSearch, PreparedSearchArtifacts, PreparedSearchBuildResult, + PreparedSearchConfig, PreparedSearchMatches, PreparedSearchSlices, + StaticDetectionResult, StaticRedactionDiagnosticResult, + StaticRedactionResult, +}; +pub use processors::{ + CountryMatchData, DenyListFilterData, DenyListMatchData, GazetteerMatchData, + PatternSlice, RegexMatchMeta, SigningPlaceGuardData, StringGroups, + process_country_matches, process_deny_list_matches, + process_gazetteer_matches, process_regex_matches, +}; +pub use redact::{deanonymise, redact_text}; +pub use resolution::{ + DetectionSource, PipelineEntity, SourceDetail, enforce_boundary_consistency, + merge_and_dedup, sanitize_entities, +}; +pub use search::{ + FuzzySearchOptions, LiteralSearchOptions, RegexSearchOptions, SearchIndex, + SearchIndexArtifacts, SearchOptions, SearchPattern, +}; +pub use 
triggers::{ + TriggerData, TriggerRule, TriggerStrategy, TriggerValidation, +}; +pub use types::{ + Entity, EntityKind, Error, OperatorConfig, OperatorEntry, OperatorType, + PlaceholderEntry, PlaceholderMap, RedactionEntry, RedactionResult, Result, + SearchEngine, SearchMatch, +}; +pub use zones::{ZoneData, ZonePatternData, ZoneSigningClauseData}; diff --git a/crates/anonymize-core/src/money.rs b/crates/anonymize-core/src/money.rs new file mode 100644 index 00000000..3b7ba516 --- /dev/null +++ b/crates/anonymize-core/src/money.rs @@ -0,0 +1,872 @@ +use std::collections::BTreeSet; + +use crate::anchored::{ + AnchorSpan, AnchorTerm, AnchoredExtractor, AnchoredRule, +}; +use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; +use crate::types::Result; + +const MONEY_LABEL: &str = "monetary amount"; +const MONEY_SCORE: f64 = 0.9; +const MAX_LEFT_SCAN_BYTES: usize = 96; +const MAX_MONEY_NUMBER_SCAN_BYTES: usize = 48; +const MAX_UNGROUPED_MONEY_DIGITS: usize = 9; + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct MonetaryData { + pub currencies: CurrencyData, + pub amount_words: AmountWordsData, +} + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct CurrencyData { + pub codes: Vec, + pub symbols: Vec, + pub local_names: Vec, +} + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct AmountWordsData { + pub written_amount_patterns: Vec, + pub magnitude_suffixes: Vec, + pub share_quantity_terms: Vec, +} + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct WrittenAmountPatternData { + pub keywords: Vec, +} + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct MagnitudeSuffixData { + pub words: Vec, + pub abbreviations_case_insensitive: Vec, + pub abbreviations_case_sensitive: Vec, +} + 
+#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct ShareQuantityTermData { + pub modifiers: Vec, + pub nouns: Vec, +} + +pub(crate) struct PreparedMonetaryData { + extractor: AnchoredExtractor, +} + +impl PreparedMonetaryData { + pub(crate) fn new(data: MonetaryData) -> Result> { + AnchoredExtractor::new(MonetaryRule::new(data)) + .map(|extractor| extractor.map(|extractor| Self { extractor })) + } + + pub(crate) fn process(&self, full_text: &str) -> Result> { + self.extractor.extract(full_text) + } + + pub(crate) fn extend_entities( + &self, + full_text: &str, + entities: &[PipelineEntity], + ) -> Vec { + self.extractor.rule().extend_entities(full_text, entities) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum AnchorKind { + Code, + Symbol, + LocalName, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct MagnitudeTerm { + text: String, + folded: String, + case_insensitive: bool, +} + +struct MonetaryRule { + codes: BTreeSet, + symbols: BTreeSet, + local_names: Vec, + magnitudes: Vec, + quantity_followers: Vec, + written_amount_keywords: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct CurrencyName { + text: String, + folded: String, + case_insensitive: bool, + whole_words: bool, +} + +impl MonetaryRule { + fn new(data: MonetaryData) -> Self { + let codes = clean_terms(data.currencies.codes) + .into_iter() + .collect::>(); + let symbols = clean_terms(data.currencies.symbols) + .into_iter() + .collect::>(); + let mut local_names = clean_terms(data.currencies.local_names) + .into_iter() + .map(currency_name) + .collect::>(); + local_names.sort_by_key(|name| std::cmp::Reverse(name.text.len())); + let mut magnitudes = Vec::new(); + for entry in data.amount_words.magnitude_suffixes { + magnitudes.extend( + clean_terms(entry.words) + .into_iter() + .map(|text| magnitude_term(text, true)), + ); + magnitudes.extend( + clean_terms(entry.abbreviations_case_insensitive) + .into_iter() + .map(|text| 
magnitude_term(text, true)), + ); + magnitudes.extend( + clean_terms(entry.abbreviations_case_sensitive) + .into_iter() + .map(|text| magnitude_term(text, false)), + ); + } + magnitudes.sort_by_key(|term| std::cmp::Reverse(term.text.len())); + + let mut quantity_followers = Vec::new(); + for entry in data.amount_words.share_quantity_terms { + quantity_followers.extend(clean_terms(entry.modifiers)); + quantity_followers.extend(clean_terms(entry.nouns)); + } + quantity_followers.sort_by_key(|term| std::cmp::Reverse(term.len())); + + let mut written_amount_keywords = Vec::new(); + for entry in data.amount_words.written_amount_patterns { + written_amount_keywords.extend( + clean_terms(entry.keywords) + .into_iter() + .map(|term| term.to_lowercase()), + ); + } + written_amount_keywords.sort_by_key(|term| std::cmp::Reverse(term.len())); + + Self { + codes, + symbols, + local_names, + magnitudes, + quantity_followers, + written_amount_keywords, + } + } + + fn classify_anchor(&self, text: &str) -> Option { + if self.symbols.contains(text) { + return Some(AnchorKind::Symbol); + } + if self.codes.contains(text) { + return Some(AnchorKind::Code); + } + + let folded = text.to_lowercase(); + self.local_names.iter().find_map(|name| { + if name.case_insensitive && name.folded == folded { + return Some(AnchorKind::LocalName); + } + (!name.case_insensitive && name.text == text) + .then_some(AnchorKind::LocalName) + }) + } +} + +impl AnchoredRule for MonetaryRule { + fn anchor_terms(&self) -> Vec { + let mut anchors = Vec::new(); + anchors.extend( + self + .codes + .iter() + .cloned() + .map(AnchorTerm::word_case_sensitive), + ); + anchors.extend(self.symbols.iter().cloned().map(AnchorTerm::symbol)); + anchors.extend(self.local_names.iter().map(|name| { + AnchorTerm::new( + name.text.clone(), + name.case_insensitive, + name.whole_words, + ) + })); + anchors + } + + fn extract( + &self, + full_text: &str, + anchor: AnchorSpan, + ) -> Result> { + let Some(anchor_text) = 
str_slice(full_text, anchor.start, anchor.end) + else { + return Ok(Vec::new()); + }; + let Some(kind) = self.classify_anchor(anchor_text) else { + return Ok(Vec::new()); + }; + + let mut entities = Vec::new(); + if let Some((start, end)) = + self.leading_amount_span(full_text, anchor, kind) + && let Some(entity) = money_entity(full_text, start, end) + { + entities.push(entity); + } + if let Some((start, end)) = + self.trailing_amount_span(full_text, anchor, kind) + && let Some(entity) = money_entity(full_text, start, end) + { + entities.push(entity); + } + + Ok(entities) + } +} + +impl MonetaryRule { + fn extend_entities( + &self, + full_text: &str, + entities: &[PipelineEntity], + ) -> Vec { + let mut extended = Vec::with_capacity(entities.len()); + for entity in entities { + extended.push(self.extend_entity(full_text, entity)); + } + extended + } + + fn extend_entity( + &self, + full_text: &str, + entity: &PipelineEntity, + ) -> PipelineEntity { + if entity.label != MONEY_LABEL || caller_owned(entity) { + return entity.clone(); + } + + let mut next = entity.clone(); + let mut end = usize::try_from(next.end).unwrap_or(usize::MAX); + if !ends_with_letter(&next.text) + && let Some(currency_end) = self.trailing_currency_end(full_text, end) + { + end = currency_end; + } + end = self.extend_written_amount(full_text, end); + + let Ok(end_u32) = u32::try_from(end) else { + return next; + }; + if end_u32 == next.end { + return next; + } + + let Ok(start) = usize::try_from(next.start) else { + return next; + }; + let Some(text) = str_slice(full_text, start, end) else { + return next; + }; + next.end = end_u32; + text.clone_into(&mut next.text); + next + } + + fn trailing_currency_end(&self, text: &str, index: usize) -> Option { + let start = skip_trailing_currency_gap(text, index, 4); + + for name in &self.local_names { + let end = start.saturating_add(name.text.len()); + let Some(candidate) = str_slice(text, start, end) else { + continue; + }; + let matches = if 
name.case_insensitive {
+        candidate.to_lowercase() == name.folded
+      } else {
+        candidate == name.text
+      };
+      if matches && right_alnum_boundary(text, end) {
+        return Some(end);
+      }
+    }
+
+    for code in &self.codes {
+      let end = start.saturating_add(code.len());
+      let Some(candidate) = str_slice(text, start, end) else {
+        continue;
+      };
+      if candidate == code && right_alnum_boundary(text, end) {
+        return Some(end);
+      }
+    }
+
+    None
+  }
+
+  /// Finds the span of an amount written *after* its currency anchor.
+  ///
+  /// The anchor must pass `left_money_boundary`. Up to two horizontal
+  /// whitespace characters may separate the anchor from the number, and an
+  /// optional magnitude term may follow the number. The character after the
+  /// number (or magnitude) must satisfy `right_money_boundary`.
+  ///
+  /// Returns `(anchor.start, end)` in byte offsets, where `end` has been
+  /// passed through `extend_written_amount`, or `None` when no amount follows.
+  fn leading_amount_span(
+    &self,
+    text: &str,
+    anchor: AnchorSpan,
+    kind: AnchorKind,
+  ) -> Option<(usize, usize)> {
+    if !left_money_boundary(text, anchor.start, kind) {
+      return None;
+    }
+
+    let number_start = skip_horizontal_ws_limit(text, anchor.end, 2);
+    // Only one candidate position exists here, so `?` is the right exit.
+    let number = parse_number_forward(text, number_start)?;
+    let (end, _) = self
+      .parse_magnitude_forward(text, number.end)
+      .unwrap_or((number.end, false));
+    right_money_boundary(text, end)
+      .then(|| (anchor.start, self.extend_written_amount(text, end)))
+  }
+
+  /// Finds the span of an amount written *before* its currency anchor.
+  ///
+  /// The anchor must be followed by a `right_money_boundary`. Every ASCII
+  /// digit in the `MAX_LEFT_SCAN_BYTES` window to the left of the anchor is
+  /// tried as the start of a number. A candidate is accepted when the number,
+  /// an optional magnitude term and at most four horizontal whitespace
+  /// characters end exactly at `anchor.start`. The span is widened to include
+  /// a currency symbol directly in front of the number, and the leftmost
+  /// accepted start wins.
+  ///
+  /// Returns `(start, end)` in byte offsets, where `end` is `anchor.end`
+  /// passed through `extend_written_amount`, or `None` when nothing matches.
+  fn trailing_amount_span(
+    &self,
+    text: &str,
+    anchor: AnchorSpan,
+    kind: AnchorKind,
+  ) -> Option<(usize, usize)> {
+    if !right_money_boundary(text, anchor.end) {
+      return None;
+    }
+
+    let scan_start = char_boundary_before(
+      text,
+      anchor.start.saturating_sub(MAX_LEFT_SCAN_BYTES),
+    );
+    let window = str_slice(text, scan_start, anchor.start)?;
+    let mut best = None;
+    // Digits below this byte offset belong to a numeric token that was
+    // already rejected as a whole and must not be retried as a suffix.
+    let mut rejected_until = scan_start;
+
+    for (offset, ch) in window.char_indices() {
+      if !ch.is_ascii_digit() {
+        continue;
+      }
+      let number_start = scan_start.saturating_add(offset);
+      if number_start < rejected_until {
+        continue;
+      }
+      // A rejected number (for example an over-long ungrouped digit run such
+      // as an account id) only disqualifies this candidate. Returning from
+      // the whole function here would hide a valid amount that sits later in
+      // the window, directly in front of the anchor.
+      let Some(number) = parse_number_forward(text, number_start) else {
+        rejected_until = str_tail(text, number_start)
+          .into_iter()
+          .flat_map(str::chars)
+          .take_while(|&c| c.is_ascii_digit() || matches!(c, ',' | '.' | '\''))
+          .fold(number_start, |end, c| end.saturating_add(c.len_utf8()));
+        continue;
+      };
+      let (after_number, has_magnitude) = self
+        .parse_magnitude_forward(text, number.end)
+        .unwrap_or((number.end, false));
+      let after_gap = skip_horizontal_ws_limit(text, after_number, 4);
+      if after_gap != anchor.start {
+        continue;
+      }
+
+      let start = leading_symbol_start(text, number.start)
+        .filter(|value| left_money_boundary(text, *value, AnchorKind::Symbol))
+        .unwrap_or(number.start);
+      if !left_money_boundary(text, start, kind) {
+        continue;
+      }
+      // A magnitude followed by a quantity word after a non-symbol anchor is
+      // treated as a quantity, not as money.
+      if has_magnitude
+        && kind != AnchorKind::Symbol
+        && self.has_quantity_follower(text, anchor.end)
+      {
+        continue;
+      }
+      let end = self.extend_written_amount(text, anchor.end);
+      if best.is_none_or(|(best_start, _)| start < best_start) {
+        best = Some((start, end));
+      }
+    }
+
+    best
+  }
+
+  /// Matches a magnitude term after skipping up to eight horizontal
+  /// whitespace characters from `index`.
+  ///
+  /// Returns the byte offset just past the term together with `true`, or
+  /// `None` when no magnitude term starts there.
+  fn parse_magnitude_forward(
+    &self,
+    text: &str,
+    index: usize,
+  ) -> Option<(usize, bool)> {
+    let start = skip_horizontal_ws_limit(text, index, 8);
+    self.match_magnitude_at(text, start).map(|end| (end, true))
+  }
+
+  /// Returns the end offset of the first configured magnitude term that
+  /// starts exactly at `index` and is followed by a word boundary.
+  ///
+  /// Case-insensitive terms are compared through their pre-folded form.
+  fn match_magnitude_at(&self, text: &str, index: usize) -> Option<usize> {
+    for term in &self.magnitudes {
+      let end = index.saturating_add(term.text.len());
+      let Some(candidate) = str_slice(text, index, end) else {
+        continue;
+      };
+      let matches = if term.case_insensitive {
+        candidate.to_lowercase() == term.folded
+      } else {
+        candidate == term.text
+      };
+      if matches && right_word_boundary(text, end) {
+        return Some(end);
+      }
+    }
+    None
+  }
+
+  /// Reports whether a configured quantity term starts within sixteen
+  /// horizontal whitespace characters after `index` and ends on a word
+  /// boundary. The comparison ignores case.
+  fn has_quantity_follower(&self, text: &str, index: usize) -> bool {
+    let start = skip_horizontal_ws_limit(text, index, 16);
+    self.quantity_followers.iter().any(|term| {
+      let end = start.saturating_add(term.len());
+      str_slice(text, start, end).is_some_and(|candidate| {
+        // Fold both sides: the configured terms are stored as written, so a
+        // term containing an uppercase letter could otherwise never equal
+        // the lower-cased candidate.
+        right_word_boundary(text, end)
+          && candidate.to_lowercase() == term.to_lowercase()
+      })
+    })
+  }
+
+  /// Extends `index` over a parenthesised written-out amount when one
+  /// follows; returns `index` unchanged otherwise or when no written-amount
+  /// keywords are configured.
+  fn extend_written_amount(&self, text: &str, index: usize) -> usize {
+    if self.written_amount_keywords.is_empty() {
+      return index;
+    }
+
+    self.match_written_amount_at(text, index).unwrap_or(index)
+  }
+
+  fn match_written_amount_at(&self, text: &str, index: usize) -> Option<usize> {
+    let after = str_tail(text, index)?;
+    let mut cursor = 0usize;
+
+    if let Some(ch) = after.chars().next()
+      && matches!(ch, ',' | ';')
+    {
+      cursor = cursor.saturating_add(ch.len_utf8());
+    }
+
+    cursor = skip_horizontal_ws_limit(after, cursor, usize::MAX);
+    if after.get(cursor..)?.chars().next()?
!= '(' { + return None; + } + + cursor = cursor.saturating_add('('.len_utf8()); + let keyword_end = self.match_written_amount_keyword(after, cursor)?; + cursor = keyword_end; + let separator = after.get(cursor..)?.chars().next()?; + if separator == '\n' || separator == '\r' { + return None; + } + if separator != ':' && !separator.is_whitespace() { + return None; + } + cursor = cursor.saturating_add(separator.len_utf8()); + + let mut content_chars = 0usize; + for (offset, ch) in after.get(cursor..)?.char_indices() { + if ch == '\n' || ch == '\r' { + return None; + } + if ch == ')' { + if content_chars == 0 || content_chars > 120 { + return None; + } + return Some( + index + .saturating_add(cursor) + .saturating_add(offset) + .saturating_add(ch.len_utf8()), + ); + } + content_chars = content_chars.saturating_add(1); + if content_chars > 120 { + return None; + } + } + + None + } + + fn match_written_amount_keyword( + &self, + text: &str, + index: usize, + ) -> Option { + for keyword in &self.written_amount_keywords { + let end = index.saturating_add(keyword.len()); + let Some(candidate) = str_slice(text, index, end) else { + continue; + }; + if candidate.to_lowercase() == *keyword { + return Some(end); + } + } + None + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct NumberSpan { + start: usize, + end: usize, +} + +fn parse_number_forward(text: &str, index: usize) -> Option { + let mut digits = 0usize; + let mut end = index; + let mut value_end = index; + let mut current_group_digits = 0usize; + let mut first_component_digits = 0usize; + let mut has_separator = false; + let mut has_grouping_separator = false; + + for (offset, ch) in str_tail(text, index)?.char_indices() { + let char_start = index.saturating_add(offset); + if char_start.saturating_sub(index) > MAX_MONEY_NUMBER_SCAN_BYTES { + break; + } + + if ch.is_ascii_digit() { + digits = digits.saturating_add(1); + current_group_digits = current_group_digits.saturating_add(1); + end = 
char_start.saturating_add(ch.len_utf8()); + value_end = end; + continue; + } + + if is_dash(ch) && digits > 0 { + value_end = char_start.saturating_add(ch.len_utf8()); + break; + } + + if is_number_separator(ch) + && number_separator_continues( + text, + char_start.saturating_add(ch.len_utf8()), + ch, + ) + { + if !has_separator { + first_component_digits = current_group_digits; + } + let next_index = char_start.saturating_add(ch.len_utf8()); + let next_group_digits = digit_run_after_separator(text, next_index, ch); + if current_group_digits > 0 + && current_group_digits <= 3 + && next_group_digits == 3 + { + has_grouping_separator = true; + } + has_separator = true; + current_group_digits = 0; + end = char_start.saturating_add(ch.len_utf8()); + continue; + } + + break; + } + + if digits == 0 { + return None; + } + let leading_digits = if has_separator { + first_component_digits + } else { + digits + }; + if !has_grouping_separator && leading_digits > MAX_UNGROUPED_MONEY_DIGITS { + return None; + } + + Some(NumberSpan { + start: index, + end: value_end.max(end), + }) +} + +fn digit_run_after_separator( + text: &str, + index: usize, + separator: char, +) -> usize { + let mut count = 0usize; + let mut skipping_spaces = separator.is_whitespace(); + for ch in str_tail(text, index).into_iter().flat_map(str::chars) { + if skipping_spaces && ch.is_whitespace() && ch != '\n' && ch != '\r' { + continue; + } + skipping_spaces = false; + if !ch.is_ascii_digit() { + break; + } + count = count.saturating_add(1); + } + count +} + +fn number_separator_continues( + text: &str, + index: usize, + separator: char, +) -> bool { + let mut saw_space = false; + for ch in str_tail(text, index) + .into_iter() + .flat_map(str::chars) + .take(2) + { + if ch == '\n' || ch == '\r' { + return false; + } + if ch.is_whitespace() { + saw_space = true; + continue; + } + if separator.is_whitespace() { + return ch.is_ascii_digit(); + } + return (!saw_space && ch.is_ascii_digit()) || is_dash(ch); + } 
+  false
+}
+
+/// Builds a money entity for the byte range `start..end` of `full_text`.
+///
+/// Returns `None` when the range is not a valid slice of `full_text` or when
+/// either offset does not fit in `u32`. Offsets are never clamped: a clamped
+/// offset would yield an entity whose span no longer matches its text.
+fn money_entity(
+  full_text: &str,
+  start: usize,
+  end: usize,
+) -> Option<PipelineEntity> {
+  let text = str_slice(full_text, start, end)?;
+  let start_u32 = u32::try_from(start).ok()?;
+  let end_u32 = u32::try_from(end).ok()?;
+  Some(PipelineEntity::detected(
+    start_u32,
+    end_u32,
+    MONEY_LABEL,
+    text.to_owned(),
+    MONEY_SCORE,
+    DetectionSource::Regex,
+  ))
+}
+
+/// Returns the byte offset of a currency symbol that sits directly before
+/// `number_start`, allowing up to two horizontal whitespace characters in
+/// between. Returns `None` when the preceding character is not a currency
+/// symbol.
+fn leading_symbol_start(text: &str, number_start: usize) -> Option<usize> {
+  let before_number = skip_horizontal_ws_backward_limit(text, number_start, 2);
+  let (symbol_start, ch) = previous_char(text, before_number)?;
+  is_currency_symbol(ch).then_some(symbol_start)
+}
+
+/// Prepares a local currency name for matching.
+///
+/// Names made only of ASCII letters and whitespace with at least three
+/// characters are matched case-insensitively. `whole_words` is set when the
+/// name contains only alphanumeric and whitespace characters.
+fn currency_name(text: String) -> CurrencyName {
+  let case_insensitive = is_ascii_phrase(&text) && text.chars().count() >= 3;
+  let whole_words = text
+    .chars()
+    .all(|ch| ch.is_alphanumeric() || ch.is_whitespace());
+  CurrencyName {
+    // Fold before `text` is moved into the struct.
+    folded: text.to_lowercase(),
+    text,
+    case_insensitive,
+    whole_words,
+  }
+}
+
+/// Wraps a magnitude term together with its lower-cased form.
+fn magnitude_term(text: String, case_insensitive: bool) -> MagnitudeTerm {
+  MagnitudeTerm {
+    folded: text.to_lowercase(),
+    text,
+    case_insensitive,
+  }
+}
+
+/// Trims every value and drops the ones that are empty after trimming.
+fn clean_terms(values: Vec<String>) -> Vec<String> {
+  values
+    .into_iter()
+    .map(|value| value.trim().to_owned())
+    .filter(|value| !value.is_empty())
+    .collect()
+}
+
+/// Left-hand boundary check for a money span starting at `index`.
+///
+/// Symbol anchors always pass. Other anchors pass at the start of the text
+/// or when the preceding character is not an identifier character.
+fn left_money_boundary(text: &str, index: usize, kind: AnchorKind) -> bool {
+  if kind == AnchorKind::Symbol {
+    return true;
+  }
+  previous_char(text, index).is_none_or(|(_, ch)| !is_identifier_char(ch))
+}
+
+/// Right-hand boundary check for a money span ending at `index`: passes at
+/// the end of the text (or an invalid offset), before whitespace, or before
+/// one of the characters in `.,;!?)]}`.
+fn right_money_boundary(text: &str, index: usize) -> bool {
+  str_tail(text, index)
+    .and_then(|value| value.chars().next())
+    .is_none_or(|ch| ch.is_whitespace() || ".,;!?)]}".contains(ch))
+}
+
+/// Passes when no identifier character follows `index`.
+fn right_word_boundary(text: &str, index: usize) -> bool {
+  str_tail(text, index)
+    .and_then(|value| value.chars().next())
+    .is_none_or(|ch| !is_identifier_char(ch))
+}
+
+/// True when every character is an ASCII letter or whitespace. An empty
+/// string also returns `true`.
+fn is_ascii_phrase(text: &str) -> bool {
+  text
+    .chars()
+    .all(|ch| ch.is_ascii_alphabetic() || ch.is_whitespace())
+}
+
+/// True for `_` and for any alphanumeric character.
+fn is_identifier_char(ch: char) -> bool {
+  ch == '_' || ch.is_alphanumeric()
+}
+
+/// Passes when no alphanumeric character follows `index`. Unlike the
+/// identifier-based boundary, an underscore counts as a boundary here.
+fn right_alnum_boundary(text: &str, index: usize) -> bool {
+  str_tail(text, index)
+    .and_then(|value| value.chars().next())
+    .is_none_or(|ch| !ch.is_alphanumeric())
+}
+
+/// True when the last character of `text` is alphabetic; false for an empty
+/// string.
+fn ends_with_letter(text: &str) -> bool {
+  text.chars().next_back().is_some_and(char::is_alphabetic)
+}
+
+/// True when the entity's `source_detail` is `CustomDenyList` or
+/// `CustomRegex`.
+const fn caller_owned(entity: &PipelineEntity) -> bool {
+  matches!(
+    entity.source_detail,
+    Some(SourceDetail::CustomDenyList | SourceDetail::CustomRegex)
+  )
+}
+
+/// Characters accepted between digit groups: `,`, `.`, `'` and any
+/// whitespace other than line feed and carriage return.
+const fn is_number_separator(ch: char) -> bool {
+  ch == ','
+    || ch == '.'
+    || ch == '\''
+    || (ch.is_whitespace() && ch != '\n' && ch != '\r')
+}
+
+/// True for the ASCII hyphen-minus and the other dash-like characters
+/// listed below.
+const fn is_dash(ch: char) -> bool {
+  matches!(
+    ch,
+    '-'
+      | '‐'
+      | '‑'
+      | '‒'
+      | '–'
+      | '—'
+      | '―'
+      | '⸺'
+      | '⸻'
+      | '⁃'
+      | '־'
+      | '−'
+  )
+}
+
+/// True for the currency symbols listed below.
+const fn is_currency_symbol(ch: char) -> bool {
+  matches!(
+    ch,
+    '$'
+      | '£'
+      | '¥'
+      | '৳'
+      | '₡'
+      | '₦'
+      | '₩'
+      | '₪'
+      | '₫'
+      | '€'
+      | '₭'
+      | '₮'
+      | '₱'
+      | '₲'
+      | '₴'
+      | '₵'
+      | '₸'
+      | '₹'
+      | '₺'
+      | '₼'
+      | '₽'
+      | '₾'
+  )
+}
+
+/// Advances `index` over at most `max_chars` whitespace characters, stopping
+/// early at a line feed, a carriage return, a non-whitespace character, the
+/// end of the text, or an offset that is not a character boundary.
+///
+/// Returns the new byte offset.
+fn skip_horizontal_ws_limit(
+  text: &str,
+  mut index: usize,
+  max_chars: usize,
+) -> usize {
+  let mut skipped = 0usize;
+  while skipped < max_chars {
+    let Some(ch) = str_tail(text, index).and_then(|value| value.chars().next())
+    else {
+      break;
+    };
+    if ch == '\n' || ch == '\r' || !ch.is_whitespace() {
+      break;
+    }
+    index = index.saturating_add(ch.len_utf8());
+    skipped = skipped.saturating_add(1);
+  }
+  index
+}
+
+/// Advances `index` over at most `max_chars` whitespace characters, stopping
+/// early at a line feed, a tab, a non-whitespace character, or the end of
+/// the text.
+///
+/// NOTE(review): unlike `skip_horizontal_ws_limit`, this stops at `'\t'` and
+/// does not stop at `'\r'`, so a carriage return is skipped as part of the
+/// gap. Confirm that the difference is intended.
+fn skip_trailing_currency_gap(
+  text: &str,
+  mut index: usize,
+  max_chars: usize,
+) -> usize {
+  let mut skipped = 0usize;
+  while skipped < max_chars {
+    let Some(ch) = str_tail(text, index).and_then(|value| value.chars().next())
+    else {
+      break;
+    };
+    if ch == '\n' || ch == '\t' || !ch.is_whitespace() {
+      break;
+    }
+    index = index.saturating_add(ch.len_utf8());
+    skipped = skipped.saturating_add(1);
+  }
+  index
+}
+
+fn skip_horizontal_ws_backward_limit(
+  text:
&str, + mut index: usize, + max_chars: usize, +) -> usize { + let mut skipped = 0usize; + while skipped < max_chars { + let Some((char_start, ch)) = previous_char(text, index) else { + break; + }; + if ch == '\n' || ch == '\r' || !ch.is_whitespace() { + break; + } + index = char_start; + skipped = skipped.saturating_add(1); + } + index +} + +fn previous_char(text: &str, index: usize) -> Option<(usize, char)> { + str_head(text, index)?.char_indices().next_back() +} + +const fn char_boundary_before(text: &str, mut index: usize) -> usize { + while !text.is_char_boundary(index) { + index = index.saturating_sub(1); + } + index +} + +fn str_head(text: &str, index: usize) -> Option<&str> { + text.get(..index) +} + +fn str_tail(text: &str, index: usize) -> Option<&str> { + text.get(index..) +} + +fn str_slice(text: &str, start: usize, end: usize) -> Option<&str> { + text.get(start..end) +} diff --git a/crates/anonymize-core/src/name_corpus.rs b/crates/anonymize-core/src/name_corpus.rs new file mode 100644 index 00000000..27ef9ea3 --- /dev/null +++ b/crates/anonymize-core/src/name_corpus.rs @@ -0,0 +1,995 @@ +use std::collections::HashSet; + +use crate::resolution::{DetectionSource, PipelineEntity}; +use crate::types::{Error, Result}; + +const PERSON_LABEL: &str = "person"; +const CJK_HAN_RATIO_NUMERATOR: usize = 15; +const CJK_HAN_RATIO_DENOMINATOR: usize = 100; +const CJK_SCORE: f64 = 0.95; +const HIGH_CONFIDENCE_NAME_SCORE: f64 = 0.9; +const TITLE_NAME_SCORE: f64 = 0.95; +const LOW_CONFIDENCE_NAME_SCORE: f64 = 0.5; +const MAX_CHAIN: usize = 5; +const ALL_CAPS_NAME_LINE_RATIO: f64 = 0.9; +const ALL_CAPS_NAME_LINE_MIN_LETTERS: usize = 3; +const ALL_CAPS_NAME_LINE_MAX_TOKENS: usize = 6; +const MAX_HORIZONTAL_CHAIN_GAP: usize = 4; + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct NameCorpusData { + #[serde(default)] + pub first_names: Vec, + #[serde(default)] + pub surnames: Vec, + #[serde(default)] + pub title_tokens: 
Vec, + #[serde(default)] + pub title_abbreviations: Vec, + #[serde(default)] + pub excluded_words: Vec, + #[serde(default)] + pub common_words: Vec, + #[serde(default)] + pub non_western_names: Vec, + #[serde(default)] + pub excluded_all_caps: Vec, + #[serde(default)] + pub ja_suffixes: Vec, + #[serde(default)] + pub arabic_connectors: Vec, + #[serde(default)] + pub relation_connectors: Vec, + #[serde(default)] + pub hyphenated_prefixes: Vec, + #[serde(default)] + pub cjk_non_person_terms: Vec, + #[serde(default)] + pub cjk_surname_starters: Vec, + #[serde(default)] + pub organization_terms: Vec, +} + +#[derive(Clone, Debug)] +pub struct PreparedNameCorpusData { + first_names: HashSet, + surnames: HashSet, + title_tokens: HashSet, + title_abbreviations: HashSet, + excluded_words: HashSet, + common_words: HashSet, + non_western_names: HashSet, + excluded_all_caps: HashSet, + ja_suffixes: HashSet, + arabic_connectors: HashSet, + relation_connectors: HashSet, + hyphenated_prefixes: HashSet, + cjk_non_person_terms: HashSet, + cjk_surname_starters: HashSet, + organization_terms: HashSet, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum TokenKind { + Name, + Surname, + Title, + Abbreviation, + JaSuffix, + ArabicConnector, + Capitalized, + Other, +} + +#[derive(Clone, Debug)] +struct WordSegment<'a> { + text: &'a str, + start: usize, + end: usize, +} + +#[derive(Clone, Debug)] +struct ClassifiedToken<'a> { + text: &'a str, + kind: TokenKind, + start: usize, + end: usize, + non_western: bool, + title_abbreviation: bool, +} + +impl PreparedNameCorpusData { + #[must_use] + pub fn new(data: NameCorpusData) -> Self { + Self { + first_names: string_set(data.first_names), + surnames: string_set(data.surnames), + title_tokens: lower_string_set(data.title_tokens), + title_abbreviations: lower_string_set(data.title_abbreviations), + excluded_words: lower_string_set(data.excluded_words), + common_words: lower_string_set(data.common_words), + non_western_names: 
string_set(data.non_western_names), + excluded_all_caps: string_set(data.excluded_all_caps), + ja_suffixes: lower_string_set(data.ja_suffixes), + arabic_connectors: lower_string_set(data.arabic_connectors), + relation_connectors: lower_string_set(data.relation_connectors), + hyphenated_prefixes: lower_string_set(data.hyphenated_prefixes), + cjk_non_person_terms: string_set(data.cjk_non_person_terms), + cjk_surname_starters: data + .cjk_surname_starters + .into_iter() + .filter_map(|value| value.chars().next()) + .collect(), + organization_terms: lower_string_set(data.organization_terms), + } + } + + pub fn detect_supplemental( + &self, + full_text: &str, + deny_list_entities: &[PipelineEntity], + ) -> Result> { + let mut entities = self.detect_cjk_names(full_text)?; + entities.extend(self.detect_token_names(full_text)?); + let mut entities = deduplicate_spans(entities); + entities.retain(|entity| { + !deny_list_entities + .iter() + .any(|deny| covers_same_label(entity, deny)) + }); + Ok(entities) + } + + fn detect_cjk_names(&self, full_text: &str) -> Result> { + if self.cjk_surname_starters.is_empty() { + return Ok(Vec::new()); + } + + let text_len = full_text.chars().count(); + let threshold = + ceil_ratio(text_len, CJK_HAN_RATIO_NUMERATOR, CJK_HAN_RATIO_DENOMINATOR); + let threshold = threshold.max(1); + let mut han_count = 0usize; + for ch in full_text.chars() { + if is_han(ch) { + han_count = han_count.saturating_add(1); + if han_count >= threshold { + break; + } + } + } + if text_len >= 100 && han_count >= threshold { + return Ok(Vec::new()); + } + + let mut entities = Vec::new(); + let mut run_start = None; + let mut run_chars = 0usize; + let mut previous_end = 0usize; + for (index, ch) in full_text.char_indices() { + if is_han(ch) { + if run_start.is_none() { + run_start = Some(index); + } + run_chars = run_chars.saturating_add(1); + previous_end = index.saturating_add(ch.len_utf8()); + continue; + } + self.push_cjk_run( + full_text, + run_start, + 
previous_end, + run_chars, + &mut entities, + )?; + run_start = None; + run_chars = 0; + } + self.push_cjk_run( + full_text, + run_start, + full_text.len(), + run_chars, + &mut entities, + )?; + Ok(entities) + } + + fn push_cjk_run( + &self, + full_text: &str, + start: Option, + end: usize, + char_count: usize, + entities: &mut Vec, + ) -> Result<()> { + if !(2..=4).contains(&char_count) { + return Ok(()); + } + let Some(start) = start else { + return Ok(()); + }; + let Some(text) = full_text.get(start..end) else { + return Err(invalid_name_data("cjk span is not a UTF-8 boundary")); + }; + if !self.is_likely_cjk_person_name(text) || self.is_organization(text) { + return Ok(()); + } + entities.push(PipelineEntity::detected( + usize_to_u32(start, "name_corpus.cjk.start")?, + usize_to_u32(end, "name_corpus.cjk.end")?, + PERSON_LABEL, + text, + CJK_SCORE, + DetectionSource::Regex, + )); + Ok(()) + } + + fn detect_token_names(&self, full_text: &str) -> Result> { + let words = segment_words(full_text); + let mut tokens = Vec::with_capacity(words.len()); + let mut word_index = 0usize; + while let Some(word) = words.get(word_index) { + if let Some((connector, end, consumed)) = + relation_connector(word, &words, word_index, full_text, self) + { + tokens.push(ClassifiedToken { + text: connector, + kind: TokenKind::ArabicConnector, + start: word.start, + end, + non_western: false, + title_abbreviation: false, + }); + word_index = word_index.saturating_add(consumed); + continue; + } + tokens.push(self.classify_token(word, full_text)); + word_index = word_index.saturating_add(1); + } + + let mut consumed = vec![false; tokens.len()]; + let mut entities = Vec::new(); + for index in 0..tokens.len() { + if consumed.get(index).copied().unwrap_or(false) { + continue; + } + let Some(token) = tokens.get(index) else { + continue; + }; + if !is_chain_start(token.kind) { + continue; + } + + let chain = Self::build_chain(full_text, &tokens, index); + let Some(score) = 
supplemental_chain_score(full_text, &chain, self)
+      else {
+        continue;
+      };
+      let Some(first) = chain.first() else {
+        continue;
+      };
+      let Some(last) = chain.last() else {
+        continue;
+      };
+      let Some(text) = full_text.get(first.start..last.end) else {
+        return Err(invalid_name_data("name span is not a UTF-8 boundary"));
+      };
+      if self.is_organization(text) {
+        continue;
+      }
+      for slot in index..index.saturating_add(chain.len()) {
+        if let Some(value) = consumed.get_mut(slot) {
+          *value = true;
+        }
+      }
+      entities.push(PipelineEntity::detected(
+        usize_to_u32(first.start, "name_corpus.start")?,
+        usize_to_u32(last.end, "name_corpus.end")?,
+        PERSON_LABEL,
+        text,
+        score,
+        DetectionSource::Regex,
+      ));
+    }
+
+    Ok(entities)
+  }
+
+  /// Classifies one word segment for name-chain building.
+  ///
+  /// Rules are applied in order and the first match wins:
+  /// 1. title tokens (lower-cased, trailing `.` ignored);
+  /// 2. Japanese suffixes, then Arabic connectors;
+  /// 3. hyphenated-prefix names, flagged as non-Western names;
+  /// 4. abbreviation shapes, then single-letter initials with name context;
+  /// 5. excluded words and one-character tokens become `Other`;
+  /// 6. two-character tokens survive only as a non-Western name or as an
+  ///    all-caps token that is not excluded;
+  /// 7. all-caps tokens of three or more characters are looked up in
+  ///    title-cased form, and corpus names only count on an all-caps,
+  ///    name-shaped line;
+  /// 8. anything else must start with an uppercase letter and is looked up
+  ///    as first name, surname, non-Western name, or plain `Capitalized`.
+  fn classify_token<'a>(
+    &self,
+    word: &WordSegment<'a>,
+    full_text: &str,
+  ) -> ClassifiedToken<'a> {
+    let text = word.text;
+    let lower = text.to_lowercase();
+    let stripped = lower.strip_suffix('.').unwrap_or(&lower);
+    if self.title_tokens.contains(stripped) {
+      return ClassifiedToken {
+        text,
+        kind: TokenKind::Title,
+        start: word.start,
+        end: word.end,
+        non_western: false,
+        title_abbreviation: self.title_abbreviations.contains(stripped),
+      };
+    }
+    if self.ja_suffixes.contains(&lower) {
+      return classified(word, TokenKind::JaSuffix, false);
+    }
+    if self.arabic_connectors.contains(&lower) {
+      return classified(word, TokenKind::ArabicConnector, false);
+    }
+    if self.is_hyphenated_prefix_name(text) {
+      return classified(word, TokenKind::Name, true);
+    }
+    if is_abbreviation(text) || is_multi_dot_abbreviation(text) {
+      return classified(word, TokenKind::Abbreviation, false);
+    }
+    if is_single_letter_initial(text, word.end, full_text)
+      && self.initial_has_name_context(word, full_text)
+    {
+      return classified(word, TokenKind::Abbreviation, false);
+    }
+    // Counted once; the length rules below all use character counts.
+    let char_count = text.chars().count();
+    if self.excluded_words.contains(&lower) || char_count < 2 {
+      return classified(word, TokenKind::Other, false);
+    }
+    if char_count < 3 {
+      // Japanese suffixes and Arabic connectors already returned above, so
+      // only these two allowances can still apply to a short token.
+      let short_token_allowed = self.is_non_western_name_token(text)
+        || (is_all_upper(text) && !self.excluded_all_caps.contains(text));
+      if !short_token_allowed {
+        return classified(word, TokenKind::Other, false);
+      }
+    }
+    if char_count >= 3 && is_all_upper(text) {
+      if self.excluded_all_caps.contains(text) {
+        return classified(word, TokenKind::Other, false);
+      }
+      let title_cased = title_case_simple(text);
+      let non_western = self.is_non_western_name_token(&title_cased);
+      if non_western && !self.is_first_name_token(&title_cased) {
+        return classified(word, TokenKind::Name, true);
+      }
+      if is_all_caps_context_line(full_text, word.start)
+        && is_all_caps_line_name_shaped(full_text, word.start)
+      {
+        if self.is_first_name_token(&title_cased) {
+          return classified(word, TokenKind::Name, non_western);
+        }
+        if self.is_surname_token(&title_cased) {
+          return classified(word, TokenKind::Surname, non_western);
+        }
+        // NOTE(review): a non-Western token that is not a first name has
+        // already returned above, so this branch looks unreachable.
+        if non_western {
+          return classified(word, TokenKind::Name, true);
+        }
+      }
+      return classified(word, TokenKind::Other, false);
+    }
+    if !starts_uppercase(text) {
+      return classified(word, TokenKind::Other, false);
+    }
+    if self.is_first_name_token(text) {
+      return classified(
+        word,
+        TokenKind::Name,
+        self.is_non_western_name_token(text),
+      );
+    }
+    if self.is_surname_token(text) {
+      return classified(
+        word,
+        TokenKind::Surname,
+        self.is_non_western_name_token(text),
+      );
+    }
+    if self.is_non_western_name_token(text) {
+      return classified(word, TokenKind::Name, true);
+    }
+    classified(word, TokenKind::Capitalized, false)
+  }
+
+  fn build_chain<'a>(
+    full_text: &str,
+    tokens: &'a [ClassifiedToken<'a>],
+    start: usize,
+  ) -> Vec<&'a ClassifiedToken<'a>> {
+    let mut chain = Vec::new();
+    let Some(first) = tokens.get(start) else {
+      return chain;
+    };
+    chain.push(first);
+    let mut index = start.saturating_add(1);
+    while index < tokens.len() && chain.len() <
MAX_CHAIN { + let Some(next) = tokens.get(index) else { + break; + }; + let Some(previous) = chain.last().copied() else { + break; + }; + let Some(gap) = full_text.get(previous.end..next.start) else { + break; + }; + if horizontal_gap_width(gap) > MAX_HORIZONTAL_CHAIN_GAP { + break; + } + let period_is_part_of_previous = previous.kind == TokenKind::Abbreviation + || (previous.kind == TokenKind::Title && previous.title_abbreviation); + let breaks_on_period = gap.contains('.') + && !is_initial_continuation_gap(previous.text, gap) + && !period_is_part_of_previous; + if gap.contains('\n') + || gap.contains('!') + || gap.contains('?') + || gap.contains(';') + || gap.contains(':') + || breaks_on_period + { + break; + } + if next.kind == TokenKind::JaSuffix + && gap != "-" + && !gap.trim().is_empty() + { + break; + } + if next.kind == TokenKind::Other { + break; + } + chain.push(next); + index = index.saturating_add(1); + } + chain + } + + fn is_likely_cjk_person_name(&self, text: &str) -> bool { + if self.cjk_non_person_terms.contains(text) { + return false; + } + text + .chars() + .next() + .is_some_and(|first| self.cjk_surname_starters.contains(&first)) + } + + fn is_organization(&self, text: &str) -> bool { + let words = segment_words(text); + words + .iter() + .any(|word| self.organization_terms.contains(&word.text.to_lowercase())) + } + + fn is_hyphenated_prefix_name(&self, text: &str) -> bool { + let Some((prefix, tail)) = text.split_once('-') else { + return false; + }; + self.hyphenated_prefixes.contains(&prefix.to_lowercase()) + && tail.chars().next().is_some_and(char::is_uppercase) + } + + fn is_first_name_token(&self, token: &str) -> bool { + self.first_names.contains(token) + } + + fn is_surname_token(&self, token: &str) -> bool { + self.surnames.contains(token) + } + + fn is_non_western_name_token(&self, token: &str) -> bool { + self.non_western_names.contains(token) + || self + .non_western_names + .contains(&title_case_with_apostrophe(token)) + } + + fn 
initial_has_name_context( + &self, + word: &WordSegment<'_>, + full_text: &str, + ) -> bool { + let line = line_before(full_text, word.start); + if let Some(last_word) = trailing_word(line) + && self.lookup_name_token(last_word) + { + return true; + } + let after_dot_start = word.end.saturating_add(1); + let after_dot = full_text + .get(after_dot_start..) + .unwrap_or_default() + .trim_start(); + let Some(next_word) = leading_word(after_dot) else { + return false; + }; + self.lookup_name_token(next_word) + || (next_word.chars().count() == 1 && starts_uppercase(next_word)) + } + + fn lookup_name_token(&self, token: &str) -> bool { + self.is_first_name_token(token) + || self.is_first_name_token(&title_case_simple(token)) + || self.is_non_western_name_token(token) + } +} + +fn supplemental_chain_score( + full_text: &str, + chain: &[&ClassifiedToken<'_>], + data: &PreparedNameCorpusData, +) -> Option { + let has_title = chain.iter().any(|token| token.kind == TokenKind::Title); + let has_abbreviation = chain + .iter() + .any(|token| token.kind == TokenKind::Abbreviation); + let has_non_western = chain.iter().any(|token| token.non_western); + if !has_non_western { + return None; + } + let has_ja_suffix = + chain.iter().any(|token| token.kind == TokenKind::JaSuffix); + let has_arabic_connector = chain + .iter() + .any(|token| token.kind == TokenKind::ArabicConnector); + let capitalized_count = chain + .iter() + .filter(|token| token.kind == TokenKind::Capitalized) + .count(); + let non_western_count = + chain.iter().filter(|token| token.non_western).count(); + let chain_all_common_words = chain + .iter() + .all(|token| data.common_words.contains(&token.text.to_lowercase())); + let title_confidence = + has_title && (non_western_count > 0 || capitalized_count > 0); + let high_confidence = (has_ja_suffix + && (capitalized_count > 0 || non_western_count > 0)) + || (has_arabic_connector && non_western_count > 0) + || non_western_count >= 2 + || (non_western_count > 0 + && 
(capitalized_count > 0 || has_abbreviation) + && !chain_all_common_words); + let score = if title_confidence { + TITLE_NAME_SCORE + } else if high_confidence { + HIGH_CONFIDENCE_NAME_SCORE + } else if non_western_count == 1 + && chain.len() == 1 + && !is_sentence_start(full_text, chain.first()?.start) + { + LOW_CONFIDENCE_NAME_SCORE + } else { + return None; + }; + (score >= HIGH_CONFIDENCE_NAME_SCORE).then_some(score) +} + +fn segment_words(full_text: &str) -> Vec> { + let mut words = Vec::new(); + let mut start = None; + let mut end = 0usize; + for (index, ch) in full_text.char_indices() { + if is_word_char(ch) { + if start.is_none() { + start = Some(index); + } + end = index.saturating_add(ch.len_utf8()); + continue; + } + if let Some(word_start) = start.take() + && let Some(text) = full_text.get(word_start..end) + { + words.push(WordSegment { + text, + start: word_start, + end, + }); + } + } + if let Some(word_start) = start + && let Some(text) = full_text.get(word_start..end) + { + words.push(WordSegment { + text, + start: word_start, + end, + }); + } + words +} + +fn relation_connector<'a>( + word: &WordSegment<'a>, + words: &[WordSegment<'a>], + index: usize, + full_text: &'a str, + data: &PreparedNameCorpusData, +) -> Option<(&'a str, usize, usize)> { + let lower = word.text.to_lowercase(); + if !matches!(lower.as_str(), "s" | "d" | "w" | "r") { + return None; + } + let next = words.get(index.saturating_add(1))?; + if full_text.get(word.end..next.start)? 
!= "/" + || !next.text.eq_ignore_ascii_case("o") + { + return None; + } + let connector = full_text.get(word.start..next.end)?; + data + .relation_connectors + .contains(&connector.to_lowercase()) + .then_some((connector, next.end, 2)) +} + +const fn classified<'a>( + word: &WordSegment<'a>, + kind: TokenKind, + non_western: bool, +) -> ClassifiedToken<'a> { + ClassifiedToken { + text: word.text, + kind, + start: word.start, + end: word.end, + non_western, + title_abbreviation: false, + } +} + +const fn is_chain_start(kind: TokenKind) -> bool { + matches!( + kind, + TokenKind::Title + | TokenKind::Name + | TokenKind::Surname + | TokenKind::Abbreviation + | TokenKind::ArabicConnector + ) +} + +fn covers_same_label(entity: &PipelineEntity, deny: &PipelineEntity) -> bool { + entity.label == deny.label + && deny.start <= entity.start + && deny.end >= entity.end +} + +fn deduplicate_spans(mut entities: Vec) -> Vec { + entities.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| right.end.cmp(&left.end)) + }); + let mut result = Vec::new(); + for entity in entities { + let keep = result + .last() + .is_none_or(|last: &PipelineEntity| entity.start >= last.end); + if keep { + result.push(entity); + } + } + result +} + +fn title_case_with_apostrophe(text: &str) -> String { + let mut result = String::new(); + let mut uppercase_next = true; + for ch in text.chars() { + if uppercase_next { + result.extend(ch.to_uppercase()); + uppercase_next = false; + } else { + result.extend(ch.to_lowercase()); + } + if ch == '\'' { + uppercase_next = true; + } + } + result +} + +fn title_case_simple(text: &str) -> String { + let mut chars = text.chars(); + let Some(first) = chars.next() else { + return String::new(); + }; + let mut result = String::new(); + result.extend(first.to_uppercase()); + result.push_str(&chars.as_str().to_lowercase()); + result +} + +fn starts_uppercase(text: &str) -> bool { + text.chars().next().is_some_and(char::is_uppercase) +} + +fn 
is_all_upper(text: &str) -> bool { + let mut letters = 0usize; + for ch in text.chars() { + if ch.is_alphabetic() { + letters = letters.saturating_add(1); + if !ch.is_uppercase() { + return false; + } + } + } + letters > 0 +} + +fn is_abbreviation(text: &str) -> bool { + let mut chars = text.chars(); + let Some(first) = chars.next() else { + return false; + }; + chars.next() == Some('.') && chars.next().is_none() && first.is_uppercase() +} + +fn is_multi_dot_abbreviation(text: &str) -> bool { + let mut saw_upper = false; + let mut previous_dot = true; + for ch in text.chars() { + if previous_dot { + if !ch.is_uppercase() { + return false; + } + saw_upper = true; + previous_dot = false; + continue; + } + if ch != '.' { + return false; + } + previous_dot = true; + } + saw_upper +} + +fn is_single_letter_initial(text: &str, end: usize, full_text: &str) -> bool { + text.chars().count() == 1 + && starts_uppercase(text) + && full_text + .get(end..) + .is_some_and(|tail| tail.starts_with('.')) +} + +fn is_initial_continuation_gap(text: &str, gap: &str) -> bool { + if text.chars().count() == 1 && starts_uppercase(text) { + let Some(rest) = gap.strip_prefix('.') else { + return false; + }; + let spaces = rest + .chars() + .take_while(|ch| ch.is_whitespace() && *ch != '\n') + .count(); + return (1..=2).contains(&spaces) && rest.chars().count() == spaces; + } + false +} + +fn horizontal_gap_width(gap: &str) -> usize { + if gap.chars().any(|ch| ch == '\n' || !ch.is_whitespace()) { + return 0; + } + gap.chars().count() +} + +fn is_sentence_start(text: &str, pos: usize) -> bool { + if pos == 0 { + return true; + } + let Some(before) = text.get(..pos) else { + return false; + }; + for ch in before.chars().rev() { + if ch.is_whitespace() { + continue; + } + return matches!(ch, '.' | '!' 
| '?'); + } + true +} + +fn is_all_caps_context_line(full_text: &str, start: usize) -> bool { + let line = current_line(full_text, start); + let mut letters = 0usize; + let mut upper = 0usize; + for ch in line.chars() { + if ch.is_alphabetic() { + letters = letters.saturating_add(1); + if ch.is_uppercase() { + upper = upper.saturating_add(1); + } + } + } + if letters < ALL_CAPS_NAME_LINE_MIN_LETTERS { + return false; + } + let upper = + u32::try_from(upper).map_or_else(|_| f64::from(u32::MAX), f64::from); + let letters = + u32::try_from(letters).map_or_else(|_| f64::from(u32::MAX), f64::from); + upper / letters >= ALL_CAPS_NAME_LINE_RATIO +} + +const fn ceil_ratio( + value: usize, + numerator: usize, + denominator: usize, +) -> usize { + value.saturating_mul(numerator).div_ceil(denominator) +} + +fn is_all_caps_line_name_shaped(full_text: &str, start: usize) -> bool { + let line = current_line(full_text, start); + if line.chars().any(|ch| ch.is_ascii_digit()) { + return false; + } + let tokens = segment_words(line).len(); + tokens > 0 && tokens <= ALL_CAPS_NAME_LINE_MAX_TOKENS +} + +fn current_line(full_text: &str, start: usize) -> &str { + let line_start = full_text + .get(..start) + .and_then(|head| head.rfind('\n').map(|index| index.saturating_add(1))) + .unwrap_or(0); + let line_end = full_text + .get(start..) 
+ .and_then(|tail| tail.find('\n').map(|index| start.saturating_add(index))) + .unwrap_or(full_text.len()); + full_text.get(line_start..line_end).unwrap_or_default() +} + +fn line_before(full_text: &str, start: usize) -> &str { + let line_start = full_text + .get(..start) + .and_then(|head| head.rfind('\n').map(|index| index.saturating_add(1))) + .unwrap_or(0); + full_text.get(line_start..start).unwrap_or_default() +} + +fn trailing_word(text: &str) -> Option<&str> { + segment_words(text).last().map(|word| word.text) +} + +fn leading_word(text: &str) -> Option<&str> { + segment_words(text).first().map(|word| word.text) +} + +fn is_word_char(ch: char) -> bool { + ch.is_alphanumeric() || ch == '\'' +} + +const fn is_han(ch: char) -> bool { + matches!( + ch, + '\u{3400}'..='\u{4DBF}' + | '\u{4E00}'..='\u{9FFF}' + | '\u{F900}'..='\u{FAFF}' + | '\u{20000}'..='\u{2A6DF}' + | '\u{2A700}'..='\u{2B73F}' + | '\u{2B740}'..='\u{2B81F}' + | '\u{2B820}'..='\u{2CEAF}' + | '\u{2CEB0}'..='\u{2EBEF}' + | '\u{30000}'..='\u{3134F}' + ) +} + +fn string_set(values: Vec) -> HashSet { + values.into_iter().collect() +} + +fn lower_string_set(values: Vec) -> HashSet { + values + .into_iter() + .map(|value| value.to_lowercase()) + .collect() +} + +fn usize_to_u32(value: usize, field: &'static str) -> Result { + u32::try_from(value).map_err(|_| Error::InvalidStaticData { + field, + reason: String::from("offset exceeds u32 range"), + }) +} + +fn invalid_name_data(reason: &'static str) -> Error { + Error::InvalidStaticData { + field: "name_corpus", + reason: String::from(reason), + } +} + +#[cfg(test)] +#[allow(clippy::expect_used, clippy::indexing_slicing)] +mod tests { + use super::*; + + #[test] + fn supplemental_detects_cjk_name_with_configured_surname() { + let data = PreparedNameCorpusData::new(NameCorpusData { + cjk_surname_starters: vec![String::from("王")], + ..NameCorpusData::default() + }); + + let entities = data + .detect_supplemental("Signed by 王小明 today.", &[]) + .expect("cjk 
detection should succeed"); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "王小明"); + assert!((entities[0].score - CJK_SCORE).abs() < f64::EPSILON); + } + + #[test] + fn supplemental_skips_names_covered_by_deny_list() { + let data = PreparedNameCorpusData::new(NameCorpusData { + cjk_surname_starters: vec![String::from("王")], + ..NameCorpusData::default() + }); + let text = "Signed by 王小明 today."; + let start = + u32::try_from(text.find("王小明").expect("fixture contains name")) + .expect("offset fits"); + let end = start.saturating_add( + u32::try_from("王小明".len()).expect("fixture span length fits"), + ); + let deny = PipelineEntity::detected( + start, + end, + PERSON_LABEL, + "王小明", + 0.9, + DetectionSource::DenyList, + ); + + let entities = data + .detect_supplemental(text, &[deny]) + .expect("cjk detection should succeed"); + + assert!(entities.is_empty()); + } + + #[test] + fn supplemental_detects_non_western_chain() { + let data = PreparedNameCorpusData::new(NameCorpusData { + non_western_names: vec![String::from("Sato"), String::from("Kenji")], + ja_suffixes: vec![String::from("san")], + ..NameCorpusData::default() + }); + + let entities = data + .detect_supplemental("The signer is Sato Kenji.", &[]) + .expect("name detection should succeed"); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Sato Kenji"); + assert!( + (entities[0].score - HIGH_CONFIDENCE_NAME_SCORE).abs() < f64::EPSILON + ); + } + + #[test] + fn supplemental_does_not_cross_signature_column_gap() { + let data = PreparedNameCorpusData::new(NameCorpusData { + non_western_names: vec![ + String::from("Priya"), + String::from("Ramanathan"), + ], + ..NameCorpusData::default() + }); + + let entities = data + .detect_supplemental( + "Name: Priya Ramanathan Name: Jonathan", + &[], + ) + .expect("name detection should succeed"); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Priya Ramanathan"); + } +} diff --git 
a/crates/anonymize-core/src/normalize.rs b/crates/anonymize-core/src/normalize.rs new file mode 100644 index 00000000..f6c79c36 --- /dev/null +++ b/crates/anonymize-core/src/normalize.rs @@ -0,0 +1,498 @@ +const PHONE_NOISE: [char; 3] = ['(', ')', '-']; +const ID_SEPARATORS: [char; 3] = ['-', '/', '.']; + +use crate::types::{Error, Result}; + +pub(crate) struct NormalizedSearchText { + text: String, + byte_to_original: Option>, +} + +impl NormalizedSearchText { + pub(crate) fn as_str(&self) -> &str { + &self.text + } + + pub(crate) fn map_span(&self, start: u32, end: u32) -> Result<(u32, u32)> { + if start > end { + return Err(Error::InvalidSpan { start, end }); + } + + let Some(byte_to_original) = &self.byte_to_original else { + return Ok((start, end)); + }; + + Ok(( + map_normalized_offset(byte_to_original, start)?, + map_normalized_offset(byte_to_original, end)?, + )) + } +} + +#[must_use] +pub fn normalize_for_search(text: &str) -> String { + let mut has_replacement = false; + for ch in text.chars() { + if replacement_char(ch) != ch { + has_replacement = true; + break; + } + } + if !has_replacement { + return text.to_owned(); + } + + let mut output = String::with_capacity(text.len()); + for ch in text.chars() { + output.push(replacement_char(ch)); + } + output +} + +pub(crate) fn normalize_for_search_with_byte_map( + text: &str, +) -> Result { + let mut has_replacement = false; + for ch in text.chars() { + if replacement_char(ch) != ch { + has_replacement = true; + break; + } + } + if !has_replacement { + return Ok(NormalizedSearchText { + text: text.to_owned(), + byte_to_original: None, + }); + } + + let mut output = String::with_capacity(text.len()); + let mut byte_to_original = vec![0_u32]; + for (original_start, ch) in text.char_indices() { + set_boundary( + &mut byte_to_original, + output.len(), + checked_u32(original_start)?, + ); + output.push(replacement_char(ch)); + set_boundary( + &mut byte_to_original, + output.len(), + 
checked_u32(original_start.saturating_add(ch.len_utf8()))?, + ); + } + + Ok(NormalizedSearchText { + text: output, + byte_to_original: Some(byte_to_original), + }) +} + +fn set_boundary( + byte_to_original: &mut Vec, + normalized_offset: usize, + original_offset: u32, +) { + if byte_to_original.len() <= normalized_offset { + byte_to_original.resize(normalized_offset.saturating_add(1), u32::MAX); + } + if let Some(slot) = byte_to_original.get_mut(normalized_offset) { + *slot = original_offset; + } +} + +fn map_normalized_offset(byte_to_original: &[u32], offset: u32) -> Result { + let index = usize::try_from(offset) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset })?; + let mapped = byte_to_original + .get(index) + .copied() + .ok_or(Error::ByteOffsetOutOfBounds { offset })?; + if mapped == u32::MAX { + return Err(Error::ByteOffsetInsideCodepoint { offset }); + } + Ok(mapped) +} + +fn checked_u32(offset: usize) -> Result { + u32::try_from(offset) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX }) +} + +// Normalization decides placeholder identity. 
+pub(crate) fn label_key(label: &str) -> String { + let uppercase = uppercase(label); + collapse_whitespace(&uppercase, "_", false) +} + +pub(crate) fn placeholder_fallback(label: &str) -> String { + format!("[{}]", label_key(label)) +} + +pub(crate) fn normalize_entity_text(label: &str, text: &str) -> String { + let upper = label_key(label); + + if upper == "EMAIL_ADDRESS" || upper == "EMAIL" { + return text.to_lowercase().trim().to_owned(); + } + if upper == "PHONE_NUMBER" || upper == "PHONE" { + return text + .chars() + .filter(|ch| !ch.is_whitespace() && !PHONE_NOISE.contains(ch)) + .collect(); + } + if upper == "CRYPTO" { + return normalize_crypto_text(text); + } + if upper == "NATIONAL_IDENTIFICATION_NUMBER" && contains_nhs_cue(text) { + return text.chars().filter(char::is_ascii_digit).collect(); + } + if is_identifier_label(&upper) { + return normalize_identifier_text(text); + } + if upper == "PASSPORT_NUMBER" { + return normalize_passport_text(text); + } + if is_collapsible_text_label(&upper) { + return collapse_whitespace(text, " ", false) + .to_lowercase() + .trim() + .to_owned(); + } + + text.trim().to_owned() +} + +fn uppercase(text: &str) -> String { + let mut output = String::new(); + for ch in text.chars() { + output.extend(ch.to_uppercase()); + } + output +} + +fn collapse_whitespace(text: &str, replacement: &str, trim: bool) -> String { + let mut output = String::new(); + let mut in_whitespace = false; + + for ch in text.chars() { + if ch.is_whitespace() { + if !in_whitespace { + output.push_str(replacement); + in_whitespace = true; + } + continue; + } + + output.push(ch); + in_whitespace = false; + } + + if trim { + return output.trim().to_owned(); + } + output +} + +fn strip_id_separators(text: &str) -> String { + text + .chars() + .filter(|ch| !ch.is_whitespace() && !ID_SEPARATORS.contains(ch)) + .collect() +} + +fn normalize_identifier_text(text: &str) -> String { + strip_id_separators(text).to_uppercase() +} + +fn is_identifier_label(upper: 
&str) -> bool { + matches!( + upper, + "IBAN" + | "BANK_ACCOUNT_NUMBER" + | "TAX_IDENTIFICATION_NUMBER" + | "REGISTRATION_NUMBER" + | "NATIONAL_IDENTIFICATION_NUMBER" + | "SOCIAL_SECURITY_NUMBER" + | "BIRTH_NUMBER" + | "IDENTITY_CARD_NUMBER" + | "CREDIT_CARD_NUMBER" + ) +} + +fn is_collapsible_text_label(upper: &str) -> bool { + matches!( + upper, + "PERSON" | "ORGANIZATION" | "ADDRESS" | "LAND_PARCEL" | "MISC" + ) +} + +fn contains_nhs_cue(text: &str) -> bool { + let lower = text.to_lowercase(); + contains_word(&lower, "nhs") + || collapse_whitespace(&lower, " ", true) + .contains("national health service") +} + +fn normalize_crypto_text(text: &str) -> String { + let trimmed = text.trim(); + + if let Some(address) = find_ethereum_address(trimmed) { + return address.to_lowercase(); + } + if let Some(address) = find_bech32_address(trimmed) { + return address.to_lowercase(); + } + if let Some(address) = find_base58_address(trimmed) { + return address.to_owned(); + } + + trimmed.to_owned() +} + +fn find_ethereum_address(text: &str) -> Option<&str> { + for (start, _) in text.match_indices("0x") { + let end = start.saturating_add(42); + let Some(candidate) = text.get(start..end) else { + continue; + }; + if candidate.chars().skip(2).all(|ch| ch.is_ascii_hexdigit()) { + return Some(candidate); + } + } + + None +} + +fn find_bech32_address(text: &str) -> Option<&str> { + find_ascii_token(text, |token| { + let lower = token.to_lowercase(); + lower.len() >= 14 + && lower.len() <= 74 + && lower.starts_with("bc1") + && lower + .chars() + .skip(3) + .all(|ch| matches!(ch, 'a'..='h' | 'j'..='n' | 'p'..='z' | '0'..='9')) + }) +} + +fn find_base58_address(text: &str) -> Option<&str> { + find_ascii_token(text, |token| { + let len = token.len(); + (26..=35).contains(&len) + && (token.starts_with('1') || token.starts_with('3')) + && token.chars().all(is_base58_char) + }) +} + +fn find_ascii_token( + text: &str, + predicate: impl Fn(&str) -> bool, +) -> Option<&str> { + let mut 
token_start = None; + + for (index, ch) in text.char_indices() { + if ch.is_ascii_alphanumeric() { + if token_start.is_none() { + token_start = Some(index); + } + continue; + } + + if let Some(start) = token_start { + let token = text.get(start..index)?; + if predicate(token) { + return Some(token); + } + token_start = None; + } + } + + let start = token_start?; + let token = text.get(start..)?; + predicate(token).then_some(token) +} + +fn find_compact_ascii_identifier( + text: &str, + allow_whitespace: bool, + predicate: impl Fn(&str) -> bool, +) -> Option { + for (start, ch) in text.char_indices() { + if !is_identifier_start(text, start, ch) { + continue; + } + let Some(candidate) = + compact_ascii_identifier_from(text, start, allow_whitespace, &predicate) + else { + continue; + }; + return Some(candidate); + } + + None +} + +fn compact_ascii_identifier_from( + text: &str, + start: usize, + allow_whitespace: bool, + predicate: &impl Fn(&str) -> bool, +) -> Option { + let mut compact = String::new(); + let mut token = String::new(); + let mut last_valid = None; + let tail = text.get(start..)?; + + for ch in tail.chars() { + if ch.is_ascii_alphanumeric() { + compact.push(ch.to_ascii_uppercase()); + token.push(ch.to_ascii_uppercase()); + continue; + } + + if is_identifier_separator(ch, allow_whitespace) { + if predicate(&compact) { + last_valid = Some(compact.clone()); + } + token.clear(); + continue; + } + + break; + } + + if allow_whitespace && token_is_trailing_prose(&token) && last_valid.is_some() + { + return last_valid; + } + if predicate(&compact) { + return Some(compact); + } + last_valid +} + +fn token_is_trailing_prose(token: &str) -> bool { + token.len() >= 3 && token.chars().all(|ch| ch.is_ascii_alphabetic()) +} + +fn is_identifier_start(text: &str, index: usize, ch: char) -> bool { + ch.is_ascii_alphanumeric() + && text + .get(..index) + .and_then(|prefix| prefix.chars().next_back()) + .is_none_or(|previous| !previous.is_ascii_alphanumeric()) +} + +fn 
is_identifier_separator(ch: char, allow_whitespace: bool) -> bool { + ID_SEPARATORS.contains(&ch) || (allow_whitespace && ch.is_whitespace()) +} + +const fn is_base58_char(ch: char) -> bool { + matches!( + ch, + 'a'..='k' + | 'm'..='z' + | 'A'..='H' + | 'J'..='N' + | 'P'..='Z' + | '1'..='9' + ) +} + +fn normalize_passport_text(text: &str) -> String { + find_compact_ascii_identifier(text, true, is_passport_identifier) + .unwrap_or_else(|| strip_id_separators(text).to_uppercase()) +} + +fn is_passport_identifier(token: &str) -> bool { + let chars: Vec = token.chars().collect(); + matches_letters_digits(&chars, 1, 2, 6, 8) + || matches_digits_letters_digits(&chars, 2, 2, 5) + || (token.len() >= 7 + && token.len() <= 9 + && token.chars().all(|ch| ch.is_ascii_digit())) +} + +fn matches_letters_digits( + chars: &[char], + min_letters: usize, + max_letters: usize, + min_digits: usize, + max_digits: usize, +) -> bool { + for letter_count in min_letters..=max_letters { + let digit_count = chars.len().saturating_sub(letter_count); + if digit_count < min_digits || digit_count > max_digits { + continue; + } + let Some((letters, digits)) = chars.split_at_checked(letter_count) else { + continue; + }; + if letters.iter().all(char::is_ascii_alphabetic) + && digits.iter().all(char::is_ascii_digit) + { + return true; + } + } + + false +} + +fn matches_digits_letters_digits( + chars: &[char], + first_digits: usize, + letters_count: usize, + last_digits: usize, +) -> bool { + let expected_len = first_digits + .saturating_add(letters_count) + .saturating_add(last_digits); + if chars.len() != expected_len { + return false; + } + + let Some((first, tail)) = chars.split_at_checked(first_digits) else { + return false; + }; + let Some((letters, last)) = tail.split_at_checked(letters_count) else { + return false; + }; + + first.iter().all(char::is_ascii_digit) + && letters.iter().all(char::is_ascii_alphabetic) + && last.iter().all(char::is_ascii_digit) +} + +fn contains_word(text: &str, 
word: &str) -> bool { + let mut start = 0; + while let Some(relative) = text.get(start..).and_then(|tail| tail.find(word)) + { + let word_start = start.saturating_add(relative); + let word_end = word_start.saturating_add(word.len()); + let before_ok = text + .get(..word_start) + .and_then(|prefix| prefix.chars().next_back()) + .is_none_or(|ch| !ch.is_alphanumeric()); + let after_ok = text + .get(word_end..) + .and_then(|suffix| suffix.chars().next()) + .is_none_or(|ch| !ch.is_alphanumeric()); + if before_ok && after_ok { + return true; + } + start = word_end; + } + + false +} + +const fn replacement_char(ch: char) -> char { + match ch { + '\u{00a0}' | '\u{2007}' | '\u{202f}' => ' ', + '\u{2013}' | '\u{2014}' => '-', + '\u{201c}' | '\u{201d}' => '"', + _ => ch, + } +} diff --git a/crates/anonymize-core/src/placeholders.rs b/crates/anonymize-core/src/placeholders.rs new file mode 100644 index 00000000..9216e35e --- /dev/null +++ b/crates/anonymize-core/src/placeholders.rs @@ -0,0 +1,139 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::normalize::{label_key, normalize_entity_text}; +use crate::types::{Entity, EntityKind, PlaceholderMap}; + +// Document-local placeholder key. 
+#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +struct NormalizedKey { + label_key: String, + text: String, +} + +#[must_use] +pub fn build_placeholder_map( + entities: &[Entity], + reserved_text: &str, +) -> PlaceholderMap { + let mut counters = BTreeMap::::new(); + let mut normalized_to_placeholder = BTreeMap::::new(); + let reserved_placeholders = collect_reserved_placeholders(reserved_text); + let mut placeholder_map = PlaceholderMap::default(); + + let mut sorted = entities.to_vec(); + sorted.sort_by_key(|entity| entity.start); + + for entity in &sorted { + if placeholder_map.has_entity(entity) { + continue; + } + + let label_key = label_key(&entity.label); + let normalized_key = normalized_key(entity, &label_key); + + if let Some(existing) = normalized_to_placeholder.get(&normalized_key) { + placeholder_map.push_entity(entity, existing); + continue; + } + + let placeholder = + next_placeholder(&label_key, &mut counters, &reserved_placeholders); + placeholder_map.push_entity(entity, &placeholder); + normalized_to_placeholder.insert(normalized_key, placeholder); + } + + placeholder_map +} + +fn normalized_key(entity: &Entity, label_key: &str) -> NormalizedKey { + // Coreference aliases key by source identity, not alias text. 
+ let text = match &entity.kind { + EntityKind::Detected => normalize_entity_text(&entity.label, &entity.text), + EntityKind::Coreference { source_text } => { + normalize_entity_text(&entity.label, source_text) + } + }; + + NormalizedKey { + label_key: label_key.to_owned(), + text, + } +} + +fn next_placeholder( + label_key: &str, + counters: &mut BTreeMap, + reserved_placeholders: &BTreeSet, +) -> String { + let mut count = counters.get(label_key).copied().unwrap_or(0); + + loop { + count = count.saturating_add(1); + let placeholder = format!("[{label_key}_{count}]"); + if reserved_placeholders.contains(&placeholder) { + continue; + } + + counters.insert(label_key.to_owned(), count); + return placeholder; + } +} + +fn collect_reserved_placeholders(text: &str) -> BTreeSet { + let mut placeholders = BTreeSet::new(); + let mut remaining = text; + + while let Some(start) = remaining.find('[') { + let candidate_start = start.saturating_add('['.len_utf8()); + let Some(after_open) = remaining.get(candidate_start..) else { + break; + }; + let Some(end) = after_open.find(']') else { + break; + }; + let Some(inner) = after_open.get(..end) else { + break; + }; + let valid = is_placeholder_inner(inner); + if valid { + placeholders.insert(format!("[{inner}]")); + } + + let next_start = if valid { + candidate_start + .saturating_add(end) + .saturating_add(']'.len_utf8()) + } else { + candidate_start + }; + remaining = remaining.get(next_start..).unwrap_or_default(); + } + + placeholders +} + +fn is_placeholder_inner(inner: &str) -> bool { + if inner.is_empty() + || inner + .chars() + .any(|ch| ch.is_whitespace() || ch == '[' || ch == ']') + { + return false; + } + + let Some(separator) = inner.rfind('_') else { + return false; + }; + if separator == 0 { + return false; + } + + let Some(number) = inner.get(separator.saturating_add(1)..) 
else { + return false; + }; + let mut chars = number.chars(); + let Some(first) = chars.next() else { + return false; + }; + first.is_ascii_digit() && first != '0' && chars.all(|ch| ch.is_ascii_digit()) +} diff --git a/crates/anonymize-core/src/prepared.rs b/crates/anonymize-core/src/prepared.rs new file mode 100644 index 00000000..7ffac5ba --- /dev/null +++ b/crates/anonymize-core/src/prepared.rs @@ -0,0 +1,2090 @@ +use std::time::Instant; + +use crate::address_context::{AddressContextData, PreparedAddressContextData}; +use crate::address_seeds::{AddressSeedData, PreparedAddressSeedData}; +use crate::artifact_bytes::{ArtifactReader, ArtifactWriter}; +use crate::byte_offsets::ByteOffsets; +use crate::coreference::{CoreferenceData, PreparedCoreferenceData}; +use crate::dates::{DateData, PreparedDateData}; +use crate::diagnostics::{DiagnosticStage, StaticRedactionDiagnostics}; +use crate::false_positives::filter_entity_false_positives; +use crate::hotwords::{ + HotwordRuleData, PreparedHotwordData, apply_hotword_rules, +}; +use crate::legal_forms::{ + LegalFormData, PreparedLegalFormData, process_legal_form_matches, +}; +use crate::money::{MonetaryData, PreparedMonetaryData}; +use crate::name_corpus::{ + NameCorpusData, PreparedNameCorpusData as PreparedNames, +}; +use crate::normalize::{ + NormalizedSearchText, normalize_for_search_with_byte_map, +}; +use crate::processors::{ + CountryMatchData, DenyListFilterData, DenyListMatchData, GazetteerMatchData, + PatternSlice, RegexMatchMeta, ensure_supported_deny_list_sources, + process_country_matches, process_deny_list_matches, + process_gazetteer_matches, process_regex_matches, +}; +use crate::redact::redact_text; +use crate::resolution::{ + PipelineEntity, SourceDetail, enforce_boundary_consistency, merge_and_dedup, + sanitize_entities_with_source, +}; +use crate::search::{ + LiteralSearchOptions, SearchIndex, SearchIndexArtifacts, SearchOptions, + SearchPattern, +}; +use crate::signatures::detect_signatures; +use 
crate::triggers::{ + PreparedTriggerData, TriggerData, process_trigger_matches, +}; +use crate::types::{ + Entity, EntityKind, Error, OperatorConfig, RedactionResult, Result, + SearchMatch, +}; +use crate::zones::{PreparedZoneData, ZoneData}; + +const PREPARED_SEARCH_ARTIFACTS_HEADER: [u8; 8] = *b"ANONPSR1"; +const PREPARED_SEARCH_ARTIFACTS_VERSION: u32 = 1; +const NEAR_MISS_BAND: f64 = 0.15; +const BOOST_PER_NEIGHBOUR: f64 = 0.05; +const CONTEXT_WINDOW_CHARS: f64 = 150.0; +const HIGH_CONFIDENCE_FLOOR: f64 = 0.9; + +pub struct PreparedSearch { + regex: SearchIndex, + custom_regex: SearchIndex, + legal_forms: SearchIndex, + triggers: SearchIndex, + literals: SearchIndex, + allowed_labels: Vec, + threshold: f64, + confidence_boost: bool, + slices: PreparedSearchSlices, + regex_meta: Vec, + custom_regex_meta: Vec, + deny_list_data: Option, + false_positive_filters: Option, + gazetteer_data: Option, + country_data: Option, + hotword_data: Option, + trigger_data: Option, + legal_form_data: Option, + address_seed_data: Option, + zone_data: Option, + address_context_data: Option, + coreference_data: Option, + name_corpus_data: Option, + date_data: Option, + monetary_data: Option, + monetary_extraction: bool, +} + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct PreparedSearchSlices { + pub regex: PatternSlice, + pub custom_regex: PatternSlice, + pub legal_forms: PatternSlice, + pub triggers: PatternSlice, + pub deny_list: PatternSlice, + pub street_types: PatternSlice, + pub gazetteer: PatternSlice, + pub countries: PatternSlice, + pub hotwords: PatternSlice, +} + +#[derive(Clone, Debug, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct PreparedSearchConfig { + pub regex_patterns: Vec, + pub custom_regex_patterns: Vec, + pub literal_patterns: Vec, + pub regex_options: SearchOptions, + pub custom_regex_options: SearchOptions, + pub literal_options: SearchOptions, + #[serde(default)] + pub allowed_labels: 
Vec, + #[serde(default)] + pub threshold: f64, + #[serde(default)] + pub confidence_boost: bool, + pub slices: PreparedSearchSlices, + pub regex_meta: Vec, + pub custom_regex_meta: Vec, + pub deny_list_data: Option, + #[serde(default)] + pub false_positive_filters: Option, + pub gazetteer_data: Option, + pub country_data: Option, + #[serde(default)] + pub hotword_data: Option, + pub trigger_data: Option, + pub legal_form_data: Option, + pub address_seed_data: Option, + #[serde(default)] + pub zone_data: Option, + #[serde(default)] + pub address_context_data: Option, + #[serde(default)] + pub coreference_data: Option, + #[serde(default)] + pub name_corpus_data: Option, + pub date_data: Option, + pub monetary_data: Option, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct PreparedSearchArtifacts { + pub regex: SearchIndexArtifacts, + pub custom_regex: SearchIndexArtifacts, + pub legal_forms: SearchIndexArtifacts, + pub triggers: SearchIndexArtifacts, + pub literals: SearchIndexArtifacts, +} + +impl PreparedSearchArtifacts { + pub fn to_bytes(&self) -> Result> { + let mut writer = ArtifactWriter::new( + PREPARED_SEARCH_ARTIFACTS_HEADER, + PREPARED_SEARCH_ARTIFACTS_VERSION, + ); + write_index_artifacts(&mut writer, "prepared.regex", &self.regex)?; + write_index_artifacts( + &mut writer, + "prepared.custom_regex", + &self.custom_regex, + )?; + write_index_artifacts( + &mut writer, + "prepared.legal_forms", + &self.legal_forms, + )?; + write_index_artifacts(&mut writer, "prepared.triggers", &self.triggers)?; + write_index_artifacts(&mut writer, "prepared.literals", &self.literals)?; + Ok(writer.into_bytes()) + } + + pub fn from_bytes(bytes: &[u8]) -> Result { + let mut reader = ArtifactReader::new( + bytes, + PREPARED_SEARCH_ARTIFACTS_HEADER, + PREPARED_SEARCH_ARTIFACTS_VERSION, + "prepared_search_artifacts", + )?; + let artifacts = Self { + regex: read_index_artifacts(&mut reader)?, + custom_regex: read_index_artifacts(&mut reader)?, + legal_forms: 
read_index_artifacts(&mut reader)?, + triggers: read_index_artifacts(&mut reader)?, + literals: read_index_artifacts(&mut reader)?, + }; + reader.finish()?; + Ok(artifacts) + } +} + +fn write_index_artifacts( + writer: &mut ArtifactWriter, + field: &'static str, + artifacts: &SearchIndexArtifacts, +) -> Result<()> { + writer.write_len_prefixed_bytes(field, &artifacts.to_bytes()?) +} + +fn read_index_artifacts( + reader: &mut ArtifactReader<'_>, +) -> Result { + SearchIndexArtifacts::from_bytes(reader.read_len_prefixed_bytes()?) +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct PreparedSearchMatches { + pub regex: Vec, + pub custom_regex: Vec, + pub literal: Vec, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct StaticDetectionResult { + pub matches: PreparedSearchMatches, + pub regex_entities: Vec, + pub custom_regex_entities: Vec, + pub deny_list_entities: Vec, + pub gazetteer_entities: Vec, + pub country_entities: Vec, + pub anchored_entities: Vec, + pub trigger_entities: Vec, + pub signature_entities: Vec, + pub legal_form_entities: Vec, + pub address_seed_entities: Vec, + pub name_corpus_entities: Vec, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct StaticRedactionResult { + pub detections: StaticDetectionResult, + pub resolved_entities: Vec, + pub redaction: RedactionResult, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct StaticRedactionDiagnosticResult { + pub result: StaticRedactionResult, + pub diagnostics: StaticRedactionDiagnostics, +} + +struct TimedEntities { + entities: Vec, + elapsed_us: u64, +} + +struct StaticEntityPasses { + regex: TimedEntities, + custom_regex: TimedEntities, + deny_list: TimedEntities, + gazetteer: TimedEntities, + country: TimedEntities, + anchored: TimedEntities, + trigger: TimedEntities, + signature: TimedEntities, + legal_form: TimedEntities, + address_seed: TimedEntities, + name_corpus: TimedEntities, +} + +pub struct PreparedSearchBuildResult { + pub prepared: PreparedSearch, + pub diagnostics: 
StaticRedactionDiagnostics, +} + +struct RegexPatternGroups { + regex: Vec, + legal_forms: Vec, + triggers: Vec, +} + +type TimedSearchIndex = (SearchIndex, u64); + +struct PreparedSearchIndexes { + regex: TimedSearchIndex, + custom_regex: TimedSearchIndex, + legal_forms: TimedSearchIndex, + triggers: TimedSearchIndex, + literals: TimedSearchIndex, +} + +struct SearchIndexBuildInputs { + regex_patterns: Vec, + regex_options: SearchOptions, + custom_regex_patterns: Vec, + custom_regex_options: SearchOptions, + legal_form_patterns: Vec, + trigger_patterns: Vec, + literal_patterns: Vec, + literal_options: SearchOptions, +} + +#[derive(Clone, Copy)] +struct SearchIndexPrepareMetrics { + regex: (usize, u64), + custom_regex: (usize, u64), + legal_forms: (usize, u64), + triggers: (usize, u64), + literals: (usize, u64), +} + +impl PreparedSearch { + pub fn new(config: PreparedSearchConfig) -> Result { + Self::new_inner(config, None, None) + } + + pub fn warm_lazy_regex(&self) -> Result<()> { + self.regex.warm_lazy_regex()?; + self.custom_regex.warm_lazy_regex()?; + self.legal_forms.warm_lazy_regex()?; + self.triggers.warm_lazy_regex()?; + self.literals.warm_lazy_regex() + } + + pub fn prepare_artifacts( + config: PreparedSearchConfig, + ) -> Result { + validate_supported_config(&config, false)?; + let regex_groups = + split_regex_patterns(config.regex_patterns, &config.slices)?; + Ok(PreparedSearchArtifacts { + regex: SearchIndex::prepare_artifacts( + regex_groups.regex, + config.regex_options, + )?, + custom_regex: SearchIndex::prepare_artifacts( + config.custom_regex_patterns, + config.custom_regex_options, + )?, + legal_forms: SearchIndex::prepare_artifacts( + regex_groups.legal_forms, + legal_form_search_options(), + )?, + triggers: SearchIndex::prepare_artifacts( + promote_case_insensitive_literals(regex_groups.triggers), + trigger_search_options(), + )?, + literals: SearchIndex::prepare_artifacts( + config.literal_patterns, + config.literal_options, + )?, + }) + } + 
+ pub fn new_with_artifacts( + config: PreparedSearchConfig, + artifacts: &PreparedSearchArtifacts, + ) -> Result { + Self::new_inner(config, None, Some(artifacts)) + } + + pub fn new_with_artifacts_diagnostics( + config: PreparedSearchConfig, + artifacts: &PreparedSearchArtifacts, + ) -> Result { + let mut diagnostics = StaticRedactionDiagnostics::default(); + let prepared = + Self::new_inner(config, Some(&mut diagnostics), Some(artifacts))?; + + Ok(PreparedSearchBuildResult { + prepared, + diagnostics, + }) + } + + pub fn new_with_diagnostics( + config: PreparedSearchConfig, + ) -> Result { + let mut diagnostics = StaticRedactionDiagnostics::default(); + let prepared = Self::new_inner(config, Some(&mut diagnostics), None)?; + + Ok(PreparedSearchBuildResult { + prepared, + diagnostics, + }) + } + + fn new_inner( + config: PreparedSearchConfig, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + artifacts: Option<&PreparedSearchArtifacts>, + ) -> Result { + let total_start = Instant::now(); + let allow_literal_artifacts = + artifacts.is_some_and(|artifacts| !artifacts.literals.slots.is_empty()); + validate_supported_config(&config, allow_literal_artifacts)?; + let slices = config.slices.clone(); + let allowed_labels = config.allowed_labels.clone(); + let threshold = config.threshold; + let confidence_boost = config.confidence_boost; + let monetary_extraction = should_extract_monetary_data(&config); + let regex_groups = split_regex_patterns(config.regex_patterns, &slices)?; + let regex_len = regex_groups.regex.len(); + let custom_regex_len = config.custom_regex_patterns.len(); + let anchored_len = anchored_config_len( + config.date_data.as_ref(), + config.monetary_data.as_ref(), + ); + let legal_form_len = regex_groups.legal_forms.len(); + let trigger_len = regex_groups.triggers.len(); + + let (date_data, monetary_data) = prepare_anchored_data( + config.date_data.as_ref(), + config.monetary_data, + anchored_len, + diagnostics.as_deref_mut(), + )?; + + let 
indexes = build_search_indexes_for_config( + regex_groups, + config.regex_options, + config.custom_regex_patterns, + config.custom_regex_options, + config.literal_patterns, + config.literal_options, + artifacts, + )?; + /* Unpack each built index together with its build time in microseconds. */ let ( + (regex, regex_elapsed), + (custom_regex, custom_regex_elapsed), + (legal_forms, legal_forms_elapsed), + (triggers, triggers_elapsed), + (literals, literals_elapsed), + ) = ( + indexes.regex, + indexes.custom_regex, + indexes.legal_forms, + indexes.triggers, + indexes.literals, + ); + /* The literal count is read from the built index rather than from the config. */ let literal_len = literals.len(); + record_search_index_prepare_stages( + &mut diagnostics, + &SearchIndexPrepareMetrics { + regex: (regex_len, regex_elapsed), + custom_regex: (custom_regex_len, custom_regex_elapsed), + legal_forms: (legal_form_len, legal_forms_elapsed), + triggers: (trigger_len, triggers_elapsed), + literals: (literal_len, literals_elapsed), + }, + ); + /* PrepareTotal reports the summed counts below and the time elapsed since `total_start`. */ record_prepare_total( + &mut diagnostics, + [ + regex_len, + custom_regex_len, + anchored_len, + legal_form_len, + trigger_len, + literal_len, + ], + total_start, + ); + + Ok(Self { + regex, + custom_regex, + legal_forms, + triggers, + literals, + allowed_labels, + threshold, + confidence_boost, + slices, + regex_meta: config.regex_meta, + custom_regex_meta: config.custom_regex_meta, + deny_list_data: config.deny_list_data, + false_positive_filters: config.false_positive_filters, + gazetteer_data: config.gazetteer_data, + country_data: config.country_data, + hotword_data: prepare_hotword_data(config.hotword_data)?, + trigger_data: prepare_trigger_data(config.trigger_data)?, + legal_form_data: config.legal_form_data.map(PreparedLegalFormData::new), + address_seed_data: prepare_address_seed_data(config.address_seed_data)?, + zone_data: prepare_zone_data(config.zone_data.as_ref())?, + address_context_data: prepare_address_context_data( + config.address_context_data, + )?, + coreference_data: prepare_coreference_data(config.coreference_data)?, + name_corpus_data:
config.name_corpus_data.map(PreparedNames::new), + date_data, + monetary_data, + monetary_extraction, + }) + } + + /** Runs every search index over `full_text` without collecting diagnostics. */ pub fn find_matches(&self, full_text: &str) -> Result { + self.find_matches_inner(full_text, None) + } + + /** Normalizes `full_text`, runs the regex, legal-form, trigger, custom-regex and literal indexes, and returns the matches. Each stage is timed and reported when `diagnostics` is `Some`. */ fn find_matches_inner( + &self, + full_text: &str, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result { + let total_start = Instant::now(); + let normalize_start = Instant::now(); + let normalized = normalize_for_search_with_byte_map(full_text)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_stage( + DiagnosticStage::Normalize, + None, + Some(elapsed_us(normalize_start)), + Some(full_text.len()), + ); + } + + /* Regex, trigger and custom-regex indexes search the raw text; pattern ids are shifted by the start of the matching slice. */ let regex_start = Instant::now(); + let regex = offset_matches( + self.regex.find_iter(full_text)?, + self.slices.regex.start, + )?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_search_matches( + DiagnosticStage::SearchRegex, + ®ex, + full_text, + Some(elapsed_us(regex_start)), + ); + } + + /* Legal forms are searched in the normalized text instead; `normalized_offset_matches` remaps the spans and shifts the pattern ids. */ let legal_form_start = Instant::now(); + let legal_forms = normalized_offset_matches( + &self.legal_forms, + &normalized, + self.slices.legal_forms.start, + )?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_search_matches( + DiagnosticStage::SearchLegalForm, + &legal_forms, + full_text, + Some(elapsed_us(legal_form_start)), + ); + } + + let trigger_start = Instant::now(); + let triggers = offset_matches( + self.triggers.find_iter(full_text)?, + self.slices.triggers.start, + )?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_search_matches( + DiagnosticStage::SearchTrigger, + &triggers, + full_text, + Some(elapsed_us(trigger_start)), + ); + } + + let custom_regex_start = Instant::now(); + let custom_regex = offset_matches( + self.custom_regex.find_iter(full_text)?, + self.slices.custom_regex.start, + )?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_search_matches( + DiagnosticStage::SearchCustomRegex, + &custom_regex, +
full_text, + Some(elapsed_us(custom_regex_start)), + ); + } + + let literal_start = Instant::now(); + /* Literals are searched in the normalized text; each hit is mapped back through `remap_normalized_match`. */ let literal = self + .literals + .find_iter(normalized.as_str())? + .into_iter() + .map(|found| remap_normalized_match(&normalized, found)) + .collect::>>()?; + /* Fold legal-form and trigger hits into the regex list. NOTE(review): this runs before the SearchLiteral elapsed time is read below, so that stage's timing includes the merge and sort; confirm that is intended. */ let regex = combine_regex_matches(regex, legal_forms, triggers); + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_search_matches( + DiagnosticStage::SearchLiteral, + &literal, + full_text, + Some(elapsed_us(literal_start)), + ); + /* FindMatches reports the saturating sum of all returned match counts. */ diagnostics.record_stage( + DiagnosticStage::FindMatches, + Some( + regex + .len() + .saturating_add(custom_regex.len()) + .saturating_add(literal.len()), + ), + Some(elapsed_us(total_start)), + Some(full_text.len()), + ); + } + + Ok(PreparedSearchMatches { + regex, + custom_regex, + literal, + }) + } + + /** Detects static entities in `full_text` without collecting diagnostics. */ pub fn detect_static_entities( + &self, + full_text: &str, + ) -> Result { + self.detect_static_entities_inner(full_text, None) + } + + /** Finds matches, runs the entity passes over them, records per-pass diagnostics when `diagnostics` is `Some`, and moves each pass's entities into a `StaticDetectionResult`. */ fn detect_static_entities_inner( + &self, + full_text: &str, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result { + let matches = + self.find_matches_inner(full_text, diagnostics.as_deref_mut())?; + let passes = self.process_static_entity_passes( + &matches, + full_text, + diagnostics.as_deref_mut(), + )?; + + if let Some(diagnostics) = &mut diagnostics { + record_static_entity_diagnostics(diagnostics, full_text, &passes); + } + + Ok(StaticDetectionResult { + matches, + regex_entities: passes.regex.entities, + custom_regex_entities: passes.custom_regex.entities, + deny_list_entities: passes.deny_list.entities, + gazetteer_entities: passes.gazetteer.entities, + country_entities: passes.country.entities, + anchored_entities: passes.anchored.entities, + trigger_entities: passes.trigger.entities, + signature_entities: passes.signature.entities, + legal_form_entities: passes.legal_form.entities, + address_seed_entities: passes.address_seed.entities, + name_corpus_entities: passes.name_corpus.entities, + }) +
} + + /** Runs each static entity pass over `matches` and `full_text`, timing every pass, and returns them together as `StaticEntityPasses`. */ fn process_static_entity_passes( + &self, + matches: &PreparedSearchMatches, + full_text: &str, + diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result { + /* Built-in and custom regex passes share `process_regex_matches`, each with its own slice and metadata. */ let regex_start = Instant::now(); + let regex = TimedEntities { + entities: process_regex_matches( + &matches.regex, + self.slices.regex, + full_text, + &self.regex_meta, + )?, + elapsed_us: elapsed_us(regex_start), + }; + + let custom_regex_start = Instant::now(); + let custom_regex = TimedEntities { + entities: process_regex_matches( + &matches.custom_regex, + self.slices.custom_regex, + full_text, + &self.custom_regex_meta, + )?, + elapsed_us: elapsed_us(custom_regex_start), + }; + + /* Deny-list and gazetteer passes both read the literal matches and yield nothing when their data is absent. */ let deny_list_start = Instant::now(); + let deny_list = TimedEntities { + entities: if let Some(data) = &self.deny_list_data { + process_deny_list_matches( + &matches.literal, + self.slices.deny_list, + full_text, + data, + )? + } else { + Vec::new() + }, + elapsed_us: elapsed_us(deny_list_start), + }; + + let gazetteer_start = Instant::now(); + let gazetteer = TimedEntities { + entities: if let Some(data) = &self.gazetteer_data { + process_gazetteer_matches( + &matches.literal, + self.slices.gazetteer, + full_text, + data, + )?
+ } else { + Vec::new() + }, + elapsed_us: elapsed_us(gazetteer_start), + }; + + let country = self.process_country_entities(matches, full_text)?; + + let anchored = self.process_anchored_entities(full_text)?; + + /* The trigger pass is the only one handed the diagnostics collector. */ let trigger = + self.process_trigger_entities(matches, full_text, diagnostics)?; + + let signature = process_signature_entities(full_text); + + let legal_form = self.process_legal_form_entities(matches, full_text)?; + + let name_corpus = + self.process_name_corpus_entities(full_text, &deny_list.entities)?; + + /* Address seeds are computed after the passes listed below because they take those passes' entities as context; the country pass is not part of that context. */ let address_seed = self.process_address_seed_entities( + matches, + full_text, + &[ + ®ex.entities, + &custom_regex.entities, + &anchored.entities, + &trigger.entities, + &signature.entities, + &legal_form.entities, + &deny_list.entities, + &gazetteer.entities, + &name_corpus.entities, + ], + )?; + + Ok(StaticEntityPasses { + regex, + custom_regex, + deny_list, + gazetteer, + country, + anchored, + trigger, + signature, + legal_form, + address_seed, + name_corpus, + }) + } + + /** Collects date entities and, only when `monetary_extraction` is set, monetary entities from `full_text`; returns them with the elapsed time. */ fn process_anchored_entities( + &self, + full_text: &str, + ) -> Result { + let anchored_start = Instant::now(); + let mut entities = Vec::new(); + if let Some(data) = &self.date_data { + entities.extend(data.process(full_text)?); + } + if self.monetary_extraction + && let Some(data) = &self.monetary_data + { + entities.extend(data.process(full_text)?); + } + + Ok(TimedEntities { + entities, + elapsed_us: elapsed_us(anchored_start), + }) + } + + /** Turns the trigger-slice matches in `matches.regex` into entities when trigger data is configured; otherwise yields no entities. */ fn process_trigger_entities( + &self, + matches: &PreparedSearchMatches, + full_text: &str, + diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result { + let start = Instant::now(); + let entities = if let Some(data) = &self.trigger_data { + process_trigger_matches( + &matches.regex, + self.slices.triggers, + full_text, + data, + diagnostics, + )?
+ } else { + Vec::new() + }; + + Ok(TimedEntities { + entities, + elapsed_us: elapsed_us(start), + }) + } + + /** Turns the legal-form-slice matches in `matches.regex` into entities when legal-form data is configured; otherwise yields no entities. */ fn process_legal_form_entities( + &self, + matches: &PreparedSearchMatches, + full_text: &str, + ) -> Result { + let start = Instant::now(); + let entities = if let Some(data) = &self.legal_form_data { + process_legal_form_matches( + &matches.regex, + self.slices.legal_forms, + full_text, + data, + )? + } else { + Vec::new() + }; + + Ok(TimedEntities { + entities, + elapsed_us: elapsed_us(start), + }) + } + + /** Runs the address-seed pass over the literal matches (street-types slice), passing a flattened copy of `context_layers` as the already-known entities. Yields no entities without address-seed data. */ fn process_address_seed_entities( + &self, + matches: &PreparedSearchMatches, + full_text: &str, + context_layers: &[&[PipelineEntity]], + ) -> Result { + let start = Instant::now(); + let entities = if let Some(data) = &self.address_seed_data { + /* The context is only materialized when address-seed data exists. */ let existing_entities = address_seed_context(context_layers); + data.process( + &matches.literal, + self.slices.street_types, + full_text, + &existing_entities, + )? + } else { + Vec::new() + }; + + Ok(TimedEntities { + entities, + elapsed_us: elapsed_us(start), + }) + } + + /** Turns the countries-slice literal matches into entities when country data is configured; otherwise yields no entities. */ fn process_country_entities( + &self, + matches: &PreparedSearchMatches, + full_text: &str, + ) -> Result { + let country_start = Instant::now(); + Ok(TimedEntities { + entities: if let Some(data) = &self.country_data { + process_country_matches( + &matches.literal, + self.slices.countries, + full_text, + data, + )? + } else { + Vec::new() + }, + elapsed_us: elapsed_us(country_start), + }) + } + + /** Runs `detect_supplemental` on the name corpus, given the deny-list entities, when name-corpus data is configured; otherwise yields no entities. */ fn process_name_corpus_entities( + &self, + full_text: &str, + deny_list_entities: &[PipelineEntity], + ) -> Result { + let start = Instant::now(); + let entities = if let Some(data) = &self.name_corpus_data { + data.detect_supplemental(full_text, deny_list_entities)?
+ } else { + Vec::new() + }; + + Ok(TimedEntities { + entities, + elapsed_us: elapsed_us(start), + }) + } + + /** Detects and redacts static entities in `full_text` using `operators`, without collecting diagnostics. */ pub fn redact_static_entities( + &self, + full_text: &str, + operators: &OperatorConfig, + ) -> Result { + self.redact_static_entities_inner(full_text, operators, None) + } + + /** Same as `redact_static_entities`, but records diagnostics into a fresh `StaticRedactionDiagnostics` and returns them with the result. */ pub fn redact_static_entities_with_diagnostics( + &self, + full_text: &str, + operators: &OperatorConfig, + ) -> Result { + let mut diagnostics = StaticRedactionDiagnostics::default(); + let result = self.redact_static_entities_inner( + full_text, + operators, + Some(&mut diagnostics), + )?; + + Ok(StaticRedactionDiagnosticResult { + result, + diagnostics, + }) + } + + /** Full redaction pipeline: detect, apply zone and hotword adjustments, filter by label and threshold, add address-context entities, merge, enforce boundaries, sanitize, drop false positives, add coreferences, then redact the text. Stages are timed and reported when `diagnostics` is `Some`. */ fn redact_static_entities_inner( + &self, + full_text: &str, + operators: &OperatorConfig, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result { + let detections = self + .detect_static_entities_inner(full_text, diagnostics.as_deref_mut())?; + let pre_threshold_entities = self.prepare_pre_threshold_entities( + &detections, + full_text, + diagnostics.as_deref_mut(), + )?; + let mut raw_entities = filter_entities_for_redaction( + pre_threshold_entities, + full_text, + self.threshold, + self.confidence_boost, + &self.allowed_labels, + )?; + /* Address-context entities are derived from the already-filtered entities and appended after that filter has run. */ let address_context_start = Instant::now(); + let address_context_entities = + self.process_address_context_entities(full_text, &raw_entities)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_entities( + DiagnosticStage::EntityAddressContext, + &address_context_entities, + full_text, + Some(elapsed_us(address_context_start)), + ); + } + raw_entities.extend(address_context_entities); + /* The Merge stage covers both deduplication and the monetary extension. */ let merge_start = Instant::now(); + let merged = merge_and_dedup(&raw_entities); + let merged = self.extend_monetary_entities(full_text, &merged); + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_entities( + DiagnosticStage::Merge, + &merged, + full_text, + Some(elapsed_us(merge_start)), + ); + } + let boundary_start = Instant::now(); + let
consistent = enforce_boundary_consistency(&merged, full_text)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_entities( + DiagnosticStage::Boundary, + &consistent, + full_text, + Some(elapsed_us(boundary_start)), + ); + } + /* NOTE(review): the Sanitize stage timing started here is read only after false-positive filtering, coreference processing and source-detail clearing, so it spans all of them. */ let sanitize_start = Instant::now(); + let sanitized_entities = + sanitize_entities_with_source(&consistent, full_text)?; + /* Prefer the top-level filters; fall back to the filters carried by the deny-list data. */ let false_positive_filters = + self.false_positive_filters.as_ref().or_else(|| { + self + .deny_list_data + .as_ref() + .and_then(|data| data.filters.as_ref()) + }); + let mut resolved_entities = filter_entities_for_config( + filter_entity_false_positives( + sanitized_entities, + full_text, + false_positive_filters, + )?, + self.threshold, + &self.allowed_labels, + ); + resolved_entities = self.process_coreference_entities( + full_text, + resolved_entities, + false_positive_filters, + diagnostics.as_deref_mut(), + )?; + /* Drop the internal AddressContext marker before the entities are returned. */ clear_internal_source_details(&mut resolved_entities); + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_entities( + DiagnosticStage::Sanitize, + &resolved_entities, + full_text, + Some(elapsed_us(sanitize_start)), + ); + } + let redaction_entities = resolved_entities + .iter() + .map(to_redaction_entity) + .collect::>(); + let redaction_start = Instant::now(); + let redaction = redact_text(full_text, &redaction_entities, operators)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_redaction( + &redaction, + Some(elapsed_us(redaction_start)), + full_text.len(), + ); + } + + Ok(StaticRedactionResult { + detections, + resolved_entities, + redaction, + }) + } + + /** Applies zone adjustments to all detected entities and then hotword rules to the adjusted set. */ fn prepare_pre_threshold_entities( + &self, + detections: &StaticDetectionResult, + full_text: &str, + diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result> { + let zone_adjusted_entities = self.apply_zone_adjustments( + detections.all_entities(), + full_text, + diagnostics, + )?; + self.apply_hotword_entities( + zone_adjusted_entities, + full_text, +
&detections.matches.literal, + ) + } + + /** Applies hotword rules to `entities` when hotword data is configured; otherwise returns them unchanged. `_literal_matches` is accepted but not used. */ fn apply_hotword_entities( + &self, + entities: Vec, + full_text: &str, + _literal_matches: &[SearchMatch], + ) -> Result> { + let Some(data) = &self.hotword_data else { + return Ok(entities); + }; + apply_hotword_rules(entities, full_text, data, &self.allowed_labels) + } + + /** Lets the zone data adjust `entities` when configured, reporting the `boosted` count and elapsed time to `diagnostics`; otherwise returns the entities unchanged. */ fn apply_zone_adjustments( + &self, + entities: Vec, + full_text: &str, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result> { + let Some(data) = &self.zone_data else { + return Ok(entities); + }; + + let start = Instant::now(); + let adjusted = data.adjust_entities(full_text, entities)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_stage( + DiagnosticStage::EntityZoneAdjustment, + Some(adjusted.boosted), + Some(elapsed_us(start)), + Some(full_text.len()), + ); + } + Ok(adjusted.entities) + } + + /** Produces address-context entities from `existing_entities`; returns an empty list when the "address" label is not allowed or no address-context data is configured. */ fn process_address_context_entities( + &self, + full_text: &str, + existing_entities: &[PipelineEntity], + ) -> Result> { + if !label_is_allowed("address", &self.allowed_labels) { + return Ok(Vec::new()); + } + let Some(data) = &self.address_context_data else { + return Ok(Vec::new()); + }; + data.process(full_text, existing_entities) + } + + /** Adds coreference entities to `existing_entities`. When coreference data is missing or produces nothing, the input is returned untouched; otherwise the combined set is merged and re-filtered. */ fn process_coreference_entities( + &self, + full_text: &str, + existing_entities: Vec, + false_positive_filters: Option<&DenyListFilterData>, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, + ) -> Result> { + let Some(data) = &self.coreference_data else { + return Ok(existing_entities); + }; + + let start = Instant::now(); + let coreference_entities = + data.process(full_text, &existing_entities, self.threshold)?; + if let Some(diagnostics) = &mut diagnostics { + diagnostics.record_entities( + DiagnosticStage::EntityCoreference, + &coreference_entities, + full_text, + Some(elapsed_us(start)), + ); + } + /* Nothing new found: skip the merge and re-filter work. */ if coreference_entities.is_empty() { + return Ok(existing_entities); + } + + let merged = + merge_and_dedup(&[existing_entities, coreference_entities].concat()); +
/* Re-run boundary, sanitize, false-positive and label filtering over the merged set; only labels are filtered here, the score threshold is not re-applied. */ let consistent = enforce_boundary_consistency(&merged, full_text)?; + let sanitized = sanitize_entities_with_source(&consistent, full_text)?; + let filtered = filter_entity_false_positives( + sanitized, + full_text, + false_positive_filters, + )?; + Ok(filter_entities_for_labels(filtered, &self.allowed_labels)) + } + + /** Lets the monetary data extend `entities` when configured; otherwise returns a copy of the input. */ fn extend_monetary_entities( + &self, + full_text: &str, + entities: &[PipelineEntity], + ) -> Vec { + let Some(data) = &self.monetary_data else { + return entities.to_vec(); + }; + data.extend_entities(full_text, entities) + } +} + +/** Returns true when the config has no regex patterns at all, or when any regex metadata entry carries the label "monetary amount". */ fn should_extract_monetary_data(config: &PreparedSearchConfig) -> bool { + config.regex_patterns.is_empty() + || config + .regex_meta + .iter() + .any(|meta| meta.label == "monetary amount") +} + +/** Runs `detect_signatures` on `full_text` and pairs the entities with the elapsed time. */ fn process_signature_entities(full_text: &str) -> TimedEntities { + let start = Instant::now(); + TimedEntities { + entities: detect_signatures(full_text), + elapsed_us: elapsed_us(start), + } +} + +/** Filters `entities` by allowed label first and then by score threshold. */ fn filter_entities_for_config( + entities: Vec, + threshold: f64, + allowed_labels: &[String], +) -> Vec { + filter_entities_for_threshold( + filter_entities_for_labels(entities, allowed_labels), + threshold, + ) +} + +/** Filters by allowed label, then either boosts near-miss entities (when `confidence_boost` is set) or applies the plain threshold filter. */ fn filter_entities_for_redaction( + entities: Vec, + full_text: &str, + threshold: f64, + confidence_boost: bool, + allowed_labels: &[String], +) -> Result> { + let entities = filter_entities_for_labels(entities, allowed_labels); + if confidence_boost { + return boost_near_miss_entities(entities, full_text, threshold); + } + Ok(filter_entities_for_threshold(entities, threshold)) +} + +/** Keeps entities whose label appears in `allowed_labels`; an empty list allows every label. */ fn filter_entities_for_labels( + entities: Vec, + allowed_labels: &[String], +) -> Vec { + entities + .into_iter() + .filter(|entity| { + allowed_labels.is_empty() + || allowed_labels.iter().any(|label| label == &entity.label) + }) + .collect() +} + +/** Returns true when `allowed_labels` is empty or contains `label`. */ fn label_is_allowed(label: &str, allowed_labels: &[String]) -> bool { + allowed_labels.is_empty() + || allowed_labels.iter().any(|allowed| allowed == label) +} + +fn
filter_entities_for_threshold( + entities: Vec, + threshold: f64, +) -> Vec { + /* Keep entities scoring at least `threshold`; entities tagged AddressContext are kept regardless of score. */ entities + .into_iter() + .filter(|entity| { + entity.score >= threshold + || entity.source_detail == Some(SourceDetail::AddressContext) + }) + .collect() +} + +/** Resets `source_detail` to `None` on every entity tagged `SourceDetail::AddressContext`; other values are left alone. */ fn clear_internal_source_details(entities: &mut [PipelineEntity]) { + for entity in entities { + if entity.source_detail == Some(SourceDetail::AddressContext) { + entity.source_detail = None; + } + } +} + +/** Threshold filter with a neighbour boost. Entities at or above `threshold` are kept as-is; entities below `threshold - NEAR_MISS_BAND` (floored at 0.0) are dropped; the rest gain `BOOST_PER_NEIGHBOUR` per nearby anchor and are kept only if that reaches the threshold, with the score capped at 1.0. Anchors are entities scoring at least `HIGH_CONFIDENCE_FLOOR`; nearby means midpoints no more than `CONTEXT_WINDOW_CHARS` apart. */ fn boost_near_miss_entities( + entities: Vec, + full_text: &str, + threshold: f64, +) -> Result> { + let near_miss_floor = f64::max(0.0, threshold - NEAR_MISS_BAND); + let byte_offsets = ByteOffsets::new(full_text); + let text_offsets = TextOffsetMap::new(full_text); + /* NOTE(review): anchors are drawn from every entity, so a near-miss entity that itself scores at least HIGH_CONFIDENCE_FLOOR counts as its own neighbour; confirm that is intended. */ let anchors = entities + .iter() + .filter(|entity| entity.score >= HIGH_CONFIDENCE_FLOOR) + .map(|entity| entity_midpoint(entity, &byte_offsets, &text_offsets)) + .collect::>>()?; + + let mut boosted = Vec::with_capacity(entities.len()); + for mut entity in entities { + if entity.score >= threshold { + boosted.push(entity); + continue; + } + if entity.score < near_miss_floor { + continue; + } + + let midpoint = entity_midpoint(&entity, &byte_offsets, &text_offsets)?; + let neighbours = anchors + .iter() + .filter(|anchor| (midpoint - **anchor).abs() <= CONTEXT_WINDOW_CHARS) + .count(); + /* A count that does not fit in u32 saturates to u32::MAX. */ let neighbour_count = u32::try_from(neighbours).unwrap_or(u32::MAX); + let boosted_score = + f64::from(neighbour_count).mul_add(BOOST_PER_NEIGHBOUR, entity.score); + if boosted_score < threshold { + continue; + } + + entity.score = f64::min(1.0, boosted_score); + boosted.push(entity); + } + + Ok(boosted) +} + +/** Midpoint of the entity's `start` and `end` after both are converted through `TextOffsetMap::offset_for`. */ fn entity_midpoint( + entity: &PipelineEntity, + byte_offsets: &ByteOffsets<'_>, + text_offsets: &TextOffsetMap, +) -> Result { + let start = text_offsets.offset_for(byte_offsets, entity.start)?; + let end = text_offsets.offset_for(byte_offsets, entity.end)?; + Ok(f64::midpoint(start, end)) +} + +/** Lookup table used to turn a validated byte offset into its position in the list of char-boundary byte offsets of a text. */ struct TextOffsetMap { + byte_offsets: Vec, +} + +impl TextOffsetMap { + fn
new(full_text: &str) -> Self { + /* Byte offset of every char start, in ascending order, followed by the text length as the end boundary. */ let mut byte_offsets = full_text + .char_indices() + .map(|(byte_offset, _)| byte_offset) + .collect::>(); + byte_offsets.push(full_text.len()); + Self { byte_offsets } + } + + /** Validates `offset` through `byte_offsets`, then returns its index in the char-boundary table as an `f64`. An offset that is not a char boundary yields `Error::ByteOffsetInsideCodepoint`. */ fn offset_for( + &self, + byte_offsets: &ByteOffsets<'_>, + offset: u32, + ) -> Result { + let byte_offset = byte_offsets.validate_offset(offset)?; + let index = self + .byte_offsets + .binary_search(&byte_offset) + .map_err(|_| Error::ByteOffsetInsideCodepoint { offset })?; + /* The index must fit in u32 so the conversion to f64 below is lossless. */ let index = u32::try_from(index) + .map_err(|_| Error::ByteOffsetOutOfBounds { offset: u32::MAX })?; + Ok(f64::from(index)) + } +} + +/** Records one entity diagnostic stage per static pass, each with that pass's entities and elapsed time. */ fn record_static_entity_diagnostics( + diagnostics: &mut StaticRedactionDiagnostics, + full_text: &str, + passes: &StaticEntityPasses, +) { + diagnostics.record_entities( + DiagnosticStage::EntityRegex, + &passes.regex.entities, + full_text, + Some(passes.regex.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityCustomRegex, + &passes.custom_regex.entities, + full_text, + Some(passes.custom_regex.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityDenyList, + &passes.deny_list.entities, + full_text, + Some(passes.deny_list.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityGazetteer, + &passes.gazetteer.entities, + full_text, + Some(passes.gazetteer.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityCountry, + &passes.country.entities, + full_text, + Some(passes.country.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityAnchored, + &passes.anchored.entities, + full_text, + Some(passes.anchored.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityTrigger, + &passes.trigger.entities, + full_text, + Some(passes.trigger.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntitySignature, + &passes.signature.entities, + full_text, + Some(passes.signature.elapsed_us), + ); + diagnostics.record_entities( +
DiagnosticStage::EntityLegalForm, + &passes.legal_form.entities, + full_text, + Some(passes.legal_form.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityNameCorpus, + &passes.name_corpus.entities, + full_text, + Some(passes.name_corpus.elapsed_us), + ); + diagnostics.record_entities( + DiagnosticStage::EntityAddressSeed, + &passes.address_seed.entities, + full_text, + Some(passes.address_seed.elapsed_us), + ); +} + +/** Flattens `layers` into one owned list by cloning every entity, preserving layer order. */ fn address_seed_context(layers: &[&[PipelineEntity]]) -> Vec { + /* Pre-size the output; the capacity sum saturates instead of overflowing. */ let capacity = layers + .iter() + .map(|layer| layer.len()) + .fold(0usize, usize::saturating_add); + let mut entities = Vec::with_capacity(capacity); + for layer in layers { + entities.extend(layer.iter().cloned()); + } + entities +} + +/** Microseconds elapsed since `start`, saturating at `u64::MAX`. */ fn elapsed_us(start: Instant) -> u64 { + let micros = start.elapsed().as_micros(); + u64::try_from(micros).unwrap_or(u64::MAX) +} + +/** Packs the config's pattern groups and options into `SearchIndexBuildInputs` and builds the indexes. Trigger patterns pass through `promote_case_insensitive_literals` first. */ fn build_search_indexes_for_config( + regex_groups: RegexPatternGroups, + regex_options: SearchOptions, + custom_regex_patterns: Vec, + custom_regex_options: SearchOptions, + literal_patterns: Vec, + literal_options: SearchOptions, + artifacts: Option<&PreparedSearchArtifacts>, +) -> Result { + build_search_indexes( + SearchIndexBuildInputs { + regex_patterns: regex_groups.regex, + regex_options, + custom_regex_patterns, + custom_regex_options, + legal_form_patterns: regex_groups.legal_forms, + trigger_patterns: promote_case_insensitive_literals( + regex_groups.triggers, + ), + literal_patterns, + literal_options, + }, + artifacts, + ) +} + +/** Builds the regex, custom-regex, legal-form, trigger and literal indexes, each with the matching part of `artifacts` when artifacts are supplied. */ fn build_search_indexes( + inputs: SearchIndexBuildInputs, + artifacts: Option<&PreparedSearchArtifacts>, +) -> Result { + let SearchIndexBuildInputs { + regex_patterns, + regex_options, + custom_regex_patterns, + custom_regex_options, + legal_form_patterns, + trigger_patterns, + literal_patterns, + literal_options, + } = inputs; + + /* Borrow each index's artifact section separately so it can be moved into its own builder closure. */ let regex_artifacts = artifacts.map(|value| &value.regex); + let custom_regex_artifacts = artifacts.map(|value| &value.custom_regex);
+ let legal_form_artifacts = artifacts.map(|value| &value.legal_forms); + let trigger_artifacts = artifacts.map(|value| &value.triggers); + let literal_artifacts = artifacts.map(|value| &value.literals); + + /* Each index is built on its own scoped thread. Results are joined in a fixed order, so the first failure in that order is the error returned. Legal-form and trigger indexes use fixed options rather than config-supplied ones. */ std::thread::scope(|scope| { + let regex = scope.spawn(move || { + build_search_index(regex_patterns, regex_options, regex_artifacts) + }); + let custom_regex = scope.spawn(move || { + build_search_index( + custom_regex_patterns, + custom_regex_options, + custom_regex_artifacts, + ) + }); + let legal_forms = scope.spawn(move || { + build_search_index( + legal_form_patterns, + legal_form_search_options(), + legal_form_artifacts, + ) + }); + let triggers = scope.spawn(move || { + build_search_index( + trigger_patterns, + trigger_search_options(), + trigger_artifacts, + ) + }); + let literals = scope.spawn(move || { + build_search_index(literal_patterns, literal_options, literal_artifacts) + }); + + Ok(PreparedSearchIndexes { + regex: join_search_index(regex, "regex")?, + custom_regex: join_search_index(custom_regex, "custom_regex")?, + legal_forms: join_search_index(legal_forms, "legal_forms")?, + triggers: join_search_index(triggers, "triggers")?, + literals: join_search_index(literals, "literals")?, + }) + }) +} + +/** Builds one `SearchIndex`, from artifacts when they are supplied and from the patterns alone otherwise, and returns it with the build time in microseconds. */ fn build_search_index( + patterns: Vec, + options: SearchOptions, + artifacts: Option<&SearchIndexArtifacts>, +) -> Result { + let start = Instant::now(); + let search = if let Some(artifacts) = artifacts { + SearchIndex::new_with_artifacts(patterns, options, artifacts)? + } else { + SearchIndex::new(patterns, options)? + }; + Ok((search, elapsed_us(start))) +} + +/** Joins a builder thread. A panic in the thread becomes `Error::InvalidStaticData` for `field`; otherwise the builder's own result is returned. */ fn join_search_index( + handle: std::thread::ScopedJoinHandle<'_, Result>, + field: &'static str, +) -> Result { + handle.join().map_err(|_| Error::InvalidStaticData { + field, + reason: "search index builder panicked".to_owned(), + })?
+} + +/** Records one prepare stage with its pattern count and elapsed microseconds; does nothing when `diagnostics` is `None`. */ fn record_prepare_stage_elapsed( + diagnostics: &mut Option<&mut StaticRedactionDiagnostics>, + stage: DiagnosticStage, + count: usize, + elapsed_us: u64, +) { + if let Some(diagnostics) = diagnostics { + diagnostics.record_stage(stage, Some(count), Some(elapsed_us), None); + } +} + +/** Records the five search-index prepare stages from `metrics`, where each entry is a (count, elapsed) pair. */ fn record_search_index_prepare_stages( + diagnostics: &mut Option<&mut StaticRedactionDiagnostics>, + metrics: &SearchIndexPrepareMetrics, +) { + let stages = [ + (DiagnosticStage::PrepareRegex, metrics.regex), + (DiagnosticStage::PrepareCustomRegex, metrics.custom_regex), + (DiagnosticStage::PrepareLegalFormSearch, metrics.legal_forms), + (DiagnosticStage::PrepareTriggerSearch, metrics.triggers), + (DiagnosticStage::PrepareLiteral, metrics.literals), + ]; + for (stage, (count, elapsed)) in stages { + record_prepare_stage_elapsed(diagnostics, stage, count, elapsed); + } +} + +/** Records the PrepareTotal stage with the saturating sum of `counts` and the time elapsed since `start`; does nothing when `diagnostics` is `None`. */ fn record_prepare_total( + diagnostics: &mut Option<&mut StaticRedactionDiagnostics>, + counts: [usize; 6], + start: Instant, +) { + let Some(diagnostics) = diagnostics else { + return; + }; + let count = counts.into_iter().fold(0usize, usize::saturating_add); + diagnostics.record_stage( + DiagnosticStage::PrepareTotal, + Some(count), + Some(elapsed_us(start)), + None, + ); +} + +/** Size figure reported for the anchored stage: the total number of month names across languages plus the number of currency codes, symbols and local names. Missing data counts as zero. */ fn anchored_config_len( + date_data: Option<&DateData>, + monetary_data: Option<&MonetaryData>, +) -> usize { + let date_len = date_data.map_or(0, |data| { + data.month_names_by_language.values().map(Vec::len).sum() + }); + let monetary_len = monetary_data.map_or(0, |data| { + data + .currencies + .codes + .len() + .saturating_add(data.currencies.symbols.len()) + .saturating_add(data.currencies.local_names.len()) + }); + date_len.saturating_add(monetary_len) +} + +/** Prepares the date and monetary data (each `None` when its input is absent) and records the PrepareAnchored stage with `anchored_len` when `diagnostics` is `Some`. */ fn prepare_anchored_data( + date_data: Option<&DateData>, + monetary_data: Option, + anchored_len: usize, + diagnostics: Option<&mut StaticRedactionDiagnostics>, +) -> Result<(Option, Option)> { + let anchored_start = Instant::now(); + let prepared_date = if let Some(data)
= date_data { + PreparedDateData::new(data)? + } else { + None + }; + let prepared_monetary = if let Some(data) = monetary_data { + PreparedMonetaryData::new(data)? + } else { + None + }; + + if let Some(diagnostics) = diagnostics { + diagnostics.record_stage( + DiagnosticStage::PrepareAnchored, + Some(anchored_len), + Some(elapsed_us(anchored_start)), + None, + ); + } + + Ok((prepared_date, prepared_monetary)) +} + +/** Prepares the address-seed data when present; a constructor error is propagated and absent data stays `None`. */ fn prepare_address_seed_data( + data: Option, +) -> Result> { + data.map(PreparedAddressSeedData::new).transpose() +} + +/** Prepares the hotword data when present; a constructor error is propagated and absent data stays `None`. */ fn prepare_hotword_data( + data: Option, +) -> Result> { + data.map(PreparedHotwordData::new).transpose() +} + +/** Prepares the trigger data when present; a constructor error is propagated and absent data stays `None`. */ fn prepare_trigger_data( + data: Option, +) -> Result> { + data.map(PreparedTriggerData::new).transpose() +} + +/** Prepares the address-context data when present; a constructor error is propagated and absent data stays `None`. */ fn prepare_address_context_data( + data: Option, +) -> Result> { + data.map(PreparedAddressContextData::new).transpose() +} + +/** Prepares the zone data from a borrowed config value when present; a constructor error is propagated and absent data stays `None`. */ fn prepare_zone_data( + data: Option<&ZoneData>, +) -> Result> { + data.map(PreparedZoneData::new).transpose() +} + +/** Prepares the coreference data when present; a constructor error is propagated and absent data stays `None`. */ fn prepare_coreference_data( + data: Option, +) -> Result> { + data.map(PreparedCoreferenceData::new).transpose() +} + +/** Splits `patterns` by index into legal-form, trigger and plain regex groups according to `slices`, keeping the original order within each group. */ fn split_regex_patterns( + patterns: Vec, + slices: &PreparedSearchSlices, +) -> Result { + let mut regex = Vec::new(); + let mut legal_forms = Vec::new(); + let mut triggers = Vec::new(); + + for (index, pattern) in patterns.into_iter().enumerate() { + let pattern_index = u32::try_from(index) + .map_err(|_| Error::PatternIndexOutOfRange { index })?; + /* The legal-forms slice is checked first, so it wins if the slices overlap. */ if slices.legal_forms.contains(pattern_index) { + legal_forms.push(pattern); + continue; + } + if slices.triggers.contains(pattern_index) { + triggers.push(pattern); + continue; + } + regex.push(pattern); + } + + Ok(RegexPatternGroups { + regex, + legal_forms, + triggers, + }) +} + +/** Options for the legal-form index: the `SearchOptions` defaults. */ fn legal_form_search_options() -> SearchOptions { + SearchOptions::default() +} + +/** Options for the trigger index: defaults, except that literal search is case-insensitive and not restricted to whole words. */ fn trigger_search_options() -> SearchOptions { + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false,
+ }, + ..SearchOptions::default() + } +} + +/** Rewrites `LiteralWithOptions` entries that are explicitly case-insensitive and not explicitly whole-word into plain `Literal` entries; every other pattern passes through unchanged. */ fn promote_case_insensitive_literals( + patterns: Vec, +) -> Vec { + patterns + .into_iter() + .map(|entry| match entry { + SearchPattern::LiteralWithOptions { + pattern: value, + case_insensitive: Some(true), + whole_words, + } if whole_words != Some(true) => SearchPattern::Literal(value), + other => other, + }) + .collect() +} + +/** Shifts the pattern id of every match by `offset`; a zero offset returns the input untouched. */ fn offset_matches( + matches: Vec, + offset: u32, +) -> Result> { + if offset == 0 { + return Ok(matches); + } + + matches + .into_iter() + .map(|found| offset_match(found, offset)) + .collect() +} + +/** Searches the normalized text, remaps each hit's span through `remap_normalized_match`, then shifts its pattern id by `offset`. */ fn normalized_offset_matches( + search: &SearchIndex, + normalized: &NormalizedSearchText, + offset: u32, +) -> Result> { + search + .find_iter(normalized.as_str())? + .into_iter() + .map(|found| remap_normalized_match(normalized, found)) + .map(|found| found.and_then(|value| offset_match(value, offset))) + .collect() +} + +/** Rebuilds `found` with its pattern id increased by `offset`, keeping the variant and its other fields. Overflow of the id yields `Error::PatternIndexNotAddressable`. */ fn offset_match(found: SearchMatch, offset: u32) -> Result { + let pattern = found.pattern().checked_add(offset).ok_or_else(|| { + Error::PatternIndexNotAddressable { + pattern: found.pattern(), + } + })?; + + Ok(match found { + SearchMatch::Literal { start, end, .. } => SearchMatch::Literal { + pattern, + start, + end, + }, + SearchMatch::Regex { start, end, .. } => SearchMatch::Regex { + pattern, + start, + end, + }, + SearchMatch::Fuzzy { + start, + end, + distance, + ..
+ } => SearchMatch::Fuzzy { + pattern, + start, + end, + distance, + }, + }) +} + +/** Appends the legal-form and trigger matches to `regex` and returns the combined list sorted with `sort_matches`. */ fn combine_regex_matches( + mut regex: Vec, + legal_forms: Vec, + triggers: Vec, +) -> Vec { + regex.extend(legal_forms); + regex.extend(triggers); + sort_matches(&mut regex); + regex +} + +/** Sorts matches in place by start, then end, then pattern id. */ fn sort_matches(matches: &mut [SearchMatch]) { + matches.sort_by(|left, right| { + left + .start() + .cmp(&right.start()) + .then_with(|| left.end().cmp(&right.end())) + .then_with(|| left.pattern().cmp(&right.pattern())) + }); +} + +/** Replaces the span of `found` with the span returned by `normalized.map_span` for its start and end. */ fn remap_normalized_match( + normalized: &NormalizedSearchText, + found: SearchMatch, +) -> Result { + let (start, end) = normalized.map_span(found.start(), found.end())?; + Ok(found.with_span(start, end)) +} + +/** Runs every config validator in sequence and returns the first error. */ fn validate_supported_config( + config: &PreparedSearchConfig, + allow_literal_artifacts: bool, +) -> Result<()> { + validate_search_config(config, allow_literal_artifacts)?; + validate_legal_form_config(config)?; + validate_trigger_config(config)?; + validate_deny_list_config(config)?; + validate_gazetteer_config(config)?; + validate_country_config(config)?; + validate_hotword_config(config)?; + validate_address_seed_config(config) +} + +/** Checks every slice against the length of the pattern list it indexes, then checks the regex metadata lengths. */ fn validate_search_config( + config: &PreparedSearchConfig, + allow_literal_artifacts: bool, +) -> Result<()> { + /* The regex, legal-form and trigger slices all index into `regex_patterns`. */ validate_slice_bounds( + "slices.regex", + config.slices.regex, + config.regex_patterns.len(), + )?; + validate_slice_bounds( + "slices.legal_forms", + config.slices.legal_forms, + config.regex_patterns.len(), + )?; + validate_slice_bounds( + "slices.triggers", + config.slices.triggers, + config.regex_patterns.len(), + )?; + validate_slice_bounds( + "slices.custom_regex", + config.slices.custom_regex, + config.custom_regex_patterns.len(), + )?; + /* Literal slice bounds are skipped only when literal artifacts are allowed and the config carries no literal patterns. */ if !allow_literal_artifacts || !config.literal_patterns.is_empty() { + validate_slice_bounds( + "slices.deny_list", + config.slices.deny_list, + config.literal_patterns.len(), + )?; + validate_slice_bounds( + "slices.street_types", + config.slices.street_types, +
config.literal_patterns.len(), + )?; + validate_slice_bounds( + "slices.gazetteer", + config.slices.gazetteer, + config.literal_patterns.len(), + )?; + validate_slice_bounds( + "slices.countries", + config.slices.countries, + config.literal_patterns.len(), + )?; + validate_slice_bounds( + "slices.hotwords", + config.slices.hotwords, + config.literal_patterns.len(), + )?; + } + validate_static_data_length( + "regex_meta", + config.slices.regex, + config.regex_meta.len(), + )?; + validate_static_data_length( + "custom_regex_meta", + config.slices.custom_regex, + config.custom_regex_meta.len(), + ) +} + +fn validate_slice_bounds( + field: &'static str, + slice: PatternSlice, + pattern_count: usize, +) -> Result<()> { + if slice.start > slice.end { + return Err(Error::InvalidStaticData { + field, + reason: "slice start exceeds slice end".to_owned(), + }); + } + let Some(end) = usize::try_from(slice.end).ok() else { + return Err(Error::InvalidStaticData { + field, + reason: "slice end exceeds usize range".to_owned(), + }); + }; + if end <= pattern_count { + return Ok(()); + } + Err(Error::InvalidStaticData { + field, + reason: format!("slice end {end} exceeds pattern count {pattern_count}"), + }) +} + +fn validate_legal_form_config(config: &PreparedSearchConfig) -> Result<()> { + if config.slices.legal_forms.is_empty() { + return Ok(()); + } + + let Some(data) = &config.legal_form_data else { + return Err(Error::MissingStaticData { + field: "legal_form_data", + }); + }; + + validate_static_data_length( + "legal_form_data.suffixes", + config.slices.legal_forms, + data.suffixes.len(), + ) +} + +fn validate_deny_list_config(config: &PreparedSearchConfig) -> Result<()> { + if config.slices.deny_list.is_empty() { + return Ok(()); + } + + let Some(data) = &config.deny_list_data else { + return Err(Error::UnsupportedStaticSlice { slice: "deny_list" }); + }; + + validate_static_data_length( + "deny_list.labels", + config.slices.deny_list, + data.labels.len(), + )?; + 
validate_static_data_length( + "deny_list.custom_labels", + config.slices.deny_list, + data.custom_labels.len(), + )?; + validate_static_data_length( + "deny_list.originals", + config.slices.deny_list, + data.originals.len(), + )?; + validate_static_data_length( + "deny_list.sources", + config.slices.deny_list, + data.sources.len(), + )?; + ensure_supported_deny_list_sources(data) +} + +fn validate_gazetteer_config(config: &PreparedSearchConfig) -> Result<()> { + if config.slices.gazetteer.is_empty() { + return Ok(()); + } + + let Some(data) = &config.gazetteer_data else { + return Err(Error::MissingStaticData { + field: "gazetteer_data", + }); + }; + + validate_static_data_length( + "gazetteer_data.labels", + config.slices.gazetteer, + data.labels.len(), + )?; + validate_static_data_length( + "gazetteer_data.is_fuzzy", + config.slices.gazetteer, + data.is_fuzzy.len(), + ) +} + +fn validate_country_config(config: &PreparedSearchConfig) -> Result<()> { + if config.slices.countries.is_empty() { + return Ok(()); + } + + let Some(data) = &config.country_data else { + return Err(Error::MissingStaticData { + field: "country_data", + }); + }; + + validate_static_data_length( + "country_data.labels", + config.slices.countries, + data.labels.len(), + ) +} + +fn validate_hotword_config(config: &PreparedSearchConfig) -> Result<()> { + if !config.slices.hotwords.is_empty() { + return Err(Error::UnsupportedStaticSlice { slice: "hotwords" }); + } + + let Some(data) = &config.hotword_data else { + return Ok(()); + }; + + for rule in &data.rules { + if rule.hotwords.is_empty() { + return Err(Error::InvalidStaticData { + field: "hotword_data.rules.hotwords", + reason: String::from("native hotword rules require hotword strings"), + }); + } + for hotword in &rule.hotwords { + if hotword.is_empty() { + return Err(Error::InvalidStaticData { + field: "hotword_data.rules.hotwords", + reason: String::from("hotword must not be empty"), + }); + } + } + } + + Ok(()) +} + +const fn 
validate_address_seed_config( + config: &PreparedSearchConfig, +) -> Result<()> { + if config.slices.street_types.is_empty() { + return Ok(()); + } + + if config.address_seed_data.is_some() { + return Ok(()); + } + + Err(Error::MissingStaticData { + field: "address_seed_data", + }) +} + +fn validate_trigger_config(config: &PreparedSearchConfig) -> Result<()> { + if config.slices.triggers.is_empty() { + return Ok(()); + } + + let Some(data) = &config.trigger_data else { + return Err(Error::MissingStaticData { + field: "trigger_data", + }); + }; + + validate_static_data_length( + "trigger_data.rules", + config.slices.triggers, + data.rules.len(), + ) +} + +fn validate_static_data_length( + field: &'static str, + slice: PatternSlice, + actual: usize, +) -> Result<()> { + let expected = usize::try_from(slice.len()).map_err(|_| { + Error::StaticDataLengthMismatch { + field, + expected: usize::MAX, + actual, + } + })?; + if actual == expected { + return Ok(()); + } + + Err(Error::StaticDataLengthMismatch { + field, + expected, + actual, + }) +} + +impl StaticDetectionResult { + #[must_use] + pub fn all_entities(&self) -> Vec { + let capacity = self + .regex_entities + .len() + .saturating_add(self.custom_regex_entities.len()) + .saturating_add(self.deny_list_entities.len()) + .saturating_add(self.gazetteer_entities.len()) + .saturating_add(self.country_entities.len()) + .saturating_add(self.anchored_entities.len()) + .saturating_add(self.trigger_entities.len()) + .saturating_add(self.signature_entities.len()) + .saturating_add(self.legal_form_entities.len()) + .saturating_add(self.address_seed_entities.len()) + .saturating_add(self.name_corpus_entities.len()); + let mut entities = Vec::with_capacity(capacity); + entities.extend(self.regex_entities.iter().cloned()); + entities.extend(self.custom_regex_entities.iter().cloned()); + entities.extend(self.deny_list_entities.iter().cloned()); + entities.extend(self.gazetteer_entities.iter().cloned()); + 
entities.extend(self.country_entities.iter().cloned()); + entities.extend(self.anchored_entities.iter().cloned()); + entities.extend(self.trigger_entities.iter().cloned()); + entities.extend(self.signature_entities.iter().cloned()); + entities.extend(self.legal_form_entities.iter().cloned()); + entities.extend(self.address_seed_entities.iter().cloned()); + entities.extend(self.name_corpus_entities.iter().cloned()); + entities + } +} + +fn to_redaction_entity(entity: &PipelineEntity) -> Entity { + match &entity.kind { + EntityKind::Detected => Entity::detected( + entity.start, + entity.end, + entity.label.clone(), + entity.text.clone(), + ), + EntityKind::Coreference { source_text } => Entity::coreference( + entity.start, + entity.end, + entity.label.clone(), + entity.text.clone(), + source_text.clone(), + ), + } +} diff --git a/crates/anonymize-core/src/processors.rs b/crates/anonymize-core/src/processors.rs new file mode 100644 index 00000000..dd644ce6 --- /dev/null +++ b/crates/anonymize-core/src/processors.rs @@ -0,0 +1,1930 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::byte_offsets::ByteOffsets; +use crate::resolution::{DetectionSource, PipelineEntity, SourceDetail}; +use crate::types::{Error, Result, SearchMatch}; +use crate::validators::validate_id; + +const GAZETTEER_EXACT_SCORE: f64 = 0.9; +const GAZETTEER_FUZZY_SCORE: f64 = 0.85; +const COUNTRY_SCORE: f64 = 0.95; +const DENY_LIST_SCORE: f64 = 0.9; +const MAX_GAZETTEER_PREFIX_OVERSHOOT: u32 = 7; +pub(crate) const CUSTOM_DENY_LIST_SOURCE: &str = "custom-deny-list"; +const DENY_LIST_SOURCE: &str = "deny-list"; +const CITY_SOURCE: &str = "city"; +const FIRST_NAME_SOURCE: &str = "first-name"; +const SURNAME_SOURCE: &str = "surname"; +const TITLE_SOURCE: &str = "title"; +const PERSON_LABEL: &str = "person"; +const ADDRESS_LABEL: &str = "address"; + +#[derive( + Clone, + Copy, + Debug, + Default, + Eq, + PartialEq, + serde::Deserialize, + serde::Serialize, +)] +pub struct PatternSlice { + pub 
start: u32, + pub end: u32, +} + +impl PatternSlice { + #[must_use] + pub const fn is_empty(self) -> bool { + self.start >= self.end + } + + #[must_use] + pub const fn len(self) -> u32 { + self.end.saturating_sub(self.start) + } + + #[must_use] + pub const fn contains(self, pattern: u32) -> bool { + pattern >= self.start && pattern < self.end + } + + pub(crate) fn local_index(self, pattern: u32) -> Option { + if !self.contains(pattern) { + return None; + } + usize::try_from(pattern.saturating_sub(self.start)).ok() + } +} + +#[derive(Clone, Debug, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct RegexMatchMeta { + pub label: String, + pub score: f64, + pub source_detail: Option, + pub requires_validation: bool, + pub validator_id: Option, + pub validator_input: Option, + pub min_byte_length: Option, +} + +impl RegexMatchMeta { + #[must_use] + pub fn new(label: impl Into, score: f64) -> Self { + Self { + label: label.into(), + score, + source_detail: None, + requires_validation: false, + validator_id: None, + validator_input: None, + min_byte_length: None, + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct GazetteerMatchData { + pub labels: Vec, + pub is_fuzzy: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct CountryMatchData { + pub labels: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct DenyListMatchData { + pub labels: StringGroups, + pub custom_labels: StringGroups, + pub originals: Vec, + pub sources: StringGroups, + pub filters: Option, +} + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct StringGroups { + table: Vec, + groups: Vec>, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct StringGroup<'a> { + table: &'a [String], + indexes: &'a [u32], +} + +impl StringGroups { + #[must_use] + pub fn from_groups(groups: Vec>) -> Self { 
+ let mut table = Vec::new(); + let mut table_indexes = BTreeMap::::new(); + let groups = groups + .into_iter() + .map(|group| { + group + .into_iter() + .map(|value| { + string_table_index(value, &mut table, &mut table_indexes) + }) + .collect() + }) + .collect(); + + Self { table, groups } + } + + pub fn from_table_indices( + table: Vec, + groups: Vec>, + field: &'static str, + ) -> Result { + for group in &groups { + for &index in group { + let Ok(index) = usize::try_from(index) else { + return Err(Error::InvalidStaticData { + field, + reason: String::from("string table index exceeds usize range"), + }); + }; + if index >= table.len() { + return Err(Error::InvalidStaticData { + field, + reason: String::from("string table index out of range"), + }); + } + } + } + + Ok(Self { table, groups }) + } + + #[must_use] + pub fn empty_groups(len: usize) -> Self { + Self { + table: Vec::new(), + groups: vec![Vec::new(); len], + } + } + + #[must_use] + pub const fn len(&self) -> usize { + self.groups.len() + } + + #[must_use] + pub const fn is_empty(&self) -> bool { + self.groups.is_empty() + } + + #[must_use] + pub fn get(&self, index: usize) -> Option> { + Some(StringGroup { + table: &self.table, + indexes: self.groups.get(index)?, + }) + } + + pub fn iter(&self) -> impl Iterator> { + self.groups.iter().map(|indexes| StringGroup { + table: &self.table, + indexes, + }) + } +} + +impl From>> for StringGroups { + fn from(groups: Vec>) -> Self { + Self::from_groups(groups) + } +} + +impl<'a> StringGroup<'a> { + #[must_use] + pub const fn is_empty(self) -> bool { + self.indexes.is_empty() + } + + pub fn iter(self) -> impl Iterator + 'a { + self + .indexes + .iter() + .filter_map(|index| usize::try_from(*index).ok()) + .filter_map(|index| self.table.get(index)) + .map(String::as_str) + } + + #[must_use] + pub fn contains(self, value: &str) -> bool { + self.iter().any(|entry| entry == value) + } + + #[must_use] + pub fn to_strings(self) -> Vec { + 
self.iter().map(String::from).collect() + } +} + +fn string_table_index( + value: String, + table: &mut Vec, + table_indexes: &mut BTreeMap, +) -> u32 { + if let Some(index) = table_indexes.get(&value) { + return *index; + } + let index = u32::try_from(table.len()).unwrap_or(u32::MAX); + table_indexes.insert(value.clone(), index); + table.push(value); + index +} + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct DenyListFilterData { + pub stopwords: BTreeSet, + pub allow_list: BTreeSet, + pub person_stopwords: BTreeSet, + pub person_trailing_nouns: BTreeSet, + pub address_stopwords: BTreeSet, + pub address_jurisdiction_prefixes: BTreeSet, + pub street_types: BTreeSet, + pub address_component_terms: BTreeSet, + pub ambiguous_street_type_terms: BTreeSet, + pub first_names: BTreeSet, + pub generic_roles: BTreeSet, + pub number_abbrev_prefixes: BTreeSet, + pub sentence_starters: BTreeSet, + pub trailing_address_word_exclusions: BTreeSet, + pub document_heading_words: BTreeSet, + pub document_heading_ordinal_markers: BTreeSet, + pub defined_term_cues: BTreeSet, + pub signing_place_guards: Vec, +} + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct SigningPlaceGuardData { + pub prefix_phrases: BTreeSet, + pub suffix_phrases: BTreeSet, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +struct RawDenyListMatch { + pattern: usize, + start: u32, + end: u32, + labels: Vec, + custom_labels: Vec, + has_person_name_source: bool, + text: String, +} + +pub fn process_regex_matches( + matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + meta: &[RegexMatchMeta], +) -> Result> { + let offsets = ByteOffsets::new(full_text); + let mut results = Vec::new(); + + for found in matches { + let pattern = found.pattern(); + let Some(local_index) = slice.local_index(pattern) else { + continue; + }; + let Some(entry) = meta.get(local_index) else { + continue; + }; + let text 
= offsets.slice(found.start(), found.end())?; + if let Some(validator_id) = &entry.validator_id { + if !validate_id(validator_id, &text, entry.validator_input.as_deref()) { + continue; + } + } else if entry.requires_validation { + return Err(Error::UnsupportedRegexValidation { pattern }); + } + if entry + .min_byte_length + .is_some_and(|min| byte_len(&text) < min) + { + continue; + } + + let mut entity = PipelineEntity::detected( + found.start(), + found.end(), + entry.label.clone(), + text, + entry.score, + DetectionSource::Regex, + ); + entity.source_detail = entry.source_detail; + results.push(entity); + } + + Ok(results) +} + +pub fn process_deny_list_matches( + matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + data: &DenyListMatchData, +) -> Result> { + let offsets = ByteOffsets::new(full_text); + let mut matches = + collect_deny_list_matches(matches, slice, full_text, data, &offsets)?; + suppress_shorter_curated_contained_matches(&mut matches); + + let mut results = Vec::new(); + let mut name_hits = Vec::new(); + + for found in &matches { + for label in &found.custom_labels { + let mut entity = PipelineEntity::detected( + found.start, + found.end, + label.clone(), + found.text.clone(), + DENY_LIST_SCORE, + DetectionSource::DenyList, + ); + entity.source_detail = Some(SourceDetail::CustomDenyList); + results.push(entity); + } + } + + for found in &matches { + if found.labels.is_empty() { + continue; + } + if found.labels.iter().any(|label| label == PERSON_LABEL) + && !filter_contains( + data + .filters + .as_ref() + .map(|filters| &filters.person_stopwords), + &found.text.to_lowercase(), + ) + { + name_hits.push(found.clone()); + } + + let suppress_address = should_suppress_address(full_text, data, found)?; + for label in found.labels.iter().filter(|label| *label != PERSON_LABEL) { + if label == ADDRESS_LABEL && suppress_address { + continue; + } + results.push(PipelineEntity::detected( + found.start, + found.end, + label.clone(), + 
found.text.clone(), + DENY_LIST_SCORE, + DetectionSource::DenyList, + )); + } + } + + append_person_name_hits( + &mut results, + full_text, + &offsets, + data, + &mut name_hits, + )?; + extend_city_districts( + &mut results, + full_text, + &offsets, + data.filters.as_ref(), + )?; + + Ok(results) +} + +fn suppress_shorter_curated_contained_matches( + matches: &mut [RawDenyListMatch], +) { + let mut ranges = Vec::<(u32, u32)>::new(); + for found in matches.iter() { + if found.labels.is_empty() { + continue; + } + ranges.push((found.start, found.end)); + } + + ranges.sort_by(|left, right| { + left.0.cmp(&right.0).then_with(|| right.1.cmp(&left.1)) + }); + + let mut suppress = BTreeSet::<(u32, u32)>::new(); + let mut max_end = None::; + let mut max_end_start = None::; + for (start, end) in ranges { + if max_end.is_some_and(|container_end| { + container_end > end + || (container_end == end + && max_end_start + .is_some_and(|container_start| container_start < start)) + }) { + suppress.insert((start, end)); + } + if max_end.is_none_or(|current| end > current) { + max_end = Some(end); + max_end_start = Some(start); + } + } + + if suppress.is_empty() { + return; + } + + for found in matches.iter_mut() { + if found.labels.is_empty() { + continue; + } + if suppress.contains(&(found.start, found.end)) { + found.labels.clear(); + } + } +} + +fn collect_deny_list_matches( + matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + data: &DenyListMatchData, + offsets: &ByteOffsets<'_>, +) -> Result> { + let mut results = Vec::new(); + + for found in matches { + let Some(local_index) = slice.local_index(found.pattern()) else { + continue; + }; + let Some(labels) = data.labels.get(local_index) else { + continue; + }; + let Some(sources) = data.sources.get(local_index) else { + continue; + }; + validate_deny_list_sources(sources)?; + + let match_text = offsets.slice(found.start(), found.end())?; + let keyword = match_text.to_lowercase(); + let pattern = 
data.originals.get(local_index).map_or("", String::as_str); + let custom_pattern_labels = data + .custom_labels + .get(local_index) + .map(StringGroup::to_strings) + .unwrap_or_default(); + let custom_edges_are_valid = custom_match_has_valid_edges( + full_text, + offsets, + found.start(), + found.end(), + pattern, + )?; + let custom_labels = if custom_edges_are_valid { + custom_pattern_labels.clone() + } else { + Vec::new() + }; + + if labels.is_empty() && custom_labels.is_empty() { + continue; + } + + let curated_labels = if has_curated_source(sources) { + let filters = data.filters.as_ref().ok_or(Error::MissingStaticData { + field: "deny_list.filters", + })?; + curated_labels_for_match(&CuratedDenyListMatch { + full_text, + offsets, + start: found.start(), + match_text: &match_text, + keyword: &keyword, + pattern, + labels, + custom_pattern_labels: &custom_pattern_labels, + custom_edges_are_valid, + filters, + })? + } else { + Vec::new() + }; + + if curated_labels.is_empty() && custom_labels.is_empty() { + continue; + } + + results.push(RawDenyListMatch { + pattern: local_index, + start: found.start(), + end: found.end(), + labels: curated_labels, + custom_labels, + has_person_name_source: sources + .iter() + .any(|source| source == FIRST_NAME_SOURCE || source == SURNAME_SOURCE), + text: match_text, + }); + } + + results.sort_by(|left, right| { + left + .pattern + .cmp(&right.pattern) + .then_with(|| left.start.cmp(&right.start)) + .then_with(|| left.end.cmp(&right.end)) + }); + Ok(results) +} + +struct CuratedDenyListMatch<'a> { + full_text: &'a str, + offsets: &'a ByteOffsets<'a>, + start: u32, + match_text: &'a str, + keyword: &'a str, + pattern: &'a str, + labels: StringGroup<'a>, + custom_pattern_labels: &'a [String], + custom_edges_are_valid: bool, + filters: &'a DenyListFilterData, +} + +fn curated_labels_for_match( + args: &CuratedDenyListMatch<'_>, +) -> Result> { + let pattern_is_acronym = !args.pattern.is_empty() + && args.pattern.len() <= 5 + && 
all_upper(args.pattern); + let acronym_matches_acronym = + !pattern_is_acronym || all_upper(args.match_text); + let source_char = char_at(args.full_text, args.offsets, args.start)?; + let passes_filters = source_char.is_some_and(char::is_uppercase) + && !args.filters.stopwords.contains(args.keyword) + && !args.filters.allow_list.contains(args.keyword) + && acronym_matches_acronym + && !all_upper(args.match_text); + + if !passes_filters || !args.custom_edges_are_valid { + return Ok(Vec::new()); + } + + if is_dotted_acronym_suffix_collision( + args.full_text, + args.offsets, + args.start, + args.match_text, + )? { + return Ok(Vec::new()); + } + + Ok( + args + .labels + .iter() + .filter(|label| { + !args + .custom_pattern_labels + .iter() + .any(|custom| custom == label) + }) + .map(String::from) + .collect(), + ) +} + +fn should_suppress_address( + full_text: &str, + data: &DenyListMatchData, + found: &RawDenyListMatch, +) -> Result { + if !is_single_word(found.text.as_str()) { + return Ok(false); + } + let Some(filters) = &data.filters else { + return Ok(false); + }; + if is_signing_place_context(full_text, found.start, found.end, filters)? { + return Ok(true); + } + let lower = found.text.to_lowercase(); + if !filters.address_stopwords.contains(&lower) { + return Ok(false); + } + + Ok(!has_adjacent_address_evidence( + full_text, + found.start, + found.end, + filters, + )?) 
+} + +fn is_signing_place_context( + full_text: &str, + start: u32, + end: u32, + filters: &DenyListFilterData, +) -> Result { + if filters.signing_place_guards.is_empty() { + return Ok(false); + } + + let offsets = ByteOffsets::new(full_text); + let start_byte = offsets.validate_offset(start)?; + let end_byte = offsets.validate_offset(end)?; + let before = full_text.get(..start_byte).unwrap_or_default(); + let after = full_text.get(end_byte..).unwrap_or_default(); + + Ok(filters.signing_place_guards.iter().any(|guard| { + !guard.prefix_phrases.is_empty() + && !guard.suffix_phrases.is_empty() + && context_before_matches_any_phrase(before, &guard.prefix_phrases) + && context_after_matches_any_phrase(after, &guard.suffix_phrases) + })) +} + +fn context_before_matches_any_phrase( + before: &str, + phrases: &BTreeSet, +) -> bool { + phrases.iter().any(|phrase| { + phrase.is_empty() || context_before_matches_phrase(before, phrase) + }) +} + +fn context_after_matches_any_phrase( + after: &str, + phrases: &BTreeSet, +) -> bool { + phrases.iter().any(|phrase| { + phrase.is_empty() || context_after_matches_phrase(after, phrase) + }) +} + +fn context_before_matches_phrase(before: &str, phrase: &str) -> bool { + let trimmed = before.trim_end_matches(char::is_whitespace); + if trimmed.len() < phrase.len() { + return false; + } + let lower = trimmed.to_lowercase(); + if !lower.ends_with(phrase) { + return false; + } + let phrase_start = trimmed.len().saturating_sub(phrase.len()); + char_before_byte(trimmed, phrase_start).is_none_or(|ch| !ch.is_alphanumeric()) +} + +fn context_after_matches_phrase(after: &str, phrase: &str) -> bool { + let trimmed = after.trim_start_matches(char::is_whitespace); + let trimmed = trimmed.strip_prefix(',').map_or(trimmed, |value| { + value.trim_start_matches(char::is_whitespace) + }); + if trimmed.len() < phrase.len() { + return false; + } + let lower = trimmed.to_lowercase(); + if !lower.starts_with(phrase) { + return false; + } + 
char_after_byte(trimmed, phrase.len()).is_none_or(|ch| !ch.is_alphanumeric()) +} + +fn append_person_name_hits( + results: &mut Vec, + full_text: &str, + offsets: &ByteOffsets<'_>, + data: &DenyListMatchData, + name_hits: &mut [RawDenyListMatch], +) -> Result<()> { + name_hits.sort_by_key(|hit| hit.start); + let mut consumed = BTreeSet::::new(); + + for index in 0..name_hits.len() { + if consumed.contains(&index) { + continue; + } + let Some(hit) = name_hits.get(index) else { + continue; + }; + + let mut chain = vec![hit.clone()]; + let mut cursor = index.saturating_add(1); + + while cursor < name_hits.len() && chain.len() < 5 { + let Some(next) = name_hits.get(cursor) else { + break; + }; + let Some(prev) = chain.last() else { + break; + }; + if next.start < prev.end { + break; + } + let gap = offsets.slice(prev.end, next.start)?; + if person_chain_breaks(prev.text.as_str(), gap.as_str()) { + break; + } + + chain.push(next.clone()); + cursor = cursor.saturating_add(1); + } + + for consumed_index in index..index.saturating_add(chain.len()) { + consumed.insert(consumed_index); + } + + if !chain.iter().any(has_person_name_source) { + continue; + } + + let Some(first) = chain.first() else { + continue; + }; + let Some(last) = chain.last() else { + continue; + }; + let Some(filters) = &data.filters else { + continue; + }; + if is_suppressible_defined_term_quote( + full_text, + offsets, + first.start, + filters, + )? { + continue; + } + + let extended = + extend_person_name(full_text, offsets, first.start, last.end, filters)?; + let score = if chain.len() >= 2 { 0.9 } else { 0.5 }; + + if chain.len() == 1 + && !single_name_hit_has_context(full_text, offsets, last.end, filters)? 
+ { + continue; + } + + results.push(PipelineEntity::detected( + first.start, + extended.end, + PERSON_LABEL, + extended.text, + score, + DetectionSource::DenyList, + )); + } + + Ok(()) +} + +pub fn process_gazetteer_matches( + matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + data: &GazetteerMatchData, +) -> Result> { + let offsets = ByteOffsets::new(full_text); + let mut results = Vec::new(); + let mut exact_spans = Vec::<(u32, u32)>::new(); + + for found in matches { + let Some(local_index) = slice.local_index(found.pattern()) else { + continue; + }; + if data.is_fuzzy.get(local_index).copied().unwrap_or(false) { + continue; + } + + let Some(label) = data.labels.get(local_index) else { + continue; + }; + let extended = try_gazetteer_prefix_extension(&offsets, found)?; + let (end, text, source_detail) = if let Some(extension) = extended { + extension + } else { + ( + found.end(), + offsets.slice(found.start(), found.end())?, + None, + ) + }; + + exact_spans.push((found.start(), end)); + let mut entity = PipelineEntity::detected( + found.start(), + end, + label.clone(), + text, + GAZETTEER_EXACT_SCORE, + DetectionSource::Gazetteer, + ); + entity.source_detail = source_detail; + results.push(entity); + } + + for found in matches { + let Some(local_index) = slice.local_index(found.pattern()) else { + continue; + }; + if !data.is_fuzzy.get(local_index).copied().unwrap_or(false) { + continue; + } + if fuzzy_distance(found) == Some(0) { + continue; + } + + let Some(label) = data.labels.get(local_index) else { + continue; + }; + if exact_spans + .iter() + .any(|(start, end)| found.start() < *end && found.end() > *start) + { + continue; + } + + results.push(PipelineEntity::detected( + found.start(), + found.end(), + label.clone(), + offsets.slice(found.start(), found.end())?, + GAZETTEER_FUZZY_SCORE, + DetectionSource::Gazetteer, + )); + } + + Ok(results) +} + +pub fn process_country_matches( + matches: &[SearchMatch], + slice: PatternSlice, + 
full_text: &str, + data: &CountryMatchData, +) -> Result> { + let offsets = ByteOffsets::new(full_text); + let mut results = Vec::new(); + + for found in matches { + let Some(local_index) = slice.local_index(found.pattern()) else { + continue; + }; + let Some(label) = data.labels.get(local_index) else { + continue; + }; + if !starts_as_proper_noun(full_text, &offsets, found.start())? { + continue; + } + + results.push(PipelineEntity::detected( + found.start(), + found.end(), + label.clone(), + offsets.slice(found.start(), found.end())?, + COUNTRY_SCORE, + DetectionSource::Country, + )); + } + + Ok(results) +} + +pub(crate) fn ensure_supported_deny_list_sources( + data: &DenyListMatchData, +) -> Result<()> { + let mut needs_filters = false; + for sources in data.sources.iter() { + validate_deny_list_sources(sources)?; + needs_filters |= has_curated_source(sources); + } + + if needs_filters && data.filters.is_none() { + return Err(Error::MissingStaticData { + field: "deny_list.filters", + }); + } + + Ok(()) +} + +fn validate_deny_list_sources(sources: StringGroup<'_>) -> Result<()> { + if sources.is_empty() { + return Err(Error::UnsupportedDenyListSource { + source: String::from(""), + }); + } + + for source in sources.iter() { + match source { + DENY_LIST_SOURCE + | CITY_SOURCE + | CUSTOM_DENY_LIST_SOURCE + | FIRST_NAME_SOURCE + | SURNAME_SOURCE + | TITLE_SOURCE => {} + _ => { + return Err(Error::UnsupportedDenyListSource { + source: String::from(source), + }); + } + } + } + + Ok(()) +} + +fn has_curated_source(sources: StringGroup<'_>) -> bool { + sources + .iter() + .any(|source| source != CUSTOM_DENY_LIST_SOURCE) +} + +const fn has_person_name_source(found: &RawDenyListMatch) -> bool { + found.has_person_name_source +} + +fn filter_contains(set: Option<&BTreeSet>, value: &str) -> bool { + set.is_some_and(|set| set.contains(value)) +} + +fn char_at( + full_text: &str, + offsets: &ByteOffsets<'_>, + offset: u32, +) -> Result> { + let byte = 
offsets.validate_offset(offset)?; + Ok(full_text.get(byte..).and_then(|tail| tail.chars().next())) +} + +fn char_before_byte(full_text: &str, byte: usize) -> Option { + full_text + .get(..byte) + .and_then(|prefix| prefix.chars().next_back()) +} + +fn char_after_byte(full_text: &str, byte: usize) -> Option { + full_text + .get(byte..) + .and_then(|suffix| suffix.chars().next()) +} + +fn byte_len(text: &str) -> u32 { + u32::try_from(text.len()).unwrap_or(u32::MAX) +} + +fn all_upper(text: &str) -> bool { + let mut saw_letter = false; + for ch in text.chars() { + if !ch.is_alphabetic() || !ch.is_uppercase() { + return false; + } + saw_letter = true; + } + saw_letter +} + +fn is_single_word(text: &str) -> bool { + let mut saw_letter = false; + for ch in text.chars() { + if !ch.is_alphabetic() { + return false; + } + saw_letter = true; + } + saw_letter +} + +fn is_dotted_acronym(text: &str) -> bool { + if text.chars().count() < 3 { + return false; + } + + let mut segments = 0_u8; + let mut chars = text.chars().peekable(); + while let Some(ch) = chars.next() { + if !ch.is_alphabetic() { + return false; + } + segments = segments.saturating_add(1); + if segments > 4 { + return false; + } + match chars.peek().copied() { + Some('.') => { + let _ = chars.next(); + if chars.peek().is_none() { + break; + } + } + None => break, + Some(_) => return false, + } + } + + segments > 0 +} + +fn is_dotted_acronym_suffix_collision( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, + match_text: &str, +) -> Result { + if !is_dotted_acronym(match_text) { + return Ok(false); + } + + let start_byte = offsets.validate_offset(start)?; + let prefix = full_text + .get(..start_byte) + .unwrap_or_default() + .chars() + .rev() + .take(2) + .collect::>(); + + Ok(matches!( + (prefix.first().copied(), prefix.get(1).copied()), + (Some('.'), Some(ch)) if ch.is_alphabetic() + )) +} + +fn has_adjacent_address_evidence( + full_text: &str, + start: u32, + end: u32, + filters: 
&DenyListFilterData, +) -> Result { + let offsets = ByteOffsets::new(full_text); + let full_len = offsets.len()?; + let window_start = offsets.floor_offset(start.saturating_sub(40))?; + let window_end = + offsets.floor_offset(end.saturating_add(40).min(full_len))?; + let window = offsets.slice(window_start, window_end)?; + + Ok(has_address_format(&window) || has_street_type(&window, filters)) +} + +fn has_address_format(text: &str) -> bool { + has_state_after_comma(text) + || has_us_zip(text) + || has_cz_sk_postal_code(text) + || has_pl_postal_code(text) +} + +fn has_state_after_comma(text: &str) -> bool { + let chars = text.chars().collect::>(); + for index in 0..chars.len() { + if chars.get(index) != Some(&',') { + continue; + } + let mut cursor = index.saturating_add(1); + while chars.get(cursor).is_some_and(|ch| ch.is_whitespace()) { + cursor = cursor.saturating_add(1); + } + let first = chars.get(cursor).copied(); + let second = chars.get(cursor.saturating_add(1)).copied(); + let after = chars.get(cursor.saturating_add(2)).copied(); + if first.is_some_and(char::is_uppercase) + && second.is_some_and(char::is_uppercase) + && !after.is_some_and(char::is_alphanumeric) + { + return true; + } + } + false +} + +fn has_us_zip(text: &str) -> bool { + let chars = text.chars().collect::>(); + for index in 0..chars.len() { + if !five_digits_at(&chars, index) { + continue; + } + let after_five = index.saturating_add(5); + let has_zip4 = chars.get(after_five) == Some(&'-') + && four_digits_at(&chars, after_five.saturating_add(1)); + let end = if has_zip4 { + after_five.saturating_add(5) + } else { + after_five + }; + if !chars + .get(index.wrapping_sub(1)) + .is_some_and(char::is_ascii_digit) + && !chars.get(end).is_some_and(char::is_ascii_digit) + { + return true; + } + } + false +} + +fn has_cz_sk_postal_code(text: &str) -> bool { + let chars = text.chars().collect::>(); + for index in 0..chars.len() { + if three_digits_at(&chars, index) + && 
chars.get(index.saturating_add(3)) == Some(&' ') + && two_digits_at(&chars, index.saturating_add(4)) + { + return true; + } + } + false +} + +fn has_pl_postal_code(text: &str) -> bool { + let chars = text.chars().collect::>(); + for index in 0..chars.len() { + if two_digits_at(&chars, index) + && chars.get(index.saturating_add(2)) == Some(&'-') + && three_digits_at(&chars, index.saturating_add(3)) + { + return true; + } + } + false +} + +fn digits_at(chars: &[char], start: usize, len: usize) -> bool { + start.checked_add(len).is_some_and(|end| end <= chars.len()) + && chars + .get(start..start.saturating_add(len)) + .is_some_and(|slice| slice.iter().all(char::is_ascii_digit)) +} + +fn two_digits_at(chars: &[char], start: usize) -> bool { + digits_at(chars, start, 2) +} + +fn three_digits_at(chars: &[char], start: usize) -> bool { + digits_at(chars, start, 3) +} + +fn four_digits_at(chars: &[char], start: usize) -> bool { + digits_at(chars, start, 4) +} + +fn five_digits_at(chars: &[char], start: usize) -> bool { + digits_at(chars, start, 5) +} + +fn has_street_type(window: &str, filters: &DenyListFilterData) -> bool { + let lower_window = window.to_lowercase(); + for street_type in &filters.street_types { + if street_type.is_empty() { + continue; + } + let lower_type = street_type.to_lowercase(); + if street_type_matches(lower_window.as_str(), lower_type.as_str()) { + return true; + } + } + false +} + +fn street_type_matches(window: &str, street_type: &str) -> bool { + for (byte, _) in window.match_indices(street_type) { + let before = char_before_byte(window, byte); + if before.is_some_and(char::is_alphanumeric) { + continue; + } + let end = byte.saturating_add(street_type.len()); + let Some(last) = street_type.chars().next_back() else { + continue; + }; + if last.is_alphanumeric() + && char_after_byte(window, end).is_some_and(char::is_alphanumeric) + { + continue; + } + return true; + } + false +} + +fn person_chain_breaks(previous_text: &str, gap: &str) -> bool 
{ + byte_len(gap) > 4 + || gap.is_empty() + || gap.contains('\n') + || gap.contains('\t') + || gap + .chars() + .any(|ch| matches!(ch, '!' | '?' | ';' | ':' | ',')) + || (gap.contains('.') && !is_initial_continuation_gap(previous_text, gap)) +} + +fn is_initial_continuation_gap(text: &str, gap: &str) -> bool { + let mut chars = text.chars(); + let text_is_single_upper = + chars.next().is_some_and(char::is_uppercase) && chars.next().is_none(); + if text_is_single_upper && dot_space_gap(gap) { + return true; + } + + let mut remaining = gap; + let Some(after_space) = consume_horizontal_space(remaining, 1, 2) else { + return false; + }; + remaining = after_space; + let mut consumed_initial = false; + + loop { + let Some(ch) = remaining.chars().next() else { + return consumed_initial; + }; + if !ch.is_uppercase() { + return false; + } + let Some(after_initial) = remaining.strip_prefix(ch) else { + return false; + }; + let Some(after_dot) = after_initial.strip_prefix('.') else { + return false; + }; + let Some(after_initial_gap) = consume_horizontal_space(after_dot, 1, 2) + else { + return false; + }; + remaining = after_initial_gap; + consumed_initial = true; + } +} + +fn dot_space_gap(gap: &str) -> bool { + let Some(rest) = gap.strip_prefix('.') else { + return false; + }; + consume_horizontal_space(rest, 1, 2).is_some_and(str::is_empty) +} + +fn consume_horizontal_space( + text: &str, + min: usize, + max: usize, +) -> Option<&str> { + let mut consumed = 0_usize; + let mut byte = 0_usize; + for ch in text.chars() { + if ch == '\n' || !ch.is_whitespace() || consumed == max { + break; + } + consumed = consumed.saturating_add(1); + byte = byte.saturating_add(ch.len_utf8()); + } + (consumed >= min).then(|| text.get(byte..)).flatten() +} + +fn single_name_hit_has_context( + full_text: &str, + offsets: &ByteOffsets<'_>, + end: u32, + filters: &DenyListFilterData, +) -> Result { + let tail = slice_from(full_text, offsets, end)?; + let rest = tail.trim_start(); + let mut chars 
= rest.chars(); + let next_is_upper = chars.next().is_some_and(char::is_uppercase) + && chars.next().is_some_and(char::is_lowercase); + if !next_is_upper { + return Ok(false); + } + + let next_word = rest + .chars() + .take_while(|ch| ch.is_alphabetic()) + .collect::(); + Ok( + !filters + .sentence_starters + .contains(&next_word.to_lowercase()), + ) +} + +fn slice_from<'a>( + full_text: &'a str, + offsets: &ByteOffsets<'_>, + start: u32, +) -> Result<&'a str> { + let byte = offsets.validate_offset(start)?; + full_text + .get(byte..) + .ok_or(Error::ByteOffsetOutOfBounds { offset: start }) +} + +struct ExtendedName { + end: u32, + text: String, +} + +fn extend_person_name( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, + end: u32, + filters: &DenyListFilterData, +) -> Result { + let mut new_end = end; + + loop { + if char_at(full_text, offsets, new_end)? != Some(' ') { + break; + } + let word_start = new_end.saturating_add(1); + let Some(first) = char_at(full_text, offsets, word_start)? 
else { + break; + }; + if !first.is_uppercase() { + break; + } + + let word = read_until_whitespace(full_text, offsets, word_start)?; + let stripped = strip_trailing_name_punctuation(&word); + if stripped.chars().count() < 2 { + break; + } + let lower = stripped.to_lowercase(); + if filters.stopwords.contains(&lower) + || filters.person_stopwords.contains(&lower) + { + break; + } + + new_end = word_start.saturating_add(byte_len(stripped)); + } + + Ok(ExtendedName { + end: new_end, + text: offsets.slice(start, new_end)?, + }) +} + +fn read_until_whitespace( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, +) -> Result { + let tail = slice_from(full_text, offsets, start)?; + Ok(tail.chars().take_while(|ch| !ch.is_whitespace()).collect()) +} + +fn strip_trailing_name_punctuation(word: &str) -> &str { + word.trim_end_matches([',', ';', '.', '”', '"', '’', '\'', '“', '»']) +} + +struct DefinedTermQuote { + content: String, + after_closing_quote: String, +} + +fn is_suppressible_defined_term_quote( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, + filters: &DenyListFilterData, +) -> Result { + let Some(quote) = + find_defined_term_quote_content(full_text, offsets, start, filters)? 
+ else { + return Ok(false); + }; + let words = quote + .content + .split(|ch: char| !ch.is_alphabetic()) + .filter(|word| !word.is_empty()) + .collect::>(); + + if words.len() >= 2 + && starts_with_known_first_name("e.content, filters) + && has_person_role_definition("e.after_closing_quote, filters) + { + return Ok(false); + } + + Ok(words.len() >= 2) +} + +fn find_defined_term_quote_content( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, + filters: &DenyListFilterData, +) -> Result> { + let start_byte = offsets.validate_offset(start)?; + let Some(quote_start) = find_opening_quote(full_text, start_byte) else { + return Ok(None); + }; + let Some((quote_end, quote_char)) = + find_closing_quote(full_text, quote_start, start_byte) + else { + return Ok(None); + }; + let after_start = quote_end.saturating_add(quote_char.len_utf8()); + let after = full_text.get(after_start..).unwrap_or_default(); + let after_window = take_bytes(after, 120); + if strip_defined_term_cue(&after_window, filters).is_none() { + return Ok(None); + } + + let quote_width = full_text + .get(quote_start..) 
+ .and_then(|tail| tail.chars().next()) + .map_or(0, char::len_utf8); + let content_start = quote_start.saturating_add(quote_width); + + Ok(Some(DefinedTermQuote { + content: full_text + .get(content_start..quote_end) + .unwrap_or_default() + .to_owned(), + after_closing_quote: after_window, + })) +} + +fn find_opening_quote(full_text: &str, start_byte: usize) -> Option { + let prefix = full_text.get(..start_byte)?; + let mut distance = 0_u32; + for (byte, ch) in prefix.char_indices().rev() { + distance = distance.saturating_add(byte_len(ch.encode_utf8(&mut [0; 4]))); + if distance > 80 || ch == '\n' { + break; + } + if opening_quotes().contains(&ch) && is_quote_boundary(full_text, byte, ch) + { + return Some(byte); + } + if closing_quotes().contains(&ch) && is_quote_boundary(full_text, byte, ch) + { + break; + } + } + None +} + +fn find_closing_quote( + full_text: &str, + quote_start: usize, + start_byte: usize, +) -> Option<(usize, char)> { + let tail = full_text.get(start_byte..)?; + let mut distance = byte_len(full_text.get(quote_start..start_byte)?); + for (relative, ch) in tail.char_indices() { + if distance > 120 { + break; + } + let byte = start_byte.saturating_add(relative); + if closing_quotes().contains(&ch) && is_quote_boundary(full_text, byte, ch) + { + return Some((byte, ch)); + } + distance = distance.saturating_add(byte_len(ch.encode_utf8(&mut [0; 4]))); + } + None +} + +fn is_quote_boundary(full_text: &str, byte: usize, ch: char) -> bool { + if ch != '\'' && ch != '’' { + return true; + } + let after_byte = byte.saturating_add(ch.len_utf8()); + let before = char_before_byte(full_text, byte); + let after = char_after_byte(full_text, after_byte); + !(before.is_some_and(char::is_alphabetic) + && after.is_some_and(char::is_alphabetic)) +} + +fn opening_quotes() -> &'static BTreeSet { + static QUOTES: std::sync::LazyLock> = + std::sync::LazyLock::new(|| { + BTreeSet::from(['"', '\'', '“', '„', '‟', '‘', '‛', '«']) + }); + "ES +} + +fn closing_quotes() 
-> &'static BTreeSet { + static QUOTES: std::sync::LazyLock> = + std::sync::LazyLock::new(|| { + BTreeSet::from(['"', '\'', '”', '’', '»', '“']) + }); + "ES +} + +fn take_bytes(text: &str, max: u32) -> String { + let mut taken = String::new(); + let mut len = 0_u32; + for ch in text.chars() { + let width = byte_len(ch.encode_utf8(&mut [0; 4])); + if len.saturating_add(width) > max { + break; + } + taken.push(ch); + len = len.saturating_add(width); + } + taken +} + +fn strip_defined_term_cue<'a>( + after: &'a str, + filters: &DenyListFilterData, +) -> Option<&'a str> { + let trimmed = + after.trim_start_matches(|ch: char| ch.is_whitespace() || ch == ','); + let lower = trimmed.to_lowercase(); + for cue in &filters.defined_term_cues { + if lower.starts_with(cue) && word_boundary_after(lower.as_str(), cue.len()) + { + return trimmed.get(cue.len()..); + } + } + None +} + +fn word_boundary_after(text: &str, byte: usize) -> bool { + text + .get(byte..) + .and_then(|tail| tail.chars().next()) + .is_none_or(|ch| !ch.is_alphabetic()) +} + +fn starts_with_known_first_name( + quote_content: &str, + filters: &DenyListFilterData, +) -> bool { + let first_word = quote_content + .trim() + .chars() + .take_while(|ch| ch.is_alphabetic()) + .collect::(); + !first_word.is_empty() + && filters.first_names.contains(&first_word.to_lowercase()) +} + +fn has_person_role_definition( + after_closing_quote: &str, + filters: &DenyListFilterData, +) -> bool { + let Some(after_cue) = strip_defined_term_cue(after_closing_quote, filters) + else { + return false; + }; + after_cue + .split(|ch: char| !ch.is_alphabetic()) + .filter(|word| !word.is_empty()) + .take(8) + .any(|word| filters.generic_roles.contains(&word.to_lowercase())) +} + +fn extend_city_districts( + entities: &mut [PipelineEntity], + full_text: &str, + offsets: &ByteOffsets<'_>, + filters: Option<&DenyListFilterData>, +) -> Result<()> { + for entity in entities { + if entity.label != ADDRESS_LABEL + || entity.source_detail == 
Some(SourceDetail::CustomDenyList) + { + continue; + } + + if let Some(suffix) = + match_district_suffix(slice_from(full_text, offsets, entity.end)?) + { + entity.end = entity.end.saturating_add(byte_len(suffix)); + entity.text = offsets.slice(entity.start, entity.end)?; + } + + if let Some(suffix) = + match_dash_district(slice_from(full_text, offsets, entity.end)?) + { + entity.end = entity.end.saturating_add(byte_len(suffix)); + entity.text = offsets.slice(entity.start, entity.end)?; + } + + let before = offsets.slice( + offsets.floor_offset(entity.start.saturating_sub(10))?, + entity.start, + )?; + if let Some(prefix) = postal_prefix(&before) { + entity.start = entity.start.saturating_sub(byte_len(prefix)); + entity.text = offsets.slice(entity.start, entity.end)?; + } + + if let Some(filters) = filters + && let Some(suffix) = match_trailing_address_word( + slice_from(full_text, offsets, entity.end)?, + filters, + ) + { + entity.end = entity.end.saturating_add(byte_len(suffix)); + entity.text = offsets.slice(entity.start, entity.end)?; + } + } + + Ok(()) +} + +fn match_district_suffix(after: &str) -> Option<&str> { + let rest = after.strip_prefix(' ')?; + let suffix = numeric_district(rest).or_else(|| roman_district(rest))?; + let end = ' '.len_utf8().saturating_add(suffix.len()); + let next = after.get(end..).and_then(|tail| tail.chars().next()); + next + .is_none_or(is_district_boundary) + .then(|| after.get(..end)) + .flatten() +} + +fn numeric_district(text: &str) -> Option<&str> { + let digits = text + .chars() + .take_while(char::is_ascii_digit) + .collect::(); + if digits.is_empty() || digits.len() > 2 { + return None; + } + text.get(..digits.len()) +} + +fn roman_district(text: &str) -> Option<&str> { + roman_districts() + .iter() + .find_map(|roman| text.starts_with(roman).then_some(*roman)) +} + +const fn roman_districts() -> &'static [&'static str] { + &[ + "XXX", "XXIX", "XXVIII", "XXVII", "XXVI", "XXV", "XXIV", "XXIII", "XXII", + "XXI", "XX", "XIX", 
"XVIII", "XVII", "XVI", "XV", "XIV", "XIII", "XII", + "XI", "X", "IX", "VIII", "VII", "VI", "IV", "III", "II", + ] +} + +const fn is_district_boundary(ch: char) -> bool { + ch.is_whitespace() || matches!(ch, ',' | ';' | '.' | ')' | '"') +} + +fn match_dash_district(after: &str) -> Option<&str> { + let (space_len, after_space) = consume_spaces_or_tabs(after, 1, 4)?; + let dash = after_space.chars().next()?; + if dash != '-' && dash != '–' { + return None; + } + let after_dash = after_space.get(dash.len_utf8()..)?; + let (post_dash_spaces, word_start) = + consume_spaces_or_tabs(after_dash, 0, usize::MAX) + .unwrap_or((0, after_dash)); + let mut chars = word_start.chars(); + let first = chars.next()?; + let second = chars.next()?; + if !first.is_uppercase() || !second.is_lowercase() { + return None; + } + let word_len = first + .len_utf8() + .saturating_add(second.len_utf8()) + .saturating_add( + chars + .take_while(|ch| ch.is_lowercase()) + .map(char::len_utf8) + .sum::(), + ); + let total = space_len + .saturating_add(dash.len_utf8()) + .saturating_add(post_dash_spaces) + .saturating_add(word_len); + after.get(..total) +} + +fn consume_spaces_or_tabs( + text: &str, + min: usize, + max: usize, +) -> Option<(usize, &str)> { + let mut consumed = 0_usize; + let mut byte = 0_usize; + for ch in text.chars() { + if (ch != ' ' && ch != '\t') || consumed == max { + break; + } + consumed = consumed.saturating_add(1); + byte = byte.saturating_add(ch.len_utf8()); + } + (consumed >= min) + .then(|| text.get(byte..).map(|rest| (byte, rest))) + .flatten() +} + +fn postal_prefix(before: &str) -> Option<&str> { + let trimmed_end = before.trim_end(); + let suffix_ws = before.len().saturating_sub(trimmed_end.len()); + let before_dash = + trimmed_end.trim_end_matches(|ch: char| ch.is_whitespace() || is_dash(ch)); + let dash_ws = trimmed_end.len().saturating_sub(before_dash.len()); + + if let Some(code) = trailing_postal_code(before_dash) { + let start = 
before_dash.len().saturating_sub(code.len()); + let end = before + .len() + .saturating_sub(suffix_ws) + .saturating_add(dash_ws); + return before.get(start..end); + } + None +} + +fn trailing_postal_code(text: &str) -> Option<&str> { + let chars = text.chars().collect::>(); + if chars.len() >= 5 { + let start = chars.len().saturating_sub(5); + if five_digits_at(&chars, start) { + return text.get(byte_index_for_char(text, start)..); + } + } + if chars.len() >= 6 { + let start = chars.len().saturating_sub(6); + if three_digits_at(&chars, start) + && chars.get(start.saturating_add(3)) == Some(&' ') + && two_digits_at(&chars, start.saturating_add(4)) + { + return text.get(byte_index_for_char(text, start)..); + } + } + None +} + +fn byte_index_for_char(text: &str, char_index: usize) -> usize { + text + .char_indices() + .nth(char_index) + .map_or(text.len(), |(byte, _)| byte) +} + +const fn is_dash(ch: char) -> bool { + matches!(ch, '-' | '–' | '—') +} + +fn match_trailing_address_word<'a>( + after: &'a str, + filters: &DenyListFilterData, +) -> Option<&'a str> { + let (space_len, word_start) = consume_whitespace_no_newline(after, 1, 4)?; + let mut chars = word_start.chars(); + let first = chars.next()?; + let second = chars.next()?; + if !first.is_uppercase() || !second.is_lowercase() { + return None; + } + let rest_len = chars + .take_while(|ch| ch.is_lowercase()) + .map(char::len_utf8) + .sum::(); + let word_len = first + .len_utf8() + .saturating_add(second.len_utf8()) + .saturating_add(rest_len); + let word = word_start.get(..word_len)?; + if filters + .trailing_address_word_exclusions + .contains(&word.to_lowercase()) + { + return None; + } + after.get(..space_len.saturating_add(word_len)) +} + +fn consume_whitespace_no_newline( + text: &str, + min: usize, + max: usize, +) -> Option<(usize, &str)> { + let mut consumed = 0_usize; + let mut byte = 0_usize; + for ch in text.chars() { + if ch == '\n' || !ch.is_whitespace() || consumed == max { + break; + } + consumed 
= consumed.saturating_add(1); + byte = byte.saturating_add(ch.len_utf8()); + } + (consumed >= min) + .then(|| text.get(byte..).map(|rest| (byte, rest))) + .flatten() +} + +fn try_gazetteer_prefix_extension( + offsets: &ByteOffsets<'_>, + found: &SearchMatch, +) -> Result)>> { + let max_end = offsets + .offset_after_utf16_units(found.end(), MAX_GAZETTEER_PREFIX_OVERSHOOT)?; + if max_end <= found.end().saturating_add(1) { + return Ok(None); + } + + let after = offsets.slice(found.end(), max_end)?; + if !after.starts_with(' ') { + return Ok(None); + } + + let suffix_end = next_space_offset_after_initial(&after); + if suffix_end <= 1 { + return Ok(None); + } + + let new_end = found.end().saturating_add(suffix_end); + Ok(Some(( + new_end, + offsets.slice(found.start(), new_end)?, + Some(SourceDetail::GazetteerExtension), + ))) +} + +fn next_space_offset_after_initial(text: &str) -> u32 { + let mut offset = 0_u32; + + for ch in text.chars() { + let width = u32::try_from(ch.len_utf8()).unwrap_or(u32::MAX); + if offset > 0 && ch == ' ' { + return offset; + } + offset = offset.saturating_add(width); + } + + offset +} + +fn starts_as_proper_noun( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, +) -> Result { + let start_byte = offsets.validate_offset(start)?; + let Some(ch) = full_text + .get(start_byte..) 
+ .and_then(|tail| tail.chars().next()) + else { + return Ok(false); + }; + + let upper = ch.to_uppercase().to_string(); + let lower = ch.to_lowercase().to_string(); + if upper == lower { + return Ok(true); + } + + Ok(ch.to_string() == upper) +} + +fn custom_match_has_valid_edges( + full_text: &str, + offsets: &ByteOffsets<'_>, + start: u32, + end: u32, + pattern: &str, +) -> Result { + if !pattern.chars().any(char::is_alphanumeric) { + return Ok(true); + } + + let start_byte = offsets.validate_offset(start)?; + let end_byte = offsets.validate_offset(end)?; + let previous = full_text + .get(..start_byte) + .and_then(|prefix| prefix.chars().next_back()); + if previous.is_some_and(char::is_alphanumeric) { + return Ok(false); + } + + let next = full_text + .get(end_byte..) + .and_then(|suffix| suffix.chars().next()); + if next.is_some_and(char::is_alphanumeric) { + return Ok(false); + } + + Ok(true) +} + +const fn fuzzy_distance(found: &SearchMatch) -> Option { + let SearchMatch::Fuzzy { distance, .. 
} = found else { + return None; + }; + Some(*distance) +} diff --git a/crates/anonymize-core/src/redact.rs b/crates/anonymize-core/src/redact.rs new file mode 100644 index 00000000..f30fdfee --- /dev/null +++ b/crates/anonymize-core/src/redact.rs @@ -0,0 +1,185 @@ +use crate::byte_offsets::ByteOffsets; +use crate::normalize::placeholder_fallback; +use crate::placeholders::build_placeholder_map; +use crate::types::{ + Entity, EntityKind, OperatorConfig, OperatorEntry, OperatorType, + RedactionEntry, RedactionResult, Result, +}; + +pub fn redact_text( + full_text: &str, + entities: &[Entity], + config: &OperatorConfig, +) -> Result { + if entities.is_empty() { + return Ok(RedactionResult { + redacted_text: full_text.to_owned(), + redaction_map: Vec::new(), + operator_map: Vec::new(), + entity_count: 0, + }); + } + + let offsets = ByteOffsets::new(full_text); + validate_spans(entities, &offsets)?; + + let placeholder_map = build_placeholder_map(entities, full_text); + let mut sorted = redaction_spans(entities, &offsets)?; + sorted.sort_by_key(|span| span.entity.start); + + // Existing contract: first accepted span wins overlaps. 
+ let mut non_overlapping = Vec::::new(); + let mut last_end = 0; + for span in sorted { + if span.entity.start >= last_end { + last_end = span.entity.end; + non_overlapping.push(span); + } + } + + let mut parts = Vec::::new(); + let mut redaction_map = Vec::::new(); + let mut operator_map = Vec::::new(); + let mut cursor = 0; + + for span in &non_overlapping { + let entity = &span.entity; + if entity.start > cursor { + parts.push(offsets.slice(cursor, entity.start)?); + } + + let placeholder = placeholder_map + .get_entity(entity) + .map_or_else(|| placeholder_fallback(&entity.label), ToOwned::to_owned); + let operator = operator_for(config, &entity.label); + let replacement = match operator { + OperatorType::Replace => placeholder.clone(), + OperatorType::Redact => config.redact_string.clone(), + }; + + parts.push(replacement); + set_operator_entry(&mut operator_map, &placeholder, operator); + + if operator == OperatorType::Replace + && redaction_value(&redaction_map, &placeholder).is_none() + { + redaction_map.push(RedactionEntry { + placeholder: placeholder.clone(), + original: redaction_original_text(span), + }); + } + + cursor = entity.end; + } + + let full_text_len = offsets.len()?; + if cursor < full_text_len { + parts.push(offsets.slice(cursor, full_text_len)?); + } + + Ok(RedactionResult { + redacted_text: parts.concat(), + redaction_map, + operator_map, + entity_count: non_overlapping.len(), + }) +} + +#[must_use] +pub fn deanonymise( + redacted_text: &str, + redaction_map: &[RedactionEntry], +) -> String { + let mut result = redacted_text.to_owned(); + + for entry in redaction_map { + result = result.replace(&entry.placeholder, &entry.original); + } + + result +} + +fn validate_spans( + entities: &[Entity], + offsets: &ByteOffsets<'_>, +) -> Result<()> { + for entity in entities { + // Empty spans would insert without redacting. 
+ if entity.start >= entity.end { + return Err(crate::types::Error::InvalidSpan { + start: entity.start, + end: entity.end, + }); + } + + offsets.validate_offset(entity.start)?; + offsets.validate_offset(entity.end)?; + } + + Ok(()) +} + +struct RedactionSpan { + entity: Entity, + source_text: String, +} + +fn redaction_spans( + entities: &[Entity], + offsets: &ByteOffsets<'_>, +) -> Result> { + let mut resolved = Vec::with_capacity(entities.len()); + + for entity in entities { + resolved.push(RedactionSpan { + entity: entity.clone(), + source_text: offsets.slice(entity.start, entity.end)?, + }); + } + + Ok(resolved) +} + +fn operator_for(config: &OperatorConfig, label: &str) -> OperatorType { + config + .operators + .get(label) + .copied() + .unwrap_or(OperatorType::Replace) +} + +fn set_operator_entry( + operator_map: &mut Vec, + placeholder: &str, + operator: OperatorType, +) { + if let Some(entry) = operator_map + .iter_mut() + .find(|entry| entry.placeholder == placeholder) + { + entry.operator = operator; + return; + } + + operator_map.push(OperatorEntry { + placeholder: placeholder.to_owned(), + operator, + }); +} + +fn redaction_value<'a>( + redaction_map: &'a [RedactionEntry], + placeholder: &str, +) -> Option<&'a str> { + redaction_map + .iter() + .find(|entry| entry.placeholder == placeholder) + .map(|entry| entry.original.as_str()) +} + +fn redaction_original_text(span: &RedactionSpan) -> String { + match &span.entity.kind { + EntityKind::Detected => span.source_text.clone(), + EntityKind::Coreference { source_text } => source_text.clone(), + } +} diff --git a/crates/anonymize-core/src/resolution/boundary.rs b/crates/anonymize-core/src/resolution/boundary.rs new file mode 100644 index 00000000..0bff96a6 --- /dev/null +++ b/crates/anonymize-core/src/resolution/boundary.rs @@ -0,0 +1,429 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::byte_offsets::ByteOffsets; +use crate::types::Result; + +use super::common::{byte_len, contains_span, 
entity_len, is_caller_owned}; +use super::{DetectionSource, PipelineEntity}; + +pub fn enforce_boundary_consistency( + entities: &[PipelineEntity], + full_text: &str, +) -> Result> { + let offsets = ByteOffsets::new(full_text); + let spans = char_spans(full_text); + let boundaries = word_boundaries(&spans); + let fixed = fix_partial_words(entities, &offsets, &spans, &boundaries)?; + let resolved = resolve_cross_label_overlaps(&fixed, &offsets)?; + let deduped = deduplicate_spans(&resolved); + let merged = merge_adjacent(&deduped, &offsets)?; + Ok(remove_nested_same_label(&merged)) +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CharSpan { + start: u32, + end: u32, + ch: char, +} + +fn fix_partial_words( + entities: &[PipelineEntity], + offsets: &ByteOffsets<'_>, + spans: &[CharSpan], + boundaries: &BTreeSet, +) -> Result> { + let mut sorted = entities.to_vec(); + sorted.sort_by_key(|entity| entity.start); + let mut fixed = Vec::with_capacity(sorted.len()); + + for (index, entity) in sorted.iter().enumerate() { + if has_locked_boundary(entity) || has_detector_locked_boundary(entity) { + fixed.push(entity.clone()); + continue; + } + + if entity.text != offsets.slice(entity.start, entity.end)? 
{ + fixed.push(entity.clone()); + continue; + } + + let mut new_start = word_start_at(entity.start, boundaries, spans); + let mut new_end = word_end_at(entity.end, boundaries, spans); + + for (other_index, other) in sorted.iter().enumerate() { + if other_index == index || other.label == entity.label { + continue; + } + if other.end > new_start && other.end <= entity.start { + new_start = new_start.max(other.end); + } + if other.start >= entity.end && other.start < new_end { + new_end = new_end.min(other.start); + } + } + + if new_start == entity.start && new_end == entity.end { + fixed.push(entity.clone()); + continue; + } + + let mut adjusted = entity.clone(); + adjusted.start = new_start; + adjusted.end = new_end; + adjusted.text = offsets.slice(new_start, new_end)?; + fixed.push(adjusted); + } + + Ok(fixed) +} + +fn resolve_cross_label_overlaps( + entities: &[PipelineEntity], + offsets: &ByteOffsets<'_>, +) -> Result> { + let mut sorted = entities.to_vec(); + sorted.sort_by_key(|entity| entity.start); + + let mut left_index = 0; + while left_index < sorted.len() { + let mut right_index = left_index.saturating_add(1); + while right_index < sorted.len() { + let Some(left) = sorted.get(left_index) else { + break; + }; + let Some(right) = sorted.get(right_index) else { + break; + }; + if right.start >= left.end { + break; + } + if left.label == right.label + || contains_span(left, right) + || contains_span(right, left) + { + right_index = right_index.saturating_add(1); + continue; + } + + let left_len = entity_len(left); + let right_len = entity_len(right); + let left_locked = has_locked_boundary(left); + let right_locked = has_locked_boundary(right); + let left_wins = if left_locked == right_locked { + match left.score.total_cmp(&right.score) { + std::cmp::Ordering::Greater => true, + std::cmp::Ordering::Less => false, + std::cmp::Ordering::Equal => left_len >= right_len, + } + } else { + left_locked + }; + + if left_wins { + let new_start = left.end; + if let 
Some(right_mut) = sorted.get_mut(right_index) { + right_mut.start = new_start; + right_mut.text = offsets.slice(new_start, right_mut.end)?; + } + right_index = right_index.saturating_add(1); + continue; + } + + let new_end = right.start; + if let Some(left_mut) = sorted.get_mut(left_index) { + left_mut.end = new_end; + left_mut.text = offsets.slice(left_mut.start, new_end)?; + } + break; + } + + left_index = left_index.saturating_add(1); + } + + Ok( + sorted + .into_iter() + .filter(|entity| entity.start < entity.end) + .collect(), + ) +} + +fn deduplicate_spans(entities: &[PipelineEntity]) -> Vec { + let mut seen = BTreeMap::<(u32, u32, String), PipelineEntity>::new(); + + for entity in entities { + let key = (entity.start, entity.end, entity.label.clone()); + let replace = seen + .get(&key) + .is_none_or(|existing| entity.score.total_cmp(&existing.score).is_gt()); + if replace { + seen.insert(key, entity.clone()); + } + } + + seen.into_values().collect() +} + +fn merge_adjacent( + entities: &[PipelineEntity], + offsets: &ByteOffsets<'_>, +) -> Result> { + let mut sorted = entities.to_vec(); + sorted.sort_by_key(|entity| entity.start); + let mut result = Vec::::new(); + let mut last_by_label = BTreeMap::::new(); + + for entity in &sorted { + if has_locked_boundary(entity) { + result.push(entity.clone()); + continue; + } + + let Some(previous_index) = last_by_label.get(&entity.label).copied() else { + let index = result.len(); + result.push(entity.clone()); + last_by_label.insert(entity.label.clone(), index); + continue; + }; + + let Some(previous) = result.get(previous_index) else { + let index = result.len(); + result.push(entity.clone()); + last_by_label.insert(entity.label.clone(), index); + continue; + }; + + if !has_locked_boundary(previous) && entity.start < previous.end { + merge_into_previous(&mut result, previous_index, entity, offsets)?; + continue; + } + + let gap = offsets.slice(previous.end, entity.start)?; + let gap_start = previous.end; + let 
gap_end = entity.start; + let gap_occupied = sorted.iter().any(|other| { + other.label != entity.label + && other.start < gap_end + && other.end > gap_start + }); + let legal_form_comma = (is_legal_form_organization(previous) + || is_legal_form_organization(entity)) + && gap.contains(','); + + if !has_locked_boundary(previous) + && !legal_form_comma + && entity.label != "country" + && !gap_occupied + && is_mergeable_gap(&gap) + { + merge_into_previous(&mut result, previous_index, entity, offsets)?; + continue; + } + + let index = result.len(); + result.push(entity.clone()); + last_by_label.insert(entity.label.clone(), index); + } + + Ok(result) +} + +fn remove_nested_same_label( + entities: &[PipelineEntity], +) -> Vec { + let mut sorted = entities.to_vec(); + sorted.sort_by(|left, right| { + left + .start + .cmp(&right.start) + .then_with(|| entity_len(right).cmp(&entity_len(left))) + }); + + let mut result = Vec::new(); + let mut max_end_by_label = BTreeMap::::new(); + + for entity in sorted { + if max_end_by_label + .get(&entity.label) + .is_some_and(|max_end| entity.end <= *max_end) + { + continue; + } + max_end_by_label.insert(entity.label.clone(), entity.end); + result.push(entity); + } + + result +} + +fn char_spans(text: &str) -> Vec { + let mut spans = Vec::new(); + let mut offset = 0_u32; + + for ch in text.chars() { + let width = u32::try_from(ch.len_utf8()).unwrap_or(u32::MAX); + let end = offset.saturating_add(width); + spans.push(CharSpan { + start: offset, + end, + ch, + }); + offset = end; + } + + spans +} + +fn word_boundaries(spans: &[CharSpan]) -> BTreeSet { + let mut boundaries = BTreeSet::new(); + let mut run_start = None::; + let mut run_end = None::; + + for (index, span) in spans.iter().enumerate() { + if is_word_body(span.ch) || is_word_connector_between(spans, index) { + if run_start.is_none() { + run_start = Some(span.start); + } + run_end = Some(span.end); + continue; + } + + if let (Some(start), Some(end)) = (run_start.take(), 
run_end.take()) { + boundaries.insert(start); + boundaries.insert(end); + } + } + + if let (Some(start), Some(end)) = (run_start, run_end) { + boundaries.insert(start); + boundaries.insert(end); + } + + boundaries +} + +fn is_word_connector_between(spans: &[CharSpan], index: usize) -> bool { + let Some(span) = spans.get(index) else { + return false; + }; + if !is_word_connector(span.ch) { + return false; + } + + let Some(previous) = index.checked_sub(1).and_then(|prev| spans.get(prev)) + else { + return false; + }; + let Some(next) = spans.get(index.saturating_add(1)) else { + return false; + }; + + is_word_body(previous.ch) && is_word_body(next.ch) +} + +const fn is_word_connector(ch: char) -> bool { + matches!(ch, '\'' | '\u{2018}' | '\u{2019}' | '\u{02bc}' | '\u{ff07}') +} + +fn is_word_body(ch: char) -> bool { + ch.is_alphanumeric() || is_combining_mark(ch) +} + +const fn is_combining_mark(ch: char) -> bool { + matches!( + ch, + '\u{0300}'..='\u{036f}' + | '\u{1ab0}'..='\u{1aff}' + | '\u{1dc0}'..='\u{1dff}' + | '\u{20d0}'..='\u{20ff}' + | '\u{fe20}'..='\u{fe2f}' + ) +} + +fn word_start_at( + position: u32, + boundaries: &BTreeSet, + spans: &[CharSpan], +) -> u32 { + let mut cursor = position; + while cursor > 0 && !boundaries.contains(&cursor) { + let index = spans.partition_point(|span| span.end <= cursor); + if index == 0 { + return cursor; + } + let Some(previous) = spans.get(index.saturating_sub(1)) else { + return cursor; + }; + if is_word_start_stop(previous.ch) { + return cursor; + } + cursor = previous.start; + } + cursor +} + +fn word_end_at( + position: u32, + boundaries: &BTreeSet, + spans: &[CharSpan], +) -> u32 { + let mut cursor = position; + let text_end = spans.last().map_or(0, |span| span.end); + while cursor < text_end && !boundaries.contains(&cursor) { + let index = spans.partition_point(|span| span.start < cursor); + let Some(next) = spans.get(index) else { + return cursor; + }; + if is_word_end_stop(next.ch) { + return cursor; + } + cursor 
/// Returns `true` for characters that stop the backward scan for a word
/// start: line breaks, list separators, `&`, and round or square brackets.
///
/// Unlike the end-stop set, a period is not a stop character here.
const fn is_word_start_stop(ch: char) -> bool {
  let line_break = matches!(ch, '\n' | '\r');
  let separator = matches!(ch, ',' | ';' | '&');
  let bracket = matches!(ch, '(' | ')' | '[' | ']');

  line_break || separator || bracket
}
/// Returns the UTF-8 byte length of `text` as a `u32`, saturating at
/// `u32::MAX` when the length does not fit.
pub(crate) fn byte_len(text: &str) -> u32 {
  let Ok(len) = u32::try_from(text.len()) else {
    return u32::MAX;
  };
  len
}
+ let same_label_index = overlaps.iter().find_map(|index| { + merged + .get(*index) + .is_some_and(|existing| existing.label == entity.label) + .then_some(*index) + }); + + let Some(index) = same_label_index else { + merged.push(entity); + merged.sort_by_key(|entry| entry.start); + continue; + }; + + if let Some(existing) = merged.get(index) + && should_replace(&entity, existing) + { + replace_at(&mut merged, index, entity); + } + continue; + } + + let replaces_all = overlaps.iter().all(|index| { + merged + .get(*index) + .is_some_and(|existing| should_replace(&entity, existing)) + }); + if !replaces_all { + continue; + } + + let Some(insert_at) = overlaps.first().copied() else { + continue; + }; + for index in overlaps.iter().rev() { + remove_at(&mut merged, *index); + } + insert_at_or_push(&mut merged, insert_at, entity); + } + + resolve_same_span_label_conflicts(&sanitize_entities(&merged)) +} + +fn overlapping_indexes( + entities: &[PipelineEntity], + entity: &PipelineEntity, +) -> Vec { + entities + .iter() + .enumerate() + .filter_map(|(index, existing)| { + (existing.end > entity.start && existing.start < entity.end) + .then_some(index) + }) + .collect() +} + +fn should_replace( + candidate: &PipelineEntity, + existing: &PipelineEntity, +) -> bool { + let candidate_len = entity_len(candidate); + let existing_len = entity_len(existing); + let candidate_caller_owned = is_caller_owned(candidate); + let existing_caller_owned = is_caller_owned(existing); + if candidate_caller_owned != existing_caller_owned { + return candidate_caller_owned; + } + + if literal_contains(candidate, existing) && candidate_len > existing_len { + return true; + } + if literal_contains(existing, candidate) && existing_len > candidate_len { + return false; + } + + if curated_organization_contains_fragment(candidate, existing) + && candidate_len > existing_len + { + return true; + } + if curated_organization_contains_fragment(existing, candidate) + && existing_len > candidate_len + { + 
return false; + } + + if address_contains_bare_postal(candidate, existing) + && candidate_len > existing_len + { + return true; + } + if address_contains_bare_postal(existing, candidate) + && existing_len > candidate_len + { + return false; + } + + if legal_form_contains(candidate, existing) && candidate_len > existing_len { + return true; + } + if legal_form_contains(existing, candidate) && existing_len > candidate_len { + return false; + } + + if same_start_longest_wins(candidate, existing) + && candidate_len != existing_len + { + return candidate_len > existing_len; + } + + if regex_shape_contains_trigger_fragment(candidate, existing) + && candidate_len > existing_len + { + return true; + } + if regex_shape_contains_trigger_fragment(existing, candidate) + && existing_len > candidate_len + { + return false; + } + + if person_regex_contains_name_fragment(candidate, existing) + && candidate_len > existing_len + { + return true; + } + if person_regex_contains_name_fragment(existing, candidate) + && existing_len > candidate_len + { + return false; + } + + if country_inside_person_or_org(candidate, existing) + && existing_len > candidate_len + { + return false; + } + if country_inside_person_or_org(existing, candidate) + && candidate_len > existing_len + { + return true; + } + + let candidate_priority = candidate.source.priority(); + let existing_priority = existing.source.priority(); + if candidate_priority != existing_priority { + return candidate_priority > existing_priority; + } + + match candidate.score.total_cmp(&existing.score) { + std::cmp::Ordering::Greater => true, + std::cmp::Ordering::Less => false, + std::cmp::Ordering::Equal => candidate_len > existing_len, + } +} + +fn resolve_same_span_label_conflicts( + entities: &[PipelineEntity], +) -> Vec { + if entities.len() < 2 { + return entities.to_vec(); + } + + let mut by_offsets = BTreeMap::<(u32, u32), Vec>::new(); + for (index, entity) in entities.iter().enumerate() { + by_offsets + .entry((entity.start, 
entity.end)) + .or_default() + .push(index); + } + + let mut dropped = BTreeSet::::new(); + for group in by_offsets.values() { + if group.len() < 2 { + continue; + } + + let labels = group + .iter() + .filter_map(|index| entities.get(*index)) + .map(|entity| entity.label.as_str()) + .collect::>(); + if labels.len() < 2 { + continue; + } + + let has_person = labels.contains("person"); + let has_precise_non_address = labels + .iter() + .any(|label| *label != "address" && precise_over_address(label)); + let mut yielding_to_person = BTreeSet::::new(); + + if has_person { + for index in group { + let Some(entity) = entities.get(*index) else { + continue; + }; + if !is_caller_owned(entity) && person_preferred_over(&entity.label) { + yielding_to_person.insert(*index); + } + } + } + + let mut max_priority = None::; + for index in group { + let Some(entity) = entities.get(*index) else { + continue; + }; + if is_caller_owned(entity) || yielding_to_person.contains(index) { + continue; + } + max_priority = Some(max_priority.map_or_else( + || entity.source.priority(), + |priority| priority.max(entity.source.priority()), + )); + } + + for index in group { + let Some(entity) = entities.get(*index) else { + continue; + }; + if is_caller_owned(entity) { + continue; + } + if yielding_to_person.contains(index) { + dropped.insert(*index); + continue; + } + if max_priority + .is_some_and(|priority| entity.source.priority() < priority) + { + dropped.insert(*index); + continue; + } + if has_precise_non_address && entity.label == "address" { + dropped.insert(*index); + } + } + } + + entities + .iter() + .enumerate() + .filter(|(index, _)| !dropped.contains(index)) + .map(|(_, entity)| entity.clone()) + .collect() +} + +fn replace_at( + entities: &mut [PipelineEntity], + index: usize, + entity: PipelineEntity, +) { + if let Some(slot) = entities.get_mut(index) { + *slot = entity; + } +} + +fn remove_at(entities: &mut Vec, index: usize) { + if index < entities.len() { + 
entities.remove(index); + } +} + +fn insert_at_or_push( + entities: &mut Vec, + index: usize, + entity: PipelineEntity, +) { + if index <= entities.len() { + entities.insert(index, entity); + return; + } + entities.push(entity); +} + +fn literal_contains(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { + outer.label == inner.label + && matches!( + outer.source, + DetectionSource::DenyList | DetectionSource::Gazetteer + ) + && outer.start <= inner.start + && outer.end >= inner.end +} + +fn curated_organization_contains_fragment( + outer: &PipelineEntity, + inner: &PipelineEntity, +) -> bool { + matches!( + outer.source, + DetectionSource::DenyList | DetectionSource::Gazetteer + ) && outer.label == "organization" + && matches!(inner.label.as_str(), "address" | "country") + && !is_caller_owned(inner) + && outer.start <= inner.start + && outer.end >= inner.end +} + +fn address_contains_bare_postal( + outer: &PipelineEntity, + inner: &PipelineEntity, +) -> bool { + outer.label == "address" + && inner.label == "address" + && outer.start <= inner.start + && outer.end >= inner.end + && is_bare_postal_code(&inner.text) +} + +fn legal_form_contains(outer: &PipelineEntity, inner: &PipelineEntity) -> bool { + outer.label == inner.label + && outer.source == DetectionSource::LegalForm + && outer.start <= inner.start + && outer.end >= inner.end +} + +fn same_start_longest_wins( + candidate: &PipelineEntity, + existing: &PipelineEntity, +) -> bool { + candidate.label == existing.label + && candidate.start == existing.start + && longest_wins_label(&candidate.label) +} + +fn regex_shape_contains_trigger_fragment( + outer: &PipelineEntity, + inner: &PipelineEntity, +) -> bool { + outer.label == inner.label + && outer.source == DetectionSource::Regex + && inner.source == DetectionSource::Trigger + && outer.start <= inner.start + && outer.end >= comparable_trigger_fragment_end(inner) + && regex_shape_preferred_label(&outer.label) +} + +fn 
/// Returns `true` for trailing characters that are ignored when computing the
/// comparable end of a trigger fragment: clause punctuation and ASCII
/// whitespace.
const fn is_trigger_fragment_trailing_trim(ch: char) -> bool {
  let punctuation = matches!(ch, ',' | ';' | ':' | '!' | '?');
  let whitespace = matches!(ch, ' ' | '\t' | '\n' | '\r');

  punctuation || whitespace
}

/// Returns the UTF-8 encoded length of `ch` as a `u32`, saturating at
/// `u32::MAX` if the conversion fails.
fn u32_char_len(ch: char) -> u32 {
  let Ok(len) = u32::try_from(ch.len_utf8()) else {
    return u32::MAX;
  };
  len
}
/// Returns `true` when `text` is nothing but a postal-code-shaped digit run.
///
/// Whitespace, ASCII hyphens and en dashes are ignored as separators. Every
/// remaining character must be an ASCII digit, and there must be exactly 5, 8
/// or 9 of them.
fn is_bare_postal_code(text: &str) -> bool {
  let mut digits = 0usize;

  for ch in text.chars() {
    // Separators do not count toward the digit total.
    if ch.is_whitespace() || matches!(ch, '-' | '–') {
      continue;
    }
    // Any other non-digit disqualifies the text outright.
    if !ch.is_ascii_digit() {
      return false;
    }
    digits = digits.saturating_add(1);
  }

  matches!(digits, 5 | 8 | 9)
}
Some(cleaned) = clean_entity_text(entity, &entity.text) else { + continue; + }; + sanitized.push(cleaned); + } + + sanitized +} + +pub(crate) fn sanitize_entities_with_source( + entities: &[PipelineEntity], + full_text: &str, +) -> Result> { + let offsets = ByteOffsets::new(full_text); + let mut sanitized = Vec::new(); + + for entity in entities { + if is_caller_owned(entity) || has_curated_literal_boundary(entity) { + sanitized.push(entity.clone()); + continue; + } + + let raw_text = offsets.slice(entity.start, entity.end)?; + let Some(cleaned) = clean_entity_text(entity, &raw_text) else { + continue; + }; + sanitized.push(cleaned); + } + + Ok(sanitized) +} + +fn clean_entity_text( + entity: &PipelineEntity, + raw_text: &str, +) -> Option { + let mut start_byte = 0; + let mut end_byte = raw_text.len(); + + while let Some((ch, len)) = first_char(raw_text.get(start_byte..end_byte)?) { + if ch.is_whitespace() || is_leading_trim(ch, &entity.label) { + start_byte = start_byte.saturating_add(len); + continue; + } + break; + } + + trim_leading_date_artifacts(entity, raw_text, &mut start_byte, end_byte); + + while let Some((ch, len)) = first_char(raw_text.get(start_byte..end_byte)?) { + if ch.is_whitespace() { + start_byte = start_byte.saturating_add(len); + continue; + } + break; + } + + while let Some((ch, len)) = last_char(raw_text.get(start_byte..end_byte)?) { + if ch.is_whitespace() || is_trailing_trim(ch, &entity.label) { + end_byte = end_byte.saturating_sub(len); + continue; + } + break; + } + + if should_strip_period(entity, raw_text, start_byte, end_byte) { + end_byte = end_byte.saturating_sub('.'.len_utf8()); + } + + while let Some((ch, len)) = last_char(raw_text.get(start_byte..end_byte)?) 
{ + if ch.is_whitespace() || is_trailing_trim(ch, &entity.label) { + end_byte = end_byte.saturating_sub(len); + continue; + } + break; + } + + if start_byte >= end_byte { + return None; + } + + let cleaned_raw = raw_text.get(start_byte..end_byte)?; + if !cleaned_raw.chars().any(char::is_alphanumeric) { + return None; + } + + let display_text = collapse_display_whitespace(cleaned_raw); + let start = entity + .start + .saturating_add(byte_len(raw_text.get(..start_byte).unwrap_or_default())); + let end = start.saturating_add(byte_len(cleaned_raw)); + + let mut cleaned = entity.clone(); + cleaned.start = start; + cleaned.end = end; + cleaned.text = display_text; + Some(cleaned) +} + +fn has_curated_literal_boundary(entity: &PipelineEntity) -> bool { + matches!( + entity.source, + DetectionSource::DenyList | DetectionSource::Gazetteer + ) && entity.label != "person" + && entity.source_detail != Some(SourceDetail::GazetteerExtension) + && entity + .text + .chars() + .next() + .into_iter() + .chain(entity.text.chars().next_back()) + .any(is_literal_boundary_punct) +} + +fn is_leading_trim(ch: char, label: &str) -> bool { + if label_allows_colon(label) { + matches!( + ch, + ',' | ';' | '"' | '\'' | '“' | '”' | '‘' | '’' | '«' | '¿' | '¡' + ) + } else { + matches!( + ch, + ',' | ';' | ':' | '"' | '\'' | '“' | '”' | '‘' | '’' | '«' | '¿' | '¡' + ) + } +} + +fn trim_leading_date_artifacts( + entity: &PipelineEntity, + raw_text: &str, + start_byte: &mut usize, + end_byte: usize, +) { + if !matches!(entity.label.as_str(), "date" | "date of birth") { + return; + } + + let Some(text) = raw_text.get(*start_byte..end_byte) else { + return; + }; + let dot_len = leading_dot_run_len(text); + if dot_len == 0 { + return; + } + + let should_trim = dot_len >= 2 + || text + .get(dot_len..) 
+ .and_then(|suffix| suffix.chars().next()) + .is_some_and(char::is_whitespace); + if should_trim { + *start_byte = (*start_byte).saturating_add(dot_len); + } +} + +fn leading_dot_run_len(text: &str) -> usize { + let mut len = 0usize; + for ch in text.chars() { + if ch != '.' { + break; + } + len = len.saturating_add(ch.len_utf8()); + } + len +} + +fn is_trailing_trim(ch: char, label: &str) -> bool { + if label_allows_colon(label) { + matches!( + ch, + ',' | ';' | '"' | '\'' | '“' | '”' | '‘' | '’' | '»' | '!' | '?' + ) + } else { + matches!( + ch, + ',' | ';' | ':' | '"' | '\'' | '“' | '”' | '‘' | '’' | '»' | '!' | '?' + ) + } +} + +const fn is_literal_boundary_punct(ch: char) -> bool { + matches!( + ch, + '"' + | '\'' + | '“' + | '”' + | '„' + | '‟' + | '‘' + | '’' + | '‛' + | '«' + | '»' + | '!' + | '.' + ) +} + +fn should_strip_period( + entity: &PipelineEntity, + raw_text: &str, + start_byte: usize, + end_byte: usize, +) -> bool { + if !matches!( + entity.label.as_str(), + "organization" | "location" | "address" + ) { + return false; + } + let Some(text) = raw_text.get(start_byte..end_byte) else { + return false; + }; + if !text.ends_with('.') || known_period_suffix(text) { + return false; + } + if entity.source == DetectionSource::LegalForm { + return false; + } + if entity.label == "address" && known_address_final_abbrev(text) { + return false; + } + !(entity.label == "location" && known_location_final_abbrev(text)) +} + +fn known_period_suffix(text: &str) -> bool { + LEGAL_PERIOD_SUFFIXES + .lines() + .any(|suffix| text.ends_with(suffix)) +} + +fn known_address_final_abbrev(text: &str) -> bool { + ADDRESS_FINAL_ABBREVS.lines().any(|suffix| { + text + .strip_suffix(suffix) + .is_some_and(|prefix| prefix.ends_with(char::is_whitespace)) + }) +} + +fn known_location_final_abbrev(text: &str) -> bool { + text.ends_with("D.C.") + || text + .split_whitespace() + .next_back() + .is_some_and(|token| token.chars().filter(|ch| *ch == '.').count() >= 2) +} + +fn 
/// Normalises whitespace runs in `text` for display.
///
/// Each run of whitespace is passed to `flush_whitespace`, which either keeps
/// a lone whitespace character or replaces the run with a single space.
/// Leading and trailing runs are handled the same way as interior ones.
fn collapse_display_whitespace(text: &str) -> String {
  let mut pending = String::new();
  let mut collapsed = text.chars().fold(String::new(), |mut out, ch| {
    if ch.is_whitespace() {
      pending.push(ch);
    } else {
      flush_whitespace(&mut out, &mut pending);
      out.push(ch);
    }
    out
  });

  // Emit whatever whitespace run the text ended with.
  flush_whitespace(&mut collapsed, &mut pending);
  collapsed
}

/// Writes the buffered whitespace run to `output` and empties the buffer.
///
/// A single character that is not a line break is copied through unchanged;
/// a line break or a run of two or more characters becomes one space. An empty
/// buffer writes nothing.
fn flush_whitespace(output: &mut String, whitespace: &mut String) {
  let mut run = whitespace.chars();
  let Some(first) = run.next() else {
    return;
  };
  let is_single = run.next().is_none();

  let replacement = if is_single && !matches!(first, '\n' | '\r') {
    first
  } else {
    ' '
  };
  output.push(replacement);
  whitespace.clear();
}

/// Returns the first character of `text` with its UTF-8 byte length, or
/// `None` for an empty string.
fn first_char(text: &str) -> Option<(char, usize)> {
  let ch = text.chars().next()?;
  Some((ch, ch.len_utf8()))
}

/// Returns the last character of `text` with its UTF-8 byte length, or `None`
/// for an empty string.
fn last_char(text: &str) -> Option<(char, usize)> {
  let ch = text.chars().next_back()?;
  Some((ch, ch.len_utf8()))
}
+#[derive(Clone, Debug, PartialEq)] +pub struct PipelineEntity { + pub start: u32, + pub end: u32, + pub label: String, + pub text: String, + pub score: f64, + pub source: DetectionSource, + pub source_detail: Option, + pub kind: EntityKind, +} + +impl PipelineEntity { + #[must_use] + pub fn detected( + start: u32, + end: u32, + label: impl Into, + text: impl Into, + score: f64, + source: DetectionSource, + ) -> Self { + Self { + start, + end, + label: label.into(), + text: text.into(), + score, + source, + source_detail: None, + kind: EntityKind::Detected, + } + } + + #[must_use] + pub fn coreference( + start: u32, + end: u32, + label: impl Into, + text: impl Into, + score: f64, + source_text: impl Into, + ) -> Self { + Self { + start, + end, + label: label.into(), + text: text.into(), + score, + source: DetectionSource::Coreference, + source_detail: None, + kind: EntityKind::Coreference { + source_text: source_text.into(), + }, + } + } +} diff --git a/crates/anonymize-core/src/search.rs b/crates/anonymize-core/src/search.rs new file mode 100644 index 00000000..7c0d8980 --- /dev/null +++ b/crates/anonymize-core/src/search.rs @@ -0,0 +1,663 @@ +use stella_text_search_core as text_search; + +use crate::artifact_bytes::{ArtifactReader, ArtifactWriter}; +use crate::types::{Error, Result, SearchEngine, SearchMatch}; + +const SEARCH_INDEX_ARTIFACTS_HEADER: [u8; 8] = *b"ANONIDX1"; +const SEARCH_INDEX_ARTIFACTS_VERSION: u32 = 1; + +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] +pub enum SearchPattern { + Literal(String), + LiteralWithOptions { + pattern: String, + case_insensitive: Option, + whole_words: Option, + }, + Regex(String), + RegexWithOptions { + pattern: String, + lazy: bool, + prefilter_any: Vec, + prefilter_case_insensitive: Option, + prefilter_regex: Option, + }, + Fuzzy { + pattern: String, + distance: Option, + }, +} + +#[derive( + bon::Builder, + Clone, + Copy, + Debug, + Default, + Eq, + PartialEq, + serde::Deserialize, + 
serde::Serialize, +)] +pub struct SearchOptions { + #[builder(default)] + pub literal: LiteralSearchOptions, + #[builder(default)] + pub regex: RegexSearchOptions, + #[builder(default)] + pub fuzzy: FuzzySearchOptions, +} + +#[derive( + bon::Builder, + Clone, + Copy, + Debug, + Default, + Eq, + Ord, + PartialEq, + PartialOrd, + serde::Deserialize, + serde::Serialize, +)] +pub struct LiteralSearchOptions { + #[builder(default)] + pub case_insensitive: bool, + #[builder(default)] + pub whole_words: bool, +} + +#[derive( + bon::Builder, + Clone, + Copy, + Debug, + Default, + Eq, + PartialEq, + serde::Deserialize, + serde::Serialize, +)] +pub struct RegexSearchOptions { + #[builder(default)] + pub whole_words: bool, + #[builder(default)] + pub overlap_all: bool, +} + +#[derive( + bon::Builder, + Clone, + Copy, + Debug, + Eq, + PartialEq, + serde::Deserialize, + serde::Serialize, +)] +pub struct FuzzySearchOptions { + #[builder(default)] + pub case_insensitive: bool, + #[builder(default = true)] + pub whole_words: bool, + #[builder(default)] + pub normalize_diacritics: bool, +} + +impl Default for FuzzySearchOptions { + fn default() -> Self { + Self { + case_insensitive: false, + whole_words: true, + normalize_diacritics: false, + } + } +} + +pub struct SearchIndex { + slots: Vec, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct SearchIndexArtifacts { + pub slots: Vec, +} + +impl SearchIndexArtifacts { + pub fn to_bytes(&self) -> Result> { + let mut writer = ArtifactWriter::new( + SEARCH_INDEX_ARTIFACTS_HEADER, + SEARCH_INDEX_ARTIFACTS_VERSION, + ); + writer.write_len(self.slots.len(), "search_index.slots")?; + for slot in &self.slots { + let slot_bytes = slot.to_bytes().map_err(|error| search_error(&error))?; + writer.write_len_prefixed_bytes("search_index.slot", &slot_bytes)?; + } + Ok(writer.into_bytes()) + } + + pub fn from_bytes(bytes: &[u8]) -> Result { + let mut reader = ArtifactReader::new( + bytes, + SEARCH_INDEX_ARTIFACTS_HEADER, + 
SEARCH_INDEX_ARTIFACTS_VERSION, + "search_index_artifacts", + )?; + let count = reader.read_usize()?; + let mut slots = Vec::new(); + for _ in 0..count { + slots.push( + text_search::PreparedTextSearchArtifacts::from_bytes( + reader.read_len_prefixed_bytes()?, + ) + .map_err(|error| search_error(&error))?, + ); + } + reader.finish()?; + Ok(Self { slots }) + } +} + +struct SearchSlot { + engine: SlotEngine, + search: text_search::TextSearch, + pattern_indexes: Vec, +} + +#[derive(Clone, Copy)] +enum SlotEngine { + Literal, + Regex, + Fuzzy, +} + +struct SearchIndexParts { + literals: Vec, + literal_indexes: Vec, + regex: Vec, + regex_indexes: Vec, + fuzzy: Vec, + fuzzy_indexes: Vec, +} + +struct SearchIndexArtifactCursor<'a> { + slots: &'a [text_search::PreparedTextSearchArtifacts], + index: usize, +} + +impl<'a> SearchIndexArtifactCursor<'a> { + const fn new(slots: &'a [text_search::PreparedTextSearchArtifacts]) -> Self { + Self { slots, index: 0 } + } + + fn next(&mut self) -> Result<&'a text_search::PreparedTextSearchArtifacts> { + let index = self.index; + let Some(artifacts) = self.slots.get(index) else { + return Err(search_message(format!( + "Missing prepared text-search artifact at slot {index}" + ))); + }; + self.index = self.index.saturating_add(1); + Ok(artifacts) + } + + fn finish(&self) -> Result<()> { + if self.index == self.slots.len() { + return Ok(()); + } + Err(search_message(format!( + "Expected {} prepared text-search artifacts, got {}", + self.index, + self.slots.len() + ))) + } +} + +impl SearchIndex { + pub fn new( + patterns: Vec, + options: SearchOptions, + ) -> Result { + let parts = partition_patterns(patterns)?; + build_search_index(parts, options, None) + } + + pub fn prepare_artifacts( + patterns: Vec, + options: SearchOptions, + ) -> Result { + let parts = partition_patterns(patterns)?; + let mut slots = Vec::new(); + capture_slot_artifacts( + &mut slots, + parts.literals, + literal_options(options.literal), + )?; + 
capture_regex_slot_artifacts(&mut slots, parts.regex, options.regex)?; + capture_slot_artifacts( + &mut slots, + parts.fuzzy, + fuzzy_options(options.fuzzy), + )?; + Ok(SearchIndexArtifacts { slots }) + } + + pub fn new_with_artifacts( + patterns: Vec, + options: SearchOptions, + artifacts: &SearchIndexArtifacts, + ) -> Result { + if patterns.is_empty() && !artifacts.slots.is_empty() { + return Self::new_all_literal_with_artifacts(options, artifacts); + } + + let parts = partition_patterns(patterns)?; + let mut cursor = SearchIndexArtifactCursor::new(&artifacts.slots); + let search = build_search_index(parts, options, Some(&mut cursor))?; + cursor.finish()?; + Ok(search) + } + + fn new_all_literal_with_artifacts( + options: SearchOptions, + artifacts: &SearchIndexArtifacts, + ) -> Result { + let mut cursor = SearchIndexArtifactCursor::new(&artifacts.slots); + let slot_artifacts = cursor.next()?; + let search = text_search::TextSearch::with_prepared_all_literal_artifacts( + literal_options(options.literal), + slot_artifacts, + ) + .map_err(|error| search_error(&error))?; + cursor.finish()?; + let pattern_indexes = (0..search.len()) + .map(pattern_index) + .collect::>>()?; + Ok(Self { + slots: vec![SearchSlot { + engine: SlotEngine::Literal, + search, + pattern_indexes, + }], + }) + } + + pub fn find_iter(&self, haystack: &str) -> Result> { + let mut matches = Vec::new(); + for slot in &self.slots { + for found in slot + .search + .find_iter(haystack) + .map_err(|error| search_error(&error))? 
+ { + let pattern = remap_pattern(slot, found.pattern)?; + matches.push(match slot.engine { + SlotEngine::Literal => SearchMatch::Literal { + pattern, + start: found.start, + end: found.end, + }, + SlotEngine::Regex => SearchMatch::Regex { + pattern, + start: found.start, + end: found.end, + }, + SlotEngine::Fuzzy => SearchMatch::Fuzzy { + pattern, + start: found.start, + end: found.end, + distance: found.distance.unwrap_or(0), + }, + }); + } + } + + matches.sort_by(|left, right| { + left + .start() + .cmp(&right.start()) + .then_with(|| left.end().cmp(&right.end())) + .then_with(|| left.pattern().cmp(&right.pattern())) + }); + Ok(matches) + } + + pub fn is_match(&self, haystack: &str) -> Result { + for slot in &self.slots { + if slot + .search + .is_match(haystack) + .map_err(|error| search_error(&error))? + { + return Ok(true); + } + } + + Ok(false) + } + + pub fn warm_lazy_regex(&self) -> Result<()> { + for slot in &self.slots { + slot + .search + .warm_lazy_regex() + .map_err(|error| search_error(&error))?; + } + Ok(()) + } + + #[must_use] + pub fn len(&self) -> usize { + self + .slots + .iter() + .map(|slot| slot.pattern_indexes.len()) + .fold(0usize, usize::saturating_add) + } + + #[must_use] + pub fn is_empty(&self) -> bool { + self + .slots + .iter() + .all(|slot| slot.pattern_indexes.is_empty()) + } +} + +fn partition_patterns( + patterns: Vec, +) -> Result { + let mut literals = Vec::new(); + let mut literal_indexes = Vec::new(); + let mut regex = Vec::new(); + let mut regex_indexes = Vec::new(); + let mut fuzzy = Vec::new(); + let mut fuzzy_indexes = Vec::new(); + + for (index, entry) in patterns.into_iter().enumerate() { + let pattern_index = pattern_index(index)?; + match entry { + SearchPattern::Literal(pattern) => { + literals.push(text_search::PatternEntry::Auto(pattern)); + literal_indexes.push(pattern_index); + } + SearchPattern::LiteralWithOptions { + pattern, + case_insensitive, + whole_words, + } => { + 
literals.push(text_search::PatternEntry::Literal( + text_search::LiteralPattern { + pattern, + name: None, + case_insensitive, + whole_words, + }, + )); + literal_indexes.push(pattern_index); + } + SearchPattern::Regex(pattern) => { + regex.push(text_search::PatternEntry::Regex( + text_search::RegexPattern::new(pattern), + )); + regex_indexes.push(pattern_index); + } + SearchPattern::RegexWithOptions { + pattern, + lazy, + prefilter_any, + prefilter_case_insensitive, + prefilter_regex, + } => { + let mut regex_pattern = text_search::RegexPattern::new(pattern); + regex_pattern.lazy = lazy; + regex_pattern.prefilter_any = prefilter_any; + regex_pattern.prefilter_case_insensitive = prefilter_case_insensitive; + regex_pattern.prefilter_regex = prefilter_regex; + regex.push(text_search::PatternEntry::Regex(regex_pattern)); + regex_indexes.push(pattern_index); + } + SearchPattern::Fuzzy { pattern, distance } => { + fuzzy.push(text_search::PatternEntry::Fuzzy( + text_search::FuzzyPattern::new( + pattern, + distance.map_or( + text_search::FuzzyDistance::Auto, + text_search::FuzzyDistance::Exact, + ), + ), + )); + fuzzy_indexes.push(pattern_index); + } + } + } + + Ok(SearchIndexParts { + literals, + literal_indexes, + regex, + regex_indexes, + fuzzy, + fuzzy_indexes, + }) +} + +fn build_search_index( + parts: SearchIndexParts, + options: SearchOptions, + mut artifacts: Option<&mut SearchIndexArtifactCursor<'_>>, +) -> Result { + let mut slots = Vec::new(); + let literal_artifacts = slot_artifacts(&parts.literals, &mut artifacts)?; + push_slot( + &mut slots, + SlotEngine::Literal, + parts.literals, + parts.literal_indexes, + literal_options(options.literal), + literal_artifacts, + )?; + push_regex_slots( + &mut slots, + parts.regex, + parts.regex_indexes, + options.regex, + &mut artifacts, + )?; + let fuzzy_artifacts = slot_artifacts(&parts.fuzzy, &mut artifacts)?; + push_slot( + &mut slots, + SlotEngine::Fuzzy, + parts.fuzzy, + parts.fuzzy_indexes, + 
fuzzy_options(options.fuzzy), + fuzzy_artifacts, + )?; + + Ok(SearchIndex { slots }) +} + +fn slot_artifacts<'a>( + patterns: &[text_search::PatternEntry], + artifacts: &mut Option<&mut SearchIndexArtifactCursor<'a>>, +) -> Result> { + if patterns.is_empty() { + return Ok(None); + } + let Some(cursor) = artifacts else { + return Ok(None); + }; + cursor.next().map(Some) +} + +fn capture_regex_slot_artifacts( + slots: &mut Vec, + patterns: Vec, + options: RegexSearchOptions, +) -> Result<()> { + if !options.overlap_all { + return capture_slot_artifacts(slots, patterns, regex_options(options)); + } + + for pattern in patterns { + capture_slot_artifacts(slots, vec![pattern], regex_options(options))?; + } + Ok(()) +} + +fn push_regex_slots( + slots: &mut Vec, + patterns: Vec, + pattern_indexes: Vec, + options: RegexSearchOptions, + artifacts: &mut Option<&mut SearchIndexArtifactCursor<'_>>, +) -> Result<()> { + if !options.overlap_all { + let regex_artifacts = slot_artifacts(&patterns, artifacts)?; + return push_slot( + slots, + SlotEngine::Regex, + patterns, + pattern_indexes, + regex_options(options), + regex_artifacts, + ); + } + + for (pattern, pattern_index) in patterns.into_iter().zip(pattern_indexes) { + let regex_artifacts = + slot_artifacts(std::slice::from_ref(&pattern), artifacts)?; + push_slot( + slots, + SlotEngine::Regex, + vec![pattern], + vec![pattern_index], + regex_options(options), + regex_artifacts, + )?; + } + Ok(()) +} + +fn push_slot( + slots: &mut Vec, + engine: SlotEngine, + patterns: Vec, + pattern_indexes: Vec, + options: text_search::TextSearchOptions, + artifacts: Option<&text_search::PreparedTextSearchArtifacts>, +) -> Result<()> { + if patterns.is_empty() { + return Ok(()); + } + + let search = if let Some(artifacts) = artifacts { + text_search::TextSearch::with_prepared_artifacts( + patterns, options, artifacts, + ) + } else { + text_search::TextSearch::new(patterns, options) + } + .map_err(|error| search_error(&error))?; + 
slots.push(SearchSlot { + engine, + search, + pattern_indexes, + }); + Ok(()) +} + +fn capture_slot_artifacts( + slots: &mut Vec, + patterns: Vec, + options: text_search::TextSearchOptions, +) -> Result<()> { + if patterns.is_empty() { + return Ok(()); + } + slots.push( + text_search::TextSearch::prepare_artifacts(patterns, options) + .map_err(|error| search_error(&error))?, + ); + Ok(()) +} + +fn literal_options( + options: LiteralSearchOptions, +) -> text_search::TextSearchOptions { + text_search::TextSearchOptions { + case_insensitive: options.case_insensitive, + whole_words: options.whole_words, + overlap_strategy: text_search::OverlapStrategy::All, + all_literal: true, + ..text_search::TextSearchOptions::default() + } +} + +fn regex_options( + options: RegexSearchOptions, +) -> text_search::TextSearchOptions { + text_search::TextSearchOptions { + whole_words: options.whole_words, + overlap_strategy: if options.overlap_all { + text_search::OverlapStrategy::All + } else { + text_search::OverlapStrategy::Longest + }, + ..text_search::TextSearchOptions::default() + } +} + +fn fuzzy_options( + options: FuzzySearchOptions, +) -> text_search::TextSearchOptions { + text_search::TextSearchOptions { + case_insensitive: options.case_insensitive, + whole_words: options.whole_words, + normalize_diacritics: options.normalize_diacritics, + ..text_search::TextSearchOptions::default() + } +} + +fn remap_pattern(slot: &SearchSlot, local_pattern: u32) -> Result { + let index = usize::try_from(local_pattern).map_err(|_| { + Error::PatternIndexNotAddressable { + pattern: local_pattern, + } + })?; + slot + .pattern_indexes + .get(index) + .copied() + .ok_or_else(|| Error::Search { + engine: slot.engine.into(), + reason: format!("Missing pattern map entry for {local_pattern}"), + }) +} + +fn search_error(error: &text_search::Error) -> Error { + search_message(error.to_string()) +} + +const fn search_message(reason: String) -> Error { + Error::Search { + engine: SearchEngine::Text, + 
reason, + } +} + +impl From for SearchEngine { + fn from(value: SlotEngine) -> Self { + match value { + SlotEngine::Literal => Self::Literal, + SlotEngine::Regex => Self::Regex, + SlotEngine::Fuzzy => Self::Fuzzy, + } + } +} + +fn pattern_index(index: usize) -> Result { + u32::try_from(index).map_err(|_| Error::PatternIndexOutOfRange { index }) +} diff --git a/crates/anonymize-core/src/signatures.rs b/crates/anonymize-core/src/signatures.rs new file mode 100644 index 00000000..d2120ff4 --- /dev/null +++ b/crates/anonymize-core/src/signatures.rs @@ -0,0 +1,659 @@ +use crate::resolution::{DetectionSource, PipelineEntity}; + +const PERSON_LABEL: &str = "person"; +const MAX_NAME_LEN: usize = 60; +const MAX_WITNESS_SCAN_BYTES: usize = 600; +const NAME_PARTICLES: &[&str] = &[ + "de", + "del", + "della", + "der", + "den", + "di", + "du", + "da", + "das", + "do", + "dos", + "el", + "la", + "le", + "van", + "von", + "y", + "zu", + "af", + "ben", + "bin", + "al", + "d'", + "d\u{2019}", +]; +const POST_NOMINAL_SUFFIXES: &[&str] = &[ + "jr", "sr", "ii", "iii", "iv", "v", "esq", "esquire", "md", "phd", "jd", + "llm", "mba", "cpa", "pe", "rn", "dds", "dvm", "do", "cfa", "cfp", +]; +const ORG_SUFFIXES: &[&str] = &[ + "inc", + "inc.", + "llc", + "llp", + "lp", + "corp", + "corp.", + "corporation", + "ltd", + "ltd.", + "gmbh", + "ag", + "se", + "kg", + "ohg", + "sa", + "sas", + "sarl", + "s.a", + "s.a.", + "s.p.a", + "s.p.a.", + "plc", + "n.a", + "n.a.", + "n.v", + "n.v.", + "b.v", + "b.v.", + "pty ltd", + "pty ltd.", + "co", + "co.", + "s.r.o", + "s.r.o.", + "a.s", + "a.s.", + "z.s", + "z.s.", + "s.p", + "s.p.", + "s. 
p.", + "ltda", + "ltda.", + "eireli", + "epp", + "s/a", +]; + +#[must_use] +pub(crate) fn detect_signatures(full_text: &str) -> Vec { + let mut results = Vec::new(); + detect_slash_s(full_text, &mut results); + detect_labelled_names(full_text, &mut results); + detect_witness_blocks(full_text, &mut results); + results +} + +fn detect_slash_s(full_text: &str, results: &mut Vec) { + let mut cursor = 0usize; + while let Some(relative) = + full_text.get(cursor..).and_then(|tail| tail.find("/s/")) + { + let mark_start = cursor.saturating_add(relative); + let mut after_mark = mark_start.saturating_add("/s/".len()); + after_mark = skip_horizontal_ws(full_text, after_mark); + let line_end = find_line_end(full_text, after_mark); + let same_line = full_text + .get(after_mark..line_end) + .unwrap_or_default() + .trim(); + if same_line.is_empty() { + try_emit_forward_lines( + results, + full_text, + line_end.saturating_add(1), + 4, + 0.9, + ); + } else { + let first_cell_end = after_mark.saturating_add( + full_text + .get(after_mark..line_end) + .and_then(first_column_end) + .unwrap_or_else(|| line_end.saturating_sub(after_mark)), + ); + try_emit(results, full_text, after_mark, first_cell_end, 0.95); + } + + if let Some((prev_start, prev_end)) = find_prev_line(full_text, mark_start) + { + try_emit(results, full_text, prev_start, prev_end, 0.85); + } + cursor = mark_start.saturating_add("/s/".len()); + } +} + +fn detect_labelled_names(full_text: &str, results: &mut Vec) { + let mut line_start = 0usize; + while line_start <= full_text.len() { + let line_end = find_line_end(full_text, line_start); + if let Some(line) = full_text.get(line_start..line_end) { + detect_labelled_names_in_line(full_text, line_start, line, results); + } + if line_end >= full_text.len() { + break; + } + line_start = line_end.saturating_add(1); + } +} + +fn detect_labelled_names_in_line( + full_text: &str, + line_start: usize, + line: &str, + results: &mut Vec, +) { + let mut cursor = 0usize; + while let 
Some(label) = find_label(line, cursor) { + let mut value_start = label.value_start; + if let Some(after_slash) = slash_s_prefix_end(line, value_start) { + value_start = after_slash; + } + let value_end = value_start.saturating_add( + line + .get(value_start..) + .and_then(first_column_end) + .unwrap_or_else(|| line.len().saturating_sub(value_start)), + ); + let global_start = line_start.saturating_add(value_start); + let global_end = line_start.saturating_add(value_end); + let value_is_empty = line + .get(value_start..value_end) + .unwrap_or_default() + .trim() + .is_empty(); + if value_is_empty { + try_emit_forward_lines( + results, + full_text, + global_end.saturating_add(1), + 3, + 0.9, + ); + } else { + try_emit(results, full_text, global_start, global_end, 0.95); + } + cursor = value_end.max(label.next_cursor); + } +} + +fn detect_witness_blocks(full_text: &str, results: &mut Vec) { + let mut cursor = 0usize; + while let Some(relative) = find_ascii_case_insensitive( + full_text.get(cursor..).unwrap_or_default(), + "in witness whereof", + ) { + let anchor = cursor.saturating_add(relative); + if !has_word_boundaries(full_text, anchor, "in witness whereof".len()) { + cursor = anchor.saturating_add(1); + continue; + } + let anchor_line_end = find_line_end(full_text, anchor); + if anchor_line_end >= full_text.len() { + break; + } + let limit = + advance_char_boundary(full_text, anchor, MAX_WITNESS_SCAN_BYTES); + if let Some(scan_from) = find_witness_sentence_end(full_text, anchor, limit) + { + try_emit_forward_lines(results, full_text, scan_from, 6, 0.85); + } + cursor = anchor.saturating_add("in witness whereof".len()); + } +} + +fn try_emit_forward_lines( + results: &mut Vec, + full_text: &str, + from_pos: usize, + max_lines: usize, + score: f64, +) -> bool { + let mut pos = from_pos; + for _ in 0..max_lines { + if pos >= full_text.len() { + return false; + } + let line_end = find_line_end(full_text, pos); + let line = 
full_text.get(pos..line_end).unwrap_or_default().trim(); + if !line.is_empty() + && !is_image_stub(line) + && try_emit(results, full_text, pos, line_end, score) + { + return true; + } + pos = line_end.saturating_add(1); + } + false +} + +fn try_emit( + results: &mut Vec, + full_text: &str, + start: usize, + end: usize, + score: f64, +) -> bool { + let raw = full_text.get(start..end).unwrap_or_default(); + if contains_org_suffix(raw) { + return false; + } + let candidate = normalise_candidate(raw); + if !is_name_shape(&candidate) { + return false; + } + let Some(offset) = raw.find(&candidate) else { + return false; + }; + let abs_start = start.saturating_add(offset); + let abs_end = abs_start.saturating_add(candidate.len()); + let Ok(start_u32) = u32::try_from(abs_start) else { + return false; + }; + let Ok(end_u32) = u32::try_from(abs_end) else { + return false; + }; + results.push(PipelineEntity::detected( + start_u32, + end_u32, + PERSON_LABEL, + candidate, + score, + DetectionSource::Trigger, + )); + true +} + +fn normalise_candidate(text: &str) -> String { + let stripped = strip_post_nominal_suffix(text.trim()); + let first_cell_end = first_column_end(stripped).unwrap_or(stripped.len()); + stripped + .get(..first_cell_end) + .unwrap_or(stripped) + .trim() + .to_owned() +} + +fn strip_post_nominal_suffix(text: &str) -> &str { + let Some(comma) = text.rfind(',') else { + return text; + }; + let suffix = text + .get(comma.saturating_add(1)..) 
+ .unwrap_or_default() + .trim() + .trim_end_matches('.'); + let compact = suffix + .chars() + .filter(|ch| *ch != '.') + .collect::() + .to_lowercase(); + if POST_NOMINAL_SUFFIXES.contains(&compact.as_str()) { + return text.get(..comma).unwrap_or(text).trim(); + } + text +} + +fn is_name_shape(text: &str) -> bool { + let text_len = text.chars().map(char::len_utf16).sum::(); + if !(3..=MAX_NAME_LEN).contains(&text_len) { + return false; + } + let tokens = text.split([' ', '\t']).filter(|token| !token.is_empty()); + let tokens = tokens.collect::>(); + if !(2..=5).contains(&tokens.len()) { + return false; + } + let Some(first) = tokens.first() else { + return false; + }; + if !is_cap_token(first) { + return false; + } + tokens + .iter() + .skip(1) + .all(|token| is_name_particle(token) || is_cap_token(token)) +} + +fn is_cap_token(token: &str) -> bool { + let mut chars = token.chars(); + let Some(first) = chars.next() else { + return false; + }; + first.is_uppercase() + && chars.take(30).all(|ch| { + ch.is_alphabetic() + || matches!(ch, '\u{0300}'..='\u{036f}' | '.' 
| '\'' | '-' | '’') + }) +} + +fn is_name_particle(token: &str) -> bool { + NAME_PARTICLES.contains(&token) +} + +fn contains_org_suffix(text: &str) -> bool { + let lower = text.to_lowercase(); + ORG_SUFFIXES + .iter() + .any(|suffix| contains_bounded(&lower, suffix)) +} + +fn contains_bounded(text: &str, needle: &str) -> bool { + let mut cursor = 0usize; + while let Some(relative) = + text.get(cursor..).and_then(|tail| tail.find(needle)) + { + let start = cursor.saturating_add(relative); + let end = start.saturating_add(needle.len()); + if boundary_before(text, start) && boundary_after(text, end) { + return true; + } + cursor = start.saturating_add(1); + } + false +} + +fn boundary_before(text: &str, byte: usize) -> bool { + char_before(text, byte).is_none_or(|ch| !ch.is_alphanumeric()) +} + +fn boundary_after(text: &str, byte: usize) -> bool { + char_after(text, byte).is_none_or(|ch| !ch.is_alphanumeric()) +} + +fn first_column_end(text: &str) -> Option { + let mut run_start = None::; + let mut run_len = 0usize; + for (index, ch) in text.char_indices() { + if ch == '\t' { + return Some(index); + } + if ch.is_whitespace() { + if run_start.is_none() { + run_start = Some(index); + } + run_len = run_len.saturating_add(1); + if run_len >= 3 { + return run_start; + } + continue; + } + run_start = None; + run_len = 0; + } + None +} + +#[derive(Clone, Copy)] +struct LabelMatch { + value_start: usize, + next_cursor: usize, +} + +fn find_label(line: &str, from: usize) -> Option { + let mut cursor = from; + while cursor < line.len() { + if !line.is_char_boundary(cursor) { + cursor = cursor.saturating_add(1); + continue; + } + if let Some(after_label) = label_end_at(line, cursor) { + let mut after_spaces = skip_horizontal_ws(line, after_label); + if line.get(after_spaces..)?.starts_with(':') { + after_spaces = skip_horizontal_ws(line, after_spaces.saturating_add(1)); + return Some(LabelMatch { + value_start: after_spaces, + next_cursor: after_spaces.saturating_add(1), + }); 
+ } + } + cursor = cursor.saturating_add(1); + } + None +} + +fn label_end_at(line: &str, start: usize) -> Option { + if !boundary_before(line, start) { + return None; + } + if starts_with_ascii_ci(line.get(start..)?, "by") { + let end = start.saturating_add("by".len()); + return label_tail_is_valid(line, end).then_some(end); + } + if starts_with_ascii_ci(line.get(start..)?, "name") { + let end = start.saturating_add("name".len()); + return label_tail_is_valid(line, end).then_some(end); + } + None +} + +fn label_tail_is_valid(line: &str, end: usize) -> bool { + line + .get(end..) + .and_then(|tail| tail.chars().next()) + .is_some_and(|ch| ch == ':' || ch == ' ' || ch == '\t') +} + +fn slash_s_prefix_end(line: &str, start: usize) -> Option { + let tail = line.get(start..)?; + if !tail.starts_with("/s/") { + return None; + } + let after = start.saturating_add("/s/".len()); + let has_space = line + .get(after..) + .and_then(|value| value.chars().next()) + .is_some_and(|ch| ch == ' ' || ch == '\t'); + has_space.then(|| skip_horizontal_ws(line, after)) +} + +fn skip_horizontal_ws(text: &str, from: usize) -> usize { + let mut cursor = from; + while let Some(ch) = text.get(cursor..).and_then(|tail| tail.chars().next()) { + if ch != ' ' && ch != '\t' { + break; + } + cursor = cursor.saturating_add(ch.len_utf8()); + } + cursor +} + +fn find_line_end(text: &str, pos: usize) -> usize { + text + .get(pos..) 
+ .and_then(|tail| tail.find('\n')) + .map_or(text.len(), |relative| pos.saturating_add(relative)) +} + +fn find_prev_line(full_text: &str, pos: usize) -> Option<(usize, usize)> { + if pos == 0 { + return None; + } + let bytes = full_text.as_bytes(); + let mut cursor = pos.saturating_sub(1); + while cursor > 0 && bytes.get(cursor).copied() != Some(b'\n') { + cursor = cursor.saturating_sub(1); + } + if bytes.get(cursor).copied() != Some(b'\n') { + return None; + } + + while cursor > 0 { + let line_end = cursor; + let mut line_start = line_end; + while line_start > 0 + && bytes.get(line_start.saturating_sub(1)).copied() != Some(b'\n') + { + line_start = line_start.saturating_sub(1); + } + let line = full_text + .get(line_start..line_end) + .unwrap_or_default() + .trim(); + if !line.is_empty() && !is_image_stub(line) { + return Some((line_start, line_end)); + } + if line_start == 0 { + break; + } + cursor = line_start.saturating_sub(1); + } + None +} + +fn find_witness_sentence_end( + full_text: &str, + from: usize, + limit: usize, +) -> Option { + let mut line_start = from; + while line_start < limit { + let line_end = find_line_end(full_text, line_start).min(limit); + let line = full_text + .get(line_start..line_end) + .unwrap_or_default() + .trim_end(); + if line.ends_with('.') || line.ends_with(':') || line.ends_with(';') { + return Some(line_end.saturating_add(1)); + } + let next_start = line_end.saturating_add(1); + if next_start >= limit { + return None; + } + let next_end = find_line_end(full_text, next_start).min(limit); + let next_line_empty = full_text + .get(next_start..next_end) + .unwrap_or_default() + .trim() + .is_empty(); + if next_line_empty { + return Some(next_end.saturating_add(1)); + } + line_start = next_start; + } + None +} + +fn advance_char_boundary(text: &str, start: usize, max_bytes: usize) -> usize { + let limit = start.saturating_add(max_bytes).min(text.len()); + if text.is_char_boundary(limit) { + return limit; + } + let mut cursor = 
limit; + while cursor > start && !text.is_char_boundary(cursor) { + cursor = cursor.saturating_sub(1); + } + cursor +} + +fn find_ascii_case_insensitive(text: &str, needle: &str) -> Option { + let needle_len = needle.len(); + if needle_len == 0 || text.len() < needle_len { + return None; + } + let mut cursor = 0usize; + while cursor.saturating_add(needle_len) <= text.len() { + if text.is_char_boundary(cursor) + && starts_with_ascii_ci(text.get(cursor..)?, needle) + { + return Some(cursor); + } + cursor = cursor.saturating_add(1); + } + None +} + +fn starts_with_ascii_ci(text: &str, prefix: &str) -> bool { + let Some(candidate) = text.get(..prefix.len()) else { + return false; + }; + candidate.eq_ignore_ascii_case(prefix) +} + +fn has_word_boundaries(text: &str, start: usize, len: usize) -> bool { + boundary_before(text, start) + && boundary_after(text, start.saturating_add(len)) +} + +fn char_before(text: &str, byte: usize) -> Option { + text.get(..byte)?.chars().next_back() +} + +fn char_after(text: &str, byte: usize) -> Option { + text.get(byte..)?.chars().next() +} + +fn is_image_stub(line: &str) -> bool { + let lower = line.trim_start().to_lowercase(); + lower.starts_with("[img") + || lower.starts_with("[image") + || lower.starts_with("[logo") + || lower.starts_with("(logo") +} + +#[cfg(test)] +mod tests { + use super::detect_signatures; + + #[test] + fn detects_slash_signature_same_line() { + let entities = detect_signatures("/s/ Jane Doe Chief Executive Officer"); + + assert_eq!(entities.len(), 1); + assert_eq!( + entities.first().map(|entity| entity.text.as_str()), + Some("Jane Doe") + ); + } + + #[test] + fn counts_signature_name_length_in_text_units() { + let name = "Élodie ŽluťoučkýKůňÚpělĎábelskéÓdyÁÉÍÓÚÝČĎĚŇŘŠŤŽ"; + assert!(name.len() > super::MAX_NAME_LEN); + assert!( + name.chars().map(char::len_utf16).sum::() <= super::MAX_NAME_LEN + ); + + let entities = detect_signatures(&format!("/s/ {name}")); + + assert_eq!(entities.len(), 1); + assert_eq!( + 
entities.first().map(|entity| entity.text.as_str()), + Some(name) + ); + } + + #[test] + fn detects_multiple_labelled_name_columns() { + let entities = + detect_signatures("Name: Priya Ramanathan Name: Jonathan H. Whitaker"); + + assert_eq!( + entities + .iter() + .map(|entity| entity.text.as_str()) + .collect::>(), + vec!["Priya Ramanathan", "Jonathan H. Whitaker"] + ); + } + + #[test] + fn skips_organization_caption_before_signature_mark() { + let entities = detect_signatures("TWITTER, INC.\n/s/ Jane Doe"); + + assert_eq!(entities.len(), 1); + assert_eq!( + entities.first().map(|entity| entity.text.as_str()), + Some("Jane Doe") + ); + } +} diff --git a/crates/anonymize-core/src/triggers.rs b/crates/anonymize-core/src/triggers.rs new file mode 100644 index 00000000..88588cf5 --- /dev/null +++ b/crates/anonymize-core/src/triggers.rs @@ -0,0 +1,1693 @@ +use fancy_regex::Regex as FancyRegex; +use regex::{Regex, RegexBuilder}; + +use crate::byte_offsets::ByteOffsets; +use crate::diagnostics::{DiagnosticStage, StaticRedactionDiagnostics}; +use crate::resolution::{DetectionSource, PipelineEntity}; +use crate::types::{Error, Result, SearchMatch}; +use crate::validators::validate_named_id; + +use super::processors::PatternSlice; + +const TRIGGER_SCORE: f64 = 0.95; +const MAX_TRIGGER_VALUE_LEN: usize = 100; +const MIN_TRIGGER_PHONE_DIGITS: usize = 5; +const TRIGGER_LOOKAHEAD_MARGIN: usize = 128; +const LINE_TRIGGER_LOOKAHEAD: usize = 2_048; +const MATCH_PATTERN_LOOKAHEAD: usize = 512; + +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct TriggerData { + pub rules: Vec, + pub address_stop_keywords: Vec, + pub party_position_terms: Vec, + pub legal_form_suffixes: Vec, + #[serde(default)] + pub post_nominals: Vec, + pub sentence_terminal_currency_terms: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct TriggerRule { + pub trigger: String, + pub label: String, + pub strategy: 
TriggerStrategy, + pub validations: Vec, + pub include_trigger: bool, +} + +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] +pub enum TriggerStrategy { + ToNextComma { + stop_words: Vec, + max_length: Option, + }, + ToEndOfLine, + NWords { + count: u32, + }, + CompanyIdValue, + Address { + max_chars: Option, + }, + MatchPattern { + pattern: String, + flags: Option, + }, +} + +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] +pub enum TriggerValidation { + StartsUppercase, + MinLength(u32), + MaxLength(u32), + NoDigits, + HasDigits, + MatchesPattern { + pattern: String, + flags: Option, + }, + ValidId { + validator: String, + }, +} + +pub(crate) struct PreparedTriggerData { + rules: Vec, + address_stop_keywords: Vec, + party_position_terms: Vec, + legal_form_suffixes: Vec, + post_nominals: Vec, + sentence_terminal_currency_terms: Vec, +} + +struct PreparedTriggerRule { + trigger: String, + label: String, + strategy: PreparedTriggerStrategy, + validations: Vec, + include_trigger: bool, +} + +enum PreparedTriggerStrategy { + ToNextComma { + stop_words: Vec, + max_length: Option, + }, + ToEndOfLine, + NWords { + count: usize, + }, + CompanyIdValue, + Address { + max_chars: Option, + }, + MatchPattern { + regex: FancyRegex, + }, +} + +enum PreparedTriggerValidation { + StartsUppercase, + MinLength(usize), + MaxLength(usize), + NoDigits, + HasDigits, + MatchesPattern { regex: Regex }, + ValidId { validator: String }, +} + +#[derive(Clone)] +struct ExtractedValue { + start: u32, + end: u32, + text: String, +} + +struct TriggerExtractionData<'a> { + address_stop_keywords: &'a [String], + party_position_terms: &'a [String], + post_nominals: &'a [String], + sentence_terminal_currency_terms: &'a [String], +} + +impl PreparedTriggerData { + pub(crate) fn new(data: TriggerData) -> Result { + let rules = data + .rules + .into_iter() + .map(PreparedTriggerRule::new) + .collect::>>()?; + Ok(Self { + rules, + 
address_stop_keywords: data.address_stop_keywords, + party_position_terms: data.party_position_terms, + legal_form_suffixes: data.legal_form_suffixes, + post_nominals: data + .post_nominals + .into_iter() + .filter(|term| !term.trim().is_empty()) + .collect(), + sentence_terminal_currency_terms: data + .sentence_terminal_currency_terms + .into_iter() + .filter(|term| !term.is_empty()) + .collect(), + }) + } +} + +impl PreparedTriggerRule { + fn new(rule: TriggerRule) -> Result { + Ok(Self { + trigger: rule.trigger, + label: rule.label, + strategy: PreparedTriggerStrategy::new(rule.strategy)?, + validations: rule + .validations + .into_iter() + .map(PreparedTriggerValidation::new) + .collect::>>()?, + include_trigger: rule.include_trigger, + }) + } +} + +impl PreparedTriggerStrategy { + fn new(strategy: TriggerStrategy) -> Result { + Ok(match strategy { + TriggerStrategy::ToNextComma { + stop_words, + max_length, + } => Self::ToNextComma { + stop_words, + max_length: max_length.and_then(|value| usize::try_from(value).ok()), + }, + TriggerStrategy::ToEndOfLine => Self::ToEndOfLine, + TriggerStrategy::NWords { count } => Self::NWords { + count: usize::try_from(count).unwrap_or(usize::MAX), + }, + TriggerStrategy::CompanyIdValue => Self::CompanyIdValue, + TriggerStrategy::Address { max_chars } => Self::Address { + max_chars: max_chars.and_then(|value| usize::try_from(value).ok()), + }, + TriggerStrategy::MatchPattern { pattern, flags } => Self::MatchPattern { + regex: build_fancy_regex(&format!("^(?:{pattern})"), flags.as_deref())?, + }, + }) + } +} + +impl PreparedTriggerValidation { + fn new(validation: TriggerValidation) -> Result { + Ok(match validation { + TriggerValidation::StartsUppercase => Self::StartsUppercase, + TriggerValidation::MinLength(min) => { + Self::MinLength(usize::try_from(min).unwrap_or(usize::MAX)) + } + TriggerValidation::MaxLength(max) => { + Self::MaxLength(usize::try_from(max).unwrap_or(usize::MAX)) + } + TriggerValidation::NoDigits => 
Self::NoDigits, + TriggerValidation::HasDigits => Self::HasDigits, + TriggerValidation::MatchesPattern { pattern, flags } => { + Self::MatchesPattern { + regex: build_regex(&pattern, flags.as_deref())?, + } + } + TriggerValidation::ValidId { validator } => Self::ValidId { validator }, + }) + } +} + +pub(crate) fn process_trigger_matches( + matches: &[SearchMatch], + slice: PatternSlice, + full_text: &str, + data: &PreparedTriggerData, + mut diagnostics: Option<&mut StaticRedactionDiagnostics>, +) -> Result> { + let offsets = ByteOffsets::new(full_text); + let mut results = Vec::new(); + let extraction_data = TriggerExtractionData { + address_stop_keywords: &data.address_stop_keywords, + party_position_terms: &data.party_position_terms, + post_nominals: &data.post_nominals, + sentence_terminal_currency_terms: &data.sentence_terminal_currency_terms, + }; + + for found in matches { + let Some(local_index) = slice.local_index(found.pattern()) else { + continue; + }; + let Some(rule) = data.rules.get(local_index) else { + continue; + }; + if !has_left_boundary(full_text, &offsets, found.start())? { + record_trigger_rejection(&mut diagnostics, found, rule, "left-boundary"); + continue; + } + if !has_right_boundary(full_text, &offsets, found.end(), &rule.trigger)? { + record_trigger_rejection(&mut diagnostics, found, rule, "right-boundary"); + continue; + } + let Some(raw_value) = extract_value( + full_text, + &offsets, + found.end(), + &rule.strategy, + &rule.label, + &extraction_data, + )? 
+    else {
+      record_trigger_rejection(&mut diagnostics, found, rule, "empty-value");
+      continue;
+    };
+    let Some(mut value) = strip_quotes(&raw_value) else {
+      record_trigger_rejection(
+        &mut diagnostics,
+        found,
+        rule,
+        "empty-quoted-value",
+      );
+      continue;
+    };
+    if !apply_validations(&value.text, &rule.validations) {
+      record_trigger_rejection(&mut diagnostics, found, rule, "validation");
+      continue;
+    }
+    if rule.label == "phone number"
+      && !is_plausible_phone_trigger_value(&value.text)
+    {
+      record_trigger_rejection(&mut diagnostics, found, rule, "phone-shape");
+      continue;
+    }
+    if rule.label == "phone number"
+      && char_count(&value.text) > MAX_TRIGGER_VALUE_LEN
+      && char_at(full_text, &offsets, value.end)? != Some('\n')
+      && char_at(full_text, &offsets, value.end)? != Some('\t')
+    {
+      value = cap_phone_value(&value);
+    }
+    // The entity starts at the trigger when the rule includes it, otherwise at the value.
+    let entity_start = if rule.include_trigger {
+      found.start()
+    } else {
+      value.start
+    };
+    let mut entity_end = value.end;
+    let mut entity_text = offsets.slice(entity_start, entity_end)?;
+    let mut label = if rule.label == "person"
+      && has_known_legal_form_suffix(&entity_text, &data.legal_form_suffixes)
+    {
+      String::from("organization")
+    } else {
+      rule.label.clone()
+    };
+    // For a person, keep only the leading run of name tokens when it is shorter than the value.
+    if label == "person"
+      && let Some(end) = person_name_run_end(&value.text)
+      && end < value.text.len()
+      && let Some(head) = value.text.get(..end)
+    {
+      entity_end = value.start.saturating_add(u32_len(head));
+      entity_text = offsets.slice(entity_start, entity_end)?;
+    }
+
+    if label.is_empty() {
+      label.clone_from(&rule.label);
+    }
+    results.push(PipelineEntity::detected(
+      entity_start,
+      entity_end,
+      label,
+      entity_text,
+      TRIGGER_SCORE,
+      DetectionSource::Trigger,
+    ));
+  }
+
+  Ok(results)
+}
+/// Reports a rejected trigger match (pattern, label, span, reason) to the diagnostics sink; does nothing when no sink is attached.
+fn record_trigger_rejection(
+  diagnostics: &mut Option<&mut StaticRedactionDiagnostics>,
+  found: &SearchMatch,
+  rule: &PreparedTriggerRule,
+  reason: &'static str,
+) {
+  let Some(diagnostics) = diagnostics.as_deref_mut() else {
+    return;
+  };
+  diagnostics.record_rejection(
+    DiagnosticStage::EntityTrigger,
+    Some(found.pattern()),
+    Some(&rule.label),
+    Some(found.start()),
+    Some(found.end()),
+    reason,
+  );
+}
+/// Reads the text after a trigger, bounded by the lookahead of the strategy, and dispatches to the matching extractor; `Ok(None)` when only whitespace, ':' or ';' follows or the extractor finds no value.
+fn extract_value(
+  text: &str,
+  offsets: &ByteOffsets<'_>,
+  trigger_end: u32,
+  strategy: &PreparedTriggerStrategy,
+  label: &str,
+  data: &TriggerExtractionData<'_>,
+) -> Result> {
+  let trigger_end_byte = offsets.validate_offset(trigger_end)?;
+  let lookahead = get_trigger_lookahead(strategy);
+  let lookahead_end_offset = offsets.offset_after_utf16_units(
+    trigger_end,
+    u32::try_from(lookahead).unwrap_or(u32::MAX),
+  )?;
+  let lookahead_end = offsets.validate_offset(lookahead_end_offset)?;
+  let remaining = text
+    .get(trigger_end_byte..lookahead_end)
+    .unwrap_or_default();
+  let stripped = remaining.trim_start_matches(|ch: char| {
+    ch.is_whitespace() || matches!(ch, ':' | ';')
+  });
+  let trimmed_offset = remaining.len().saturating_sub(stripped.len());
+  let value_start_byte = trigger_end_byte.saturating_add(trimmed_offset);
+  if stripped.is_empty() {
+    return Ok(None);
+  }
+  // Each strategy returns a byte range; it is converted to offsets and owned text at the end.
+  let extracted = match strategy {
+    PreparedTriggerStrategy::ToNextComma {
+      stop_words,
+      max_length,
+    } => extract_to_next_comma(
+      stripped,
+      value_start_byte,
+      label,
+      stop_words,
+      max_length.unwrap_or(MAX_TRIGGER_VALUE_LEN),
+      data.post_nominals,
+      data.sentence_terminal_currency_terms,
+    ),
+    PreparedTriggerStrategy::ToEndOfLine => {
+      extract_to_end_of_line(remaining, stripped, value_start_byte, label)
+    }
+    PreparedTriggerStrategy::NWords { count } => {
+      extract_n_words(stripped, value_start_byte, *count, label)
+    }
+    PreparedTriggerStrategy::CompanyIdValue => {
+      extract_company_id_value(text, trigger_end_byte)
+    }
+    PreparedTriggerStrategy::Address { max_chars } => extract_address(
+      stripped,
+      value_start_byte,
+      max_chars.unwrap_or(120),
+      data.address_stop_keywords,
+      data.party_position_terms,
+      data.sentence_terminal_currency_terms,
+    ),
+    PreparedTriggerStrategy::MatchPattern { regex } => {
+      extract_match_pattern(stripped, value_start_byte, regex)
+    }
+  };
+  Ok(extracted.and_then(|value| byte_value_to_offsets(text, offsets, value)))
+}
+/// Scans to the first hard stop (newline, tab, bracket, ';', sentence-ending '.', stop word, or a ',' that is neither a decimal comma nor, for persons, a post-nominal), then caps the result at `length_cap` chars on a word boundary.
+fn extract_to_next_comma(
+  value_text: &str,
+  value_start_byte: usize,
+  label: &str,
+  stop_words: &[String],
+  length_cap: usize,
+  post_nominals: &[String],
+  sentence_terminal_currency_terms: &[String],
+) -> Option {
+  let mut end = 0;
+  while end < value_text.len() {
+    let Some((ch, len)) = char_at_byte(value_text, end) else {
+      break;
+    };
+    if matches!(ch, '\n' | '(' | ')' | '[' | ']' | '\t' | ';') {
+      break;
+    }
+    if ch == '.'
+      && is_sentence_terminator(
+        value_text,
+        end,
+        sentence_terminal_currency_terms,
+      )
+    {
+      break;
+    }
+    if hits_stop_word(value_text, end, stop_words) {
+      break;
+    }
+    if ch == ',' {
+      let after = value_text.get(end..).unwrap_or_default();
+      if is_decimal_comma(after) {
+        end = end.saturating_add(len);
+        continue;
+      }
+      if label == "person"
+        && let Some(skip) = post_nominal_len(after, post_nominals)
+      {
+        end = end.saturating_add(skip);
+        continue;
+      }
+      break;
+    }
+    end = end.saturating_add(len);
+  }
+  if prefix_char_count(value_text, end) > length_cap {
+    end = cap_at_word_boundary(value_text, length_cap);
+  }
+  byte_value(value_text, value_start_byte, end)
+}
+/// Takes the value up to the first newline or tab; `None` when a newline already separates the trigger from the value.
+fn extract_to_end_of_line(
+  remaining: &str,
+  value_text: &str,
+  value_start_byte: usize,
+  label: &str,
+) -> Option {
+  let consumed = remaining.len().saturating_sub(value_text.len());
+  if consumed > 0 && remaining.get(..consumed)?.contains('\n') {
+    return None;
+  }
+  let mut end = value_text.len();
+  let mut found_line_stop = false;
+  for ch in ['\n', '\t'] {
+    if let Some(index) = value_text.find(ch)
+      && index < end
+    {
+      end = index;
+      found_line_stop = true;
+    }
+  }
+  if label == "phone number"
+    && let Some(shape_end) = phone_shape_end(value_text.get(..end)?)
+    && shape_end < end
+  {
+    end = shape_end.min(MAX_TRIGGER_VALUE_LEN);
+    found_line_stop = true;
+  }
+  if !found_line_stop {
+    end = cap_at_word_boundary(value_text, end.min(MAX_TRIGGER_VALUE_LEN));
+  }
+  byte_value(value_text, value_start_byte, end)
+}
+
+/// Extracts the first `count` words of the current tab-delimited cell.
+///
+/// Tokens without any alphanumeric char and number markers ("no", "n.", "№", …)
+/// are skipped. The returned range runs from the first kept word to the last
+/// kept word. Returns `None` when the cell holds no usable word.
+fn extract_n_words(
+  value_text: &str,
+  value_start_byte: usize,
+  count: usize,
+  _label: &str,
+) -> Option {
+  let cell_end = value_text.find('\t').unwrap_or(value_text.len());
+  let cell = value_text.get(..cell_end)?;
+  let mut words = Vec::>::new();
+  for word in cell.split_whitespace() {
+    if punctuation_only(word) || number_marker(word) {
+      continue;
+    }
+    // Resume exactly at the end of the previous word. `end + 1` lands inside a
+    // code point when the separating whitespace is multi-byte (e.g. U+00A0),
+    // which made `cell.get(..)` return `None` and dropped the whole value.
+    let search_pos = words.last().map_or(0, |entry| entry.end);
+    let relative = cell.get(search_pos..)?.find(word)?;
+    let start = search_pos.saturating_add(relative);
+    words.push(WordToken {
+      _text: word,
+      start,
+      end: start.saturating_add(word.len()),
+    });
+    if words.len() >= count {
+      break;
+    }
+  }
+  let first = words.first().copied()?;
+  let last = words.last().copied()?;
+  byte_value(
+    cell.get(first.start..last.end)?,
+    value_start_byte.saturating_add(first.start),
+    last.end.saturating_sub(first.start),
+  )
+}
+
+/// A word found by `extract_n_words`, with its byte range inside the cell.
+#[derive(Clone, Copy)]
+struct WordToken<'a> {
+  _text: &'a str,
+  start: usize,
+  end: usize,
+}
+
+/// Extracts a company-ID value after the trigger: separator, optional number
+/// label, then an ID-like run with trailing punctuation removed.
+fn extract_company_id_value(
+  text: &str,
+  trigger_end_byte: usize,
+) -> Option {
+  let raw = text.get(trigger_end_byte..)?;
+  let trigger_last = text.get(..trigger_end_byte)?.chars().next_back();
+  let allow_empty_sep = matches!(trigger_last, Some('°' | 'º' | '№' | '#'));
+  let sep_len = separator_len(raw, allow_empty_sep)?;
+  let mut after_sep = raw.get(sep_len..)?;
+  let mut label_offset = 0;
+  if let Some(len) = number_label_len(after_sep) {
+    label_offset = len;
+    after_sep = after_sep.get(len..)?;
+  }
+  let id_raw = id_value_prefix(after_sep)?;
+  let id_text = id_raw.trim().trim_end_matches(|ch: char| {
+    matches!(ch, '.' | ',' | ';' | ':' | '!'
| '?')
+  });
+  if id_text.is_empty() {
+    return None;
+  }
+  let leading = id_raw.len().saturating_sub(id_raw.trim_start().len());
+  Some(ByteValue {
+    start_byte: trigger_end_byte
+      .saturating_add(sep_len)
+      .saturating_add(label_offset)
+      .saturating_add(leading),
+    end_byte: trigger_end_byte
+      .saturating_add(sep_len)
+      .saturating_add(label_offset)
+      .saturating_add(leading)
+      .saturating_add(id_text.len()),
+  })
+}
+/// Extracts an address shorter than `max_len` chars: skips a leading party-position term, stops at a newline, '(' or a stop keyword, and crosses '.' or ',' only when what follows still looks like part of the address.
+fn extract_address(
+  mut value_text: &str,
+  mut value_start_byte: usize,
+  max_len: usize,
+  stop_keywords: &[String],
+  party_position_terms: &[String],
+  sentence_terminal_currency_terms: &[String],
+) -> Option {
+  if let Some(trimmed) =
+    trim_leading_party_position(value_text, party_position_terms)
+  {
+    value_start_byte = value_start_byte.saturating_add(trimmed);
+    value_text = value_text.get(trimmed..)?;
+  }
+
+  let mut end = 0;
+  while end < value_text.len() && prefix_char_count(value_text, end) < max_len {
+    let Some((ch, len)) = char_at_byte(value_text, end) else {
+      break;
+    };
+    if matches!(ch, '\n' | '(') {
+      break;
+    }
+    if matches!(ch, ' ' | '\t')
+      && address_stop_hit(value_text.get(end..)?.trim_start(), stop_keywords)
+    {
+      break;
+    }
+    if ch == '.' {
+      let after_period = value_text.get(end.saturating_add(len)..)?;
+      if address_stop_hit(after_period.trim_start(), stop_keywords) {
+        break;
+      }
+      if let Some((next, _)) = char_at_byte(value_text, end.saturating_add(len))
+        && (next.is_alphabetic() || next.is_ascii_digit())
+      {
+        end = end.saturating_add(len);
+        continue;
+      }
+      if value_text
+        .get(end.saturating_add(len)..)
+        .is_some_and(|tail| {
+          tail.starts_with(' ')
+            && tail.trim_start().chars().next().is_some_and(|next_ch| {
+              next_ch.is_alphabetic() || next_ch.is_ascii_digit()
+            })
+        })
+        && !is_sentence_terminator(
+          value_text,
+          end,
+          sentence_terminal_currency_terms,
+        )
+      {
+        end = end.saturating_add(len);
+        continue;
+      }
+      break;
+    }
+    if ch == ',' {
+      let after = value_text.get(end.saturating_add(len)..)?.trim_start();
+      if address_stop_hit(after, stop_keywords) {
+        break;
+      }
+      if after.chars().next().is_some_and(|next_ch| {
+        next_ch.is_ascii_digit() || next_ch.is_uppercase()
+      }) {
+        end = end.saturating_add(len);
+        continue;
+      }
+      break;
+    }
+    end = end.saturating_add(len);
+  }
+  if prefix_char_count(value_text, end) >= max_len
+    && let Some(last_space) = value_text.get(..end)?.rfind(' ')
+    && last_space > 0
+  {
+    end = last_space;
+  }
+  byte_value(value_text, value_start_byte, end)
+}
+/// Runs `regex` on the first line of the value; only a non-empty match that starts at byte 0 is accepted.
+fn extract_match_pattern(
+  value_text: &str,
+  value_start_byte: usize,
+  regex: &FancyRegex,
+) -> Option {
+  let line = value_text
+    .split_once('\n')
+    .map_or(value_text, |(head, _)| head);
+  let found = regex.find(line).ok().flatten()?;
+  if found.start() != 0 || found.start() == found.end() {
+    return None;
+  }
+  Some(ByteValue {
+    start_byte: value_start_byte.saturating_add(found.start()),
+    end_byte: value_start_byte.saturating_add(found.end()),
+  })
+}
+/// Byte range (start inclusive, end exclusive) of an extracted value within the full text.
+#[derive(Clone, Copy)]
+struct ByteValue {
+  start_byte: usize,
+  end_byte: usize,
+}
+/// Byte range of `value_text[..end]` with surrounding whitespace trimmed; `None` when nothing is left or `end` is not a char boundary.
+fn byte_value(
+  value_text: &str,
+  value_start_byte: usize,
+  end: usize,
+) -> Option {
+  let raw = value_text.get(..end)?;
+  let trimmed = raw.trim();
+  if trimmed.is_empty() {
+    return None;
+  }
+  let leading = raw.len().saturating_sub(raw.trim_start().len());
+  let trailing = raw.len().saturating_sub(raw.trim_end().len());
+  Some(ByteValue {
+    start_byte: value_start_byte.saturating_add(leading),
+    end_byte: value_start_byte
+      .saturating_add(end)
+      .saturating_sub(trailing),
+  })
+}
+/// Turns a byte range into an `ExtractedValue` with u32 offsets and owned text; `None` when the range is off a char boundary or does not fit in u32.
+fn byte_value_to_offsets(
+  full_text: &str,
+  _offsets: &ByteOffsets<'_>,
+  value: ByteValue,
+) -> Option {
+  if !full_text.is_char_boundary(value.start_byte)
+    || !full_text.is_char_boundary(value.end_byte)
+  {
+    return None;
+  }
+  Some(ExtractedValue {
+    start: byte_to_offset(value.start_byte)?,
+    end: byte_to_offset(value.end_byte)?,
+    text: full_text.get(value.start_byte..value.end_byte)?.to_owned(),
+  })
+}
+/// Trims whitespace, quote marks and parentheses from both ends of the value and shifts its offsets to match; `None` when nothing remains.
+fn strip_quotes(value: &ExtractedValue) -> Option {
+  let leading = value.text.len().saturating_sub(
+    value
+      .text
+      .trim_start_matches(|ch: char| {
+        ch.is_whitespace()
+          || matches!(ch, '„' | '"' | '»' | '«' | '\'' | '(' | ')')
+      })
+      .len(),
+  );
+  let stripped = value.text.get(leading..)?.trim_end_matches(|ch: char| {
+    ch.is_whitespace() || matches!(ch, '"' | '»' | '«' | '\'' | '(' | ')')
+  });
+  if stripped.is_empty() {
+    return None;
+  }
+  Some(ExtractedValue {
+    start: value
+      .start
+      .saturating_add(u32_len(value.text.get(..leading)?)),
+    end: value
+      .start
+      .saturating_add(u32_len(value.text.get(..leading)?))
+      .saturating_add(u32_len(stripped)),
+    text: stripped.to_owned(),
+  })
+}
+/// True when `text` passes every validation in the list; lengths are counted in chars, digits are ASCII digits.
+fn apply_validations(
+  text: &str,
+  validations: &[PreparedTriggerValidation],
+) -> bool {
+  let text_len = text.chars().count();
+  validations.iter().all(|validation| match validation {
+    PreparedTriggerValidation::StartsUppercase => {
+      text.chars().next().is_some_and(char::is_uppercase)
+    }
+    PreparedTriggerValidation::MinLength(min) => text_len >= *min,
+    PreparedTriggerValidation::MaxLength(max) => text_len <= *max,
+    PreparedTriggerValidation::NoDigits => {
+      !text.chars().any(|ch| ch.is_ascii_digit())
+    }
+    PreparedTriggerValidation::HasDigits => {
+      text.chars().any(|ch| ch.is_ascii_digit())
+    }
+    PreparedTriggerValidation::MatchesPattern { regex } => regex.is_match(text),
+    PreparedTriggerValidation::ValidId { validator } => {
+      validate_named_id(validator, text)
+    }
+  })
+}
+/// Compiles `pattern`; an 'i' in `flags` turns on case-insensitive matching. Build failures are reported as `Error::Search`.
+fn build_regex(pattern: &str, flags: Option<&str>) -> Result {
+  let mut builder = RegexBuilder::new(pattern);
+  if flags.is_some_and(|flags| flags.contains('i')) {
+    builder.case_insensitive(true);
+  }
+  builder.build().map_err(|error| Error::Search {
+    engine: crate::types::SearchEngine::Regex,
+    reason: error.to_string(),
+  })
+}
+/// Compiles `pattern` as a `FancyRegex`, wrapped in `(?i:...)` when `flags` contains 'i'. Failures are reported as `Error::Search`.
+fn build_fancy_regex(pattern: &str, flags: Option<&str>) -> Result {
+  let source = if flags.is_some_and(|flags| flags.contains('i')) {
+    format!("(?i:{pattern})")
+  } else {
+    pattern.to_owned()
+  };
+  FancyRegex::new(&source).map_err(|error| Error::Search {
+    engine: crate::types::SearchEngine::Regex,
+    reason: error.to_string(),
+  })
+}
+/// Lookahead budget per strategy: the value cap plus a margin, or a fixed constant.
+fn get_trigger_lookahead(strategy: &PreparedTriggerStrategy) -> usize {
+  match strategy {
+    PreparedTriggerStrategy::ToNextComma { max_length, .. } => max_length
+      .unwrap_or(MAX_TRIGGER_VALUE_LEN)
+      .saturating_add(TRIGGER_LOOKAHEAD_MARGIN),
+    PreparedTriggerStrategy::ToEndOfLine => LINE_TRIGGER_LOOKAHEAD,
+    PreparedTriggerStrategy::NWords { count } => count
+      .saturating_mul(64)
+      .saturating_add(TRIGGER_LOOKAHEAD_MARGIN),
+    PreparedTriggerStrategy::CompanyIdValue => 256,
+    PreparedTriggerStrategy::Address { max_chars } => max_chars
+      .unwrap_or(120)
+      .saturating_add(TRIGGER_LOOKAHEAD_MARGIN),
+    PreparedTriggerStrategy::MatchPattern { .. } => MATCH_PATTERN_LOOKAHEAD,
+  }
+}
+/// True when the match starts at offset 0 or the char before it is not alphabetic.
+fn has_left_boundary(
+  text: &str,
+  offsets: &ByteOffsets<'_>,
+  start: u32,
+) -> Result {
+  if start == 0 {
+    return Ok(true);
+  }
+  let byte = offsets.validate_offset(start)?;
+  Ok(
+    !text
+      .get(..byte)
+      .and_then(|prefix| prefix.chars().next_back())
+      .is_some_and(char::is_alphabetic),
+  )
+}
+/// True when the trigger ends in a non-alphabetic char, or the char after the match is not alphabetic; false for an empty trigger.
+fn has_right_boundary(
+  text: &str,
+  offsets: &ByteOffsets<'_>,
+  end: u32,
+  trigger: &str,
+) -> Result {
+  let Some(last) = trigger.chars().next_back() else {
+    return Ok(false);
+  };
+  if !last.is_alphabetic() {
+    return Ok(true);
+  }
+  let byte = offsets.validate_offset(end)?;
+  Ok(
+    !text
+      .get(byte..)
+      .and_then(|suffix| suffix.chars().next())
+      .is_some_and(char::is_alphabetic),
+  )
+}
+
+/// Char starting at `offset`, or `None` at the end of the text. Fails when the
+/// offset does not validate.
+fn char_at(
+  text: &str,
+  offsets: &ByteOffsets<'_>,
+  offset: u32,
+) -> Result> {
+  let byte = offsets.validate_offset(offset)?;
+  Ok(text.get(byte..).and_then(|suffix| suffix.chars().next()))
+}
+
+/// Char starting at `byte` together with its UTF-8 length; `None` at the end
+/// of the text or when `byte` is not a char boundary.
+fn char_at_byte(text: &str, byte: usize) -> Option<(char, usize)> {
+  text
+    .get(byte..)
+    .and_then(|tail| tail.chars().next())
+    .map(|ch| (ch, ch.len_utf8()))
+}
+
+/// Byte index after at most `cap` chars, moved back to the start of the word
+/// when the cut would fall between two alphanumeric chars.
+fn cap_at_word_boundary(value_text: &str, cap: usize) -> usize {
+  let mut capped = byte_index_after_chars(value_text, cap);
+  while capped > 0
+    && previous_char_is_word(value_text, capped)
+    && is_word_byte(value_text, capped)
+  {
+    capped = previous_char_boundary(value_text, capped);
+  }
+  capped
+}
+
+/// Byte index just past the first `count` chars (the text length when shorter).
+fn byte_index_after_chars(value_text: &str, count: usize) -> usize {
+  value_text
+    .char_indices()
+    .nth(count)
+    .map_or(value_text.len(), |(index, _)| index)
+}
+
+/// Number of chars in `value_text[..end]`; `usize::MAX` when `end` is not a
+/// valid char boundary.
+fn prefix_char_count(value_text: &str, end: usize) -> usize {
+  value_text
+    .get(..end)
+    .map_or(usize::MAX, |prefix| prefix.chars().count())
+}
+
+/// Number of chars in the text.
+fn char_count(value_text: &str) -> usize {
+  value_text.chars().count()
+}
+
+/// True when the char ending just before `byte` is alphanumeric.
+fn previous_char_is_word(text: &str, byte: usize) -> bool {
+  text
+    .get(..byte)
+    .and_then(|prefix| prefix.chars().next_back())
+    .is_some_and(char::is_alphanumeric)
+}
+
+/// Start index of the char ending just before `byte` (0 when there is none).
+fn previous_char_boundary(text: &str, byte: usize) -> usize {
+  text
+    .get(..byte)
+    .and_then(|prefix| prefix.char_indices().next_back())
+    .map_or(0, |(index, _)| index)
+}
+
+/// True when the char starting at `byte` is alphanumeric.
+fn is_word_byte(text: &str, byte: usize) -> bool {
+  text
+    .get(byte..)
+    .and_then(|tail| tail.chars().next())
+    .is_some_and(char::is_alphanumeric)
+}
+
+/// True when a stop word starts at `byte` as a whole word.
+///
+/// The comparison is case-insensitive. The stop word must not be preceded or
+/// followed by an alphanumeric char.
+fn hits_stop_word(text: &str, byte: usize, stop_words: &[String]) -> bool {
+  if stop_words.is_empty() {
+    return false;
+  }
+  // Inspect the previous *char*, not the previous byte. `byte - 1` lies inside
+  // the code point when the preceding letter is multi-byte, so the old probe
+  // saw no letter there and matched stop words mid-word ("dne" in "týdne").
+  if previous_char_is_word(text, byte) {
+    return false;
+  }
+  let Some(tail) = text.get(byte..) else {
+    return false;
+  };
+  stop_words.iter().any(|word| {
+    unicode_case_prefix_len(tail, word).is_some_and(|word_len| {
+      tail
+        .get(word_len..)
+        .and_then(|after| after.chars().next())
+        .is_none_or(|ch| !ch.is_alphanumeric())
+    })
+  })
+}
+
+/// Byte length of the leading part of `text` that equals `prefix` when both
+/// are lowercased, compared over the same number of chars; `None` otherwise
+/// or for an empty prefix.
+fn unicode_case_prefix_len(text: &str, prefix: &str) -> Option {
+  if prefix.is_empty() {
+    return None;
+  }
+  let prefix_chars = prefix.chars().count();
+  let mut end = 0usize;
+  let mut count = 0usize;
+  for (index, ch) in text.char_indices() {
+    if count == prefix_chars {
+      break;
+    }
+    count = count.saturating_add(1);
+    end = index.saturating_add(ch.len_utf8());
+  }
+  if count != prefix_chars {
+    return None;
+  }
+  let candidate = text.get(..end)?;
+  (candidate.to_lowercase() == prefix.to_lowercase()).then_some(end)
+}
+
+/// True when `text` starts with ',' directly followed by an ASCII digit or a
+/// dash.
+fn is_decimal_comma(text: &str) -> bool {
+  let mut chars = text.chars();
+  if chars.next() != Some(',') {
+    return false;
+  }
+  chars
+    .next()
+    .is_some_and(|ch| ch.is_ascii_digit() || matches!(ch, '-' | '–' | '—'))
+}
+
+/// Byte length of ", <post-nominal>" at the start of `text`, using the longest
+/// matching term; `None` when no term follows the comma.
+fn post_nominal_len(text: &str, post_nominals: &[String]) -> Option {
+  let trimmed = text.strip_prefix(',')?.trim_start();
+  let len_before = text.len().saturating_sub(trimmed.len());
+  post_nominals
+    .iter()
+    .filter_map(|term| post_nominal_prefix_len(trimmed, term))
+    .max()
+    .map(|term_len| len_before.saturating_add(term_len))
+}
+
+/// Byte length of `term` at the start of `text`, ignoring ASCII case and
+/// allowing whitespace after each '.' of the term.
+fn post_nominal_prefix_len(text: &str, term: &str) -> Option {
+  let mut text_index = 0usize;
+  for expected in term.chars() {
+    if expected == '.' {
+      let next = text.get(text_index..)?.chars().next()?;
+      if next != '.'
{
+        return None;
+      }
+      text_index = text_index.saturating_add(next.len_utf8());
+      let rest = text.get(text_index..)?;
+      text_index = text_index
+        .saturating_add(rest.len().saturating_sub(rest.trim_start().len()));
+      continue;
+    }
+    // Any other expected char must match the text ignoring ASCII case.
+    let next = text.get(text_index..)?.chars().next()?;
+    if !next.eq_ignore_ascii_case(&expected) {
+      return None;
+    }
+    text_index = text_index.saturating_add(next.len_utf8());
+  }
+  // One trailing '.' after the term is consumed as well.
+  if text
+    .get(text_index..)
+    .is_some_and(|tail| tail.starts_with('.'))
+  {
+    text_index = text_index.saturating_add(1);
+  }
+  Some(text_index)
+}
+/// Decides whether the '.' at `period_byte` ends a sentence: what follows must look like a sentence start, and the text before must end in five or more lowercase chars, a currency term, an ASCII digit, or a proper noun followed by a real sentence.
+fn is_sentence_terminator(
+  text: &str,
+  period_byte: usize,
+  sentence_terminal_currency_terms: &[String],
+) -> bool {
+  let Some(tail) = text.get(period_byte..) else {
+    return false;
+  };
+  if !next_is_sentence_start(tail) {
+    return false;
+  }
+  let head = text.get(..period_byte).unwrap_or_default();
+  lowercase_tail_len(head) >= 5
+    || currency_tail(head, sentence_terminal_currency_terms)
+    || head
+      .chars()
+      .next_back()
+      .is_some_and(|ch| ch.is_ascii_digit())
+    || (proper_noun_tail(head) && next_is_real_sentence(tail))
+}
+/// True when only whitespace follows the leading '.', or whitespace and then an uppercase char.
+fn next_is_sentence_start(tail: &str) -> bool {
+  let Some(after_period) = tail.strip_prefix('.') else {
+    return false;
+  };
+  if after_period.trim_start().is_empty() {
+    return true;
+  }
+  if !after_period.starts_with(char::is_whitespace) {
+    return false;
+  }
+  after_period
+    .trim_start()
+    .chars()
+    .next()
+    .is_some_and(char::is_uppercase)
+}
+/// Stricter variant: whitespace after the '.', then an uppercase char followed by two lowercase chars.
+fn next_is_real_sentence(tail: &str) -> bool {
+  let Some(after_period) = tail.strip_prefix('.') else {
+    return false;
+  };
+  if !after_period.starts_with(char::is_whitespace) {
+    return false;
+  }
+  let mut chars = after_period.trim_start().chars();
+  chars.next().is_some_and(char::is_uppercase)
+    && chars.next().is_some_and(char::is_lowercase)
+    && chars.next().is_some_and(char::is_lowercase)
+}
+/// Number of consecutive lowercase chars at the end of `text`.
+fn lowercase_tail_len(text: &str) -> usize {
+  text
+    .chars()
+    .rev()
+    .take_while(|ch| ch.is_lowercase())
+    .count()
+}
+/// True when `text` ends with any of the given currency terms (see `has_currency_code_tail`).
+fn currency_tail(
+  text: &str,
+  sentence_terminal_currency_terms: &[String],
+) -> bool {
+  sentence_terminal_currency_terms
+    .iter()
+    .any(|term| has_currency_code_tail(text, term))
+}
+/// Case-insensitive check that `text` ends with `code` and that the char before it is not alphabetic.
+fn has_currency_code_tail(text: &str, code: &str) -> bool {
+  let Some(start) = text.len().checked_sub(code.len()) else {
+    return false;
+  };
+  let Some(tail) = text.get(start..) else {
+    return false;
+  };
+  if tail.to_lowercase() != code.to_lowercase() {
+    return false;
+  }
+  text
+    .get(..start)
+    .and_then(|prefix| prefix.chars().next_back())
+    .is_none_or(|ch| !ch.is_alphabetic())
+}
+/// True when `text` ends in a capitalised word (an uppercase char plus three or more lowercase chars) that is not directly preceded by '.'.
+fn proper_noun_tail(text: &str) -> bool {
+  let mut start = text.len();
+  for (index, ch) in text.char_indices().rev() {
+    if !ch.is_alphabetic() {
+      break;
+    }
+    start = index;
+  }
+  let Some(word) = text.get(start..) else {
+    return false;
+  };
+  let mut chars = word.chars();
+  if !chars.next().is_some_and(char::is_uppercase) {
+    return false;
+  }
+  if chars.clone().count() < 3 || !chars.all(char::is_lowercase) {
+    return false;
+  }
+  text
+    .get(..start)
+    .and_then(|prefix| prefix.chars().next_back())
+    .is_none_or(|ch| !ch.is_alphabetic() && ch != '.')
+}
+/// True when `text` holds no alphanumeric char (also true for an empty string).
+fn punctuation_only(text: &str) -> bool {
+  text.chars().all(|ch| !ch.is_alphanumeric())
+}
+/// True for the listed number markers, compared after ASCII lowercasing.
+fn number_marker(text: &str) -> bool {
+  matches!(
+    text.to_ascii_lowercase().as_str(),
+    "nº" | "no" | "n°" | "n." | "№"
+  )
+}
+/// Byte length of the leading phone-like run (digits, whitespace, brackets, separators), trimmed back to its last digit and extended by an optional extension suffix; `None` when the text does not start with a plus sign, an opening bracket or a digit.
+fn phone_shape_end(text: &str) -> Option {
+  let mut chars = text.char_indices();
+  let (_, first) = chars.next()?;
+  if !(first == '+' || first == '(' || first.is_ascii_digit()) {
+    return None;
+  }
+  let mut end = first.len_utf8();
+  for (index, ch) in chars {
+    if ch == '.'
+      && text
+        .get(index.saturating_add(ch.len_utf8())..)
+        .is_some_and(|tail| tail.starts_with(char::is_whitespace))
+    {
+      break;
+    }
+    if ch.is_ascii_digit()
+      || ch.is_whitespace()
+      || matches!(ch, '(' | ')' | '.' | '/' | '-' | '–' | '—' | '‑')
+    {
+      end = index.saturating_add(ch.len_utf8());
+      continue;
+    }
+    break;
+  }
+  while end > 0
+    && text
+      .get(..end)
+      .and_then(|head| head.chars().next_back())
+      .is_some_and(|ch| !ch.is_ascii_digit())
+  {
+    end = end.saturating_sub(next_len_backward(text, end));
+  }
+  if let Some(extension_len) =
+    text.get(end..).and_then(phone_extension_suffix_len)
+  {
+    end = end.saturating_add(extension_len);
+  }
+  (end > 0).then_some(end)
+}
+/// Byte length of an extension suffix at the start of `text`: optional whitespace, then "extension", "ext" (with optional '.') or "x", optional whitespace, and one to six ASCII digits; `None` when absent.
+fn phone_extension_suffix_len(text: &str) -> Option {
+  let leading = text.len().saturating_sub(text.trim_start().len());
+  let trimmed = text.get(leading..)?;
+  for label in ["extension", "ext", "x"] {
+    let Some(rest) = ascii_case_prefix_rest(trimmed, label) else {
+      continue;
+    };
+    let (rest, dot_len) = if label == "ext" {
+      rest
+        .strip_prefix('.')
+        .map_or((rest, 0_usize), |after_dot| (after_dot, 1_usize))
+    } else {
+      (rest, 0_usize)
+    };
+    let whitespace = rest.len().saturating_sub(rest.trim_start().len());
+    let digits = rest.get(whitespace..)?;
+    let mut digit_end = 0;
+    let mut digit_count = 0_usize;
+    for (index, ch) in digits.char_indices() {
+      if !ch.is_ascii_digit() || digit_count >= 6 {
+        break;
+      }
+      digit_count = digit_count.saturating_add(1);
+      digit_end = index.saturating_add(ch.len_utf8());
+    }
+    if digit_count > 0 {
+      return Some(
+        leading
+          .saturating_add(label.len())
+          .saturating_add(dot_len)
+          .saturating_add(whitespace)
+          .saturating_add(digit_end),
+      );
+    }
+  }
+  None
+}
+/// Remainder of `text` after `prefix`, matched ignoring ASCII case; `None` when `text` does not start with it.
+fn ascii_case_prefix_rest<'a>(text: &'a str, prefix: &str) -> Option<&'a str> {
+  let head = text.get(..prefix.len())?;
+  if !head.eq_ignore_ascii_case(prefix) {
+    return None;
+  }
+  text.get(prefix.len()..)
+}
+/// UTF-8 length of the char that ends just before `byte` (1 when there is none).
+fn next_len_backward(text: &str, byte: usize) -> usize {
+  text
+    .get(..byte)
+    .and_then(|head| head.chars().next_back())
+    .map_or(1, char::len_utf8)
+}
+/// Accepts values that start with a plus sign, an opening bracket or a digit, are neither an ISO date nor an inline field label, and hold at least MIN_TRIGGER_PHONE_DIGITS ASCII digits.
+fn is_plausible_phone_trigger_value(value: &str) -> bool {
+  let trimmed = value.trim_start();
+  if !trimmed
+    .chars()
+    .next()
+    .is_some_and(|ch| ch == '+' || ch == '(' || ch.is_ascii_digit())
+  {
+    return false;
+  }
+  if looks_like_iso_date(trimmed) || inline_field_label(trimmed) {
+    return false;
+  }
+  trimmed.chars().filter(char::is_ascii_digit).count()
+    >= MIN_TRIGGER_PHONE_DIGITS
+}
+/// True when the first ten bytes have the shape dddd-dd-dd (ASCII digits and hyphens).
+fn looks_like_iso_date(text: &str) -> bool {
+  let bytes = text.as_bytes();
+  bytes.len() >= 10
+    && bytes
+      .get(0..4)
+      .is_some_and(|part| part.iter().all(u8::is_ascii_digit))
+    && bytes.get(4) == Some(&b'-')
+    && bytes
+      .get(5..7)
+      .is_some_and(|part| part.iter().all(u8::is_ascii_digit))
+    && bytes.get(7) == Some(&b'-')
+    && bytes
+      .get(8..10)
+      .is_some_and(|part| part.iter().all(u8::is_ascii_digit))
+}
+/// Scans at most 40 chars for a ':' that comes after two or more letters; letters, spaces, '/' and '-' may precede it, and any other char after the first letter ends the scan.
+fn inline_field_label(text: &str) -> bool {
+  let mut letters = 0_usize;
+  for ch in text.chars().take(40) {
+    if ch == ':' && letters >= 2 {
+      return true;
+    }
+    if ch.is_alphabetic() || matches!(ch, ' ' | '/' | '-') {
+      letters = letters.saturating_add(usize::from(ch.is_alphabetic()));
+      continue;
+    }
+    if letters > 0 {
+      break;
+    }
+  }
+  false
+}
+/// Shortens a phone value to MAX_TRIGGER_VALUE_LEN on a word boundary, keeping its start offset and trimming trailing whitespace.
+fn cap_phone_value(value: &ExtractedValue) -> ExtractedValue {
+  let capped_end = cap_at_word_boundary(&value.text, MAX_TRIGGER_VALUE_LEN)
+    .min(MAX_TRIGGER_VALUE_LEN);
+  let capped = value.text.get(..capped_end).unwrap_or_default().trim_end();
+  ExtractedValue {
+    start: value.start,
+    end: value.start.saturating_add(u32_len(capped)),
+    text: capped.to_owned(),
+  }
+}
+/// Byte length of a leading party-position term plus the whitespace after it, when an uppercase char or ASCII digit comes next; each term is compared as given against the lowercased text.
+fn trim_leading_party_position(text: &str, terms: &[String]) -> Option {
+  for prefix in terms {
+    let prefix_len = prefix.len();
+    let Some(head) = text.get(..prefix_len) else {
+      continue;
+    };
+    if head.to_lowercase() != *prefix {
+      continue;
+    }
+    let rest = text.get(prefix_len..)?;
+    let ws_len = rest.len().saturating_sub(rest.trim_start().len());
+    if ws_len == 0 {
+      continue;
+    }
+    let candidate = rest.get(ws_len..)?;
+    if candidate
+      .chars()
+      .next()
+      .is_some_and(|ch| ch.is_uppercase() || ch.is_ascii_digit())
+    {
+      return Some(prefix_len.saturating_add(ws_len));
+    }
+  }
+  None
+}
+/// True when the lowercased text starts with a stop keyword that is followed by whitespace, listed punctuation, an ASCII digit, or nothing.
+fn address_stop_hit(text: &str, stop_keywords: &[String]) -> bool {
+  let lower = text.to_lowercase();
+  stop_keywords.iter().any(|keyword| {
+    lower.starts_with(keyword)
+      && lower
+        .get(keyword.len()..)
+        .and_then(|after| after.chars().next())
+        .is_none_or(|ch| {
+          ch.is_whitespace()
+            || matches!(ch, ':' | ';' | ',' | '.' | '!' | '?' | '(' | ')')
+            || ch.is_ascii_digit()
+        })
+  })
+}
+/// Byte length of the separator at the start of `raw`: optional whitespace, optionally ':' plus more whitespace. A zero-length separator is accepted only with `allow_empty`.
+fn separator_len(raw: &str, allow_empty: bool) -> Option {
+  let trimmed_colon = raw.trim_start();
+  let leading = raw.len().saturating_sub(trimmed_colon.len());
+  if let Some(after_colon) = trimmed_colon.strip_prefix(':') {
+    return Some(
+      leading.saturating_add(1).saturating_add(
+        after_colon
+          .len()
+          .saturating_sub(after_colon.trim_start().len()),
+      ),
+    );
+  }
+  if leading > 0 || allow_empty {
+    return Some(leading);
+  }
+  None
+}
+/// Byte length of a leading number label (ASCII case-insensitive) together with the separator that follows it.
+fn number_label_len(text: &str) -> Option {
+  let labels = ["nr", "nr.", "numer", "nº", "no", "no.", "n°", "n.", "№"];
+  for label in labels {
+    let Some(rest) = text.get(label.len()..)
else {
+      continue;
+    };
+    if text
+      .get(..label.len())
+      .is_some_and(|head| head.eq_ignore_ascii_case(label))
+      && (rest.starts_with(char::is_whitespace) || rest.starts_with(':'))
+    {
+      return Some(label.len().saturating_add(separator_len(rest, false)?));
+    }
+  }
+  None
+}
+
+/// Longest leading run of ID-like chars: ASCII digits and letters, space,
+/// '.', '-', '/' and tab. A letter is allowed only before the first digit or
+/// directly after a digit.
+///
+/// The run is returned only when it has at least two digits, spans at least
+/// five bytes, has at most three leading letters, and does not start like
+/// "1.2" (single digit, dot, digit).
+fn id_value_prefix(text: &str) -> Option<&str> {
+  let mut end = 0;
+  let mut digits = 0_usize;
+  let mut leading_alpha = 0_usize;
+  let mut previous_was_digit = false;
+  for (index, ch) in text.char_indices() {
+    let allowed = if ch.is_ascii_digit() {
+      digits = digits.saturating_add(1);
+      previous_was_digit = true;
+      true
+    } else if ch.is_ascii_alphabetic() {
+      let allow = digits == 0 || previous_was_digit;
+      if digits == 0 {
+        leading_alpha = leading_alpha.saturating_add(1);
+      }
+      previous_was_digit = false;
+      allow
+    } else if matches!(ch, ' ' | '.' | '-' | '/' | '\t') {
+      previous_was_digit = false;
+      true
+    } else {
+      false
+    };
+    if !allowed {
+      break;
+    }
+    end = index.saturating_add(ch.len_utf8());
+  }
+  let candidate = text.get(..end)?;
+  (digits >= 2
+    && end >= 5
+    && leading_alpha <= 3
+    && !single_digit_dotted_prefix(candidate))
+  .then_some(candidate)
+}
+
+/// True when the text starts (after whitespace) with digit, '.', digit.
+fn single_digit_dotted_prefix(text: &str) -> bool {
+  let mut chars = text.trim_start().chars();
+  let Some(first) = chars.next() else {
+    return false;
+  };
+  first.is_ascii_digit()
+    && chars.next() == Some('.')
+    && chars.next().is_some_and(|ch| ch.is_ascii_digit())
+}
+
+/// True when `text` contains one of the legal-form suffixes.
+///
+/// A suffix with any non-alphabetic char matches anywhere. A purely
+/// alphabetic suffix must stand alone, with no alphanumeric char on either
+/// side. Every occurrence is checked.
+fn has_known_legal_form_suffix(text: &str, suffixes: &[String]) -> bool {
+  suffixes.iter().any(|suffix| {
+    let mut from = 0;
+    while let Some(relative) =
+      text.get(from..).and_then(|tail| tail.find(suffix))
+    {
+      let start = from.saturating_add(relative);
+      let end = start.saturating_add(suffix.len());
+      // Step past the first *char* of this occurrence. A one-byte step lands
+      // inside a multi-byte char (e.g. a Cyrillic suffix), after which
+      // `text.get(from..)` is `None` and later occurrences are never checked.
+      from =
+        start.saturating_add(suffix.chars().next().map_or(1, char::len_utf8));
+      if !suffix.chars().all(char::is_alphabetic) {
+        return true;
+      }
+      let left = text
+        .get(..start)
+        .and_then(|head| head.chars().next_back())
+        .is_none_or(|ch| !ch.is_alphanumeric());
+      let right = text
+        .get(end..)
+        .and_then(|tail| tail.chars().next())
+        .is_none_or(|ch| !ch.is_alphanumeric());
+      if left && right {
+        return true;
+      }
+    }
+    false
+  })
+}
+
+/// Byte offset just past the leading run of name tokens; `None` when the
+/// first token is not name-like.
+fn person_name_run_end(text: &str) -> Option {
+  let mut end = 0;
+  let mut saw_token = false;
+  let tokens = text.split_whitespace().collect::>();
+  for (index, token) in tokens.iter().enumerate() {
+    let trimmed = trim_name_token(token);
+    if is_person_name_run_token(trimmed, saw_token, &tokens, index) {
+      let relative = text.get(end..)?.find(token)?;
+      end = end.saturating_add(relative).saturating_add(token.len());
+      saw_token = true;
+      continue;
+    }
+    break;
+  }
+  saw_token.then_some(end)
+}
+
+/// A token belongs to the run when it is capitalised. After the first token
+/// it may also be a d'X continuation, or a particle with a name after it.
+fn is_person_name_run_token(
+  token: &str,
+  saw_token: bool,
+  tokens: &[&str],
+  index: usize,
+) -> bool {
+  if is_capitalized_name_token(token) {
+    return true;
+  }
+  if !saw_token {
+    return false;
+  }
+  if is_apostrophe_name_continuation(token) {
+    return true;
+  }
+  is_name_particle(token) && has_name_after_particle(tokens, index)
+}
+
+/// Looks past consecutive particles after `index` for a capitalised token or
+/// a d'X continuation.
+fn has_name_after_particle(tokens: &[&str], index: usize) -> bool {
+  for token in tokens.iter().skip(index.saturating_add(1)) {
+    let trimmed = trim_name_token(token);
+    if is_capitalized_name_token(trimmed)
+      || is_apostrophe_name_continuation(trimmed)
+    {
+      return true;
+    }
+    if is_name_particle(trimmed) {
+      continue;
+    }
+    return false;
+  }
+  false
+}
+
+/// True when the first char of the token is uppercase.
+fn is_capitalized_name_token(token: &str) -> bool {
+  token.chars().next().is_some_and(char::is_uppercase)
+}
+
+/// True for a token of the form d'X or d’X where X starts with an uppercase
+/// char.
+fn is_apostrophe_name_continuation(token: &str) -> bool {
+  token
+    .strip_prefix("d'")
+    .or_else(|| token.strip_prefix("d’"))
+    .is_some_and(is_capitalized_name_token)
+}
+
+/// True for the listed lowercase name particles, matched exactly.
+fn is_name_particle(token: &str) -> bool {
+  matches!(
+    token,
+    "de"
+      | "del"
+      | "della"
+      | "der"
+      | "den"
+      | "di"
+      | "du"
+      | "da"
+      | "das"
+      | "do"
+      | "dos"
+      | "el"
+      | "la"
+      | "le"
+      | "van"
+      | "von"
+      | "y"
+      | "zu"
+      | "af"
+      | "ben"
+      | "bin"
+      | "al"
+      | "d'"
+      | "d’"
+  )
+}
+
+/// Strips leading and trailing commas from a name token.
+fn trim_name_token(token: &str) -> &str {
+  token.trim_matches(',')
+}
+/// Byte length of `text` as u32, saturating at `u32::MAX`.
+fn u32_len(text: &str) -> u32 {
+  u32::try_from(text.len()).unwrap_or(u32::MAX)
+}
+/// Converts a byte index to a u32 offset; `None` when it does not fit.
+fn byte_to_offset(byte: usize) -> Option {
+  u32::try_from(byte).ok()
+}
+// Unit tests that drive `process_trigger_matches` with a Czech court trigger.
+#[cfg(test)]
+#[allow(clippy::indexing_slicing, clippy::unwrap_used)]
+mod tests {
+  use crate::search::{SearchIndex, SearchOptions, SearchPattern};
+
+  use super::*;
+  // With `include_trigger` set, the entity spans the trigger plus the value up to the comma.
+  #[test]
+  fn court_trigger_includes_trigger_span() {
+    let text = "zapsaná v obchodním rejstříku vedeném Krajským soudem v Ústí nad Labem, oddíl B";
+    let start = text.find("Krajským soudem").unwrap();
+    let end = start.saturating_add("Krajským soudem".len());
+    let data = PreparedTriggerData::new(TriggerData {
+      rules: vec![TriggerRule {
+        trigger: String::from("krajským soudem"),
+        label: String::from("organization"),
+        strategy: TriggerStrategy::ToNextComma {
+          stop_words: vec![String::from("oddíl")],
+          max_length: None,
+        },
+        validations: vec![TriggerValidation::MinLength(3)],
+        include_trigger: true,
+      }],
+      address_stop_keywords: Vec::new(),
+      party_position_terms: Vec::new(),
+      legal_form_suffixes: Vec::new(),
+      post_nominals: Vec::new(),
+      sentence_terminal_currency_terms: Vec::new(),
+    })
+    .unwrap();
+
+    let entities = process_trigger_matches(
+      &[SearchMatch::Literal {
+        pattern: 0,
+        start: u32::try_from(start).unwrap(),
+        end: u32::try_from(end).unwrap(),
+      }],
+      PatternSlice { start: 0, end: 1 },
+      text,
+      &data,
+      None,
+    )
+    .unwrap();
+
+    assert_eq!(entities.len(), 1);
+    assert_eq!(entities[0].label, "organization");
+    assert_eq!(entities[0].source, DetectionSource::Trigger);
+    assert_eq!(entities[0].text, "Krajským soudem v Ústí nad Labem");
+  }
+  // Same expectation when the rule sits at position 216 of a slice covering pattern indices 1372..2791.
+  #[test]
+  fn court_trigger_survives_generated_slice_shape() {
+    let text = "zapsaná v obchodním rejstříku vedeném Krajským soudem v Ústí nad Labem, oddíl B";
+    let slice = PatternSlice {
+      start: 1372,
+      end: 2791,
+    };
+    let mut patterns = Vec::new();
+    for index in 0..slice.end {
+      let pattern = if index == slice.start.saturating_add(216) {
+        String::from("krajským soudem")
+      } else {
+        format!("needle-{index}")
+      };
+      patterns.push(SearchPattern::LiteralWithOptions {
+        pattern,
+        case_insensitive: Some(true),
+        whole_words: Some(false),
+      });
+    }
+    let search = SearchIndex::new(patterns, SearchOptions::default()).unwrap();
+    let mut rules = Vec::new();
+    for index in slice.start..slice.end {
+      let trigger = if index == slice.start.saturating_add(216) {
+        String::from("krajským soudem")
+      } else {
+        format!("needle-{index}")
+      };
+      rules.push(TriggerRule {
+        trigger,
+        label: String::from("organization"),
+        strategy: TriggerStrategy::ToNextComma {
+          stop_words: vec![
+            String::from("dne"),
+            String::from("v oddíle"),
+            String::from("oddíl"),
+            String::from("vložka"),
+          ],
+          max_length: None,
+        },
+        validations: vec![TriggerValidation::MinLength(3)],
+        include_trigger: true,
+      });
+    }
+    let data = PreparedTriggerData::new(TriggerData {
+      rules,
+      address_stop_keywords: Vec::new(),
+      party_position_terms: Vec::new(),
+      legal_form_suffixes: Vec::new(),
+      post_nominals: Vec::new(),
+      sentence_terminal_currency_terms: Vec::new(),
+    })
+    .unwrap();
+
+    let matches = search.find_iter(text).unwrap();
+    let entities =
+      process_trigger_matches(&matches, slice, text, &data, None).unwrap();
+
+    assert_eq!(entities.len(), 1);
+    assert_eq!(entities[0].label, "organization");
+    assert_eq!(entities[0].source, DetectionSource::Trigger);
+    assert_eq!(entities[0].text, "Krajským soudem v Ústí nad Labem");
+  }
+  // Pads the text so that byte trigger_end + MAX_TRIGGER_VALUE_LEN + TRIGGER_LOOKAHEAD_MARGIN falls on the second byte of the two-byte char placed after the padding.
+  #[test]
+  fn court_trigger_lookahead_can_end_inside_later_utf8_scalar() {
+    let prefix = "zapsaná v obchodním rejstříku vedeném Krajským soudem v Ústí nad Labem, oddíl B";
+    let trigger_start = prefix.find("Krajským soudem").unwrap();
+    let trigger_end = trigger_start.saturating_add("Krajským soudem".len());
+    let lookahead_end = trigger_end
+      .saturating_add(MAX_TRIGGER_VALUE_LEN)
+      .saturating_add(TRIGGER_LOOKAHEAD_MARGIN);
+    let padding_len =
+      lookahead_end.saturating_sub(prefix.len()).saturating_sub(1);
+    let text = format!("{prefix}{}é trailing", "x".repeat(padding_len));
+    let data = PreparedTriggerData::new(TriggerData {
+      rules: vec![TriggerRule {
+        trigger: String::from("krajským soudem"),
+        label: String::from("organization"),
+        strategy: TriggerStrategy::ToNextComma {
+          stop_words: vec![String::from("oddíl")],
+          max_length: None,
+        },
+        validations: vec![TriggerValidation::MinLength(3)],
+        include_trigger: true,
+      }],
+      address_stop_keywords: Vec::new(),
+      party_position_terms: Vec::new(),
+      legal_form_suffixes: Vec::new(),
+      post_nominals: Vec::new(),
+      sentence_terminal_currency_terms: Vec::new(),
+    })
+    .unwrap();
+
+    let entities = process_trigger_matches(
+      &[SearchMatch::Literal {
+        pattern: 0,
+        start: u32::try_from(trigger_start).unwrap(),
+        end: u32::try_from(trigger_end).unwrap(),
+      }],
+      PatternSlice { start: 0, end: 1 },
+      &text,
+      &data,
+      None,
+    )
+    .unwrap();
+
+    assert_eq!(entities.len(), 1);
+    assert_eq!(entities[0].text, "Krajským soudem v Ústí nad Labem");
+  }
+}
diff --git a/crates/anonymize-core/src/types.rs b/crates/anonymize-core/src/types.rs
new file mode 100644
index 00000000..2e97e4af
--- /dev/null
+++ b/crates/anonymize-core/src/types.rs
@@ -0,0 +1,403 @@
+use std::collections::BTreeMap;
+use std::{error, fmt};
+
+pub type Result = std::result::Result;
+/// Error type of this module; each variant carries the values that its `Display` message prints.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Error {
+  InvalidSpan {
+    start: u32,
+    end: u32,
+  },
+  ByteOffsetOutOfBounds {
+    offset: u32,
+  },
+  ByteOffsetInsideCodepoint {
+    offset: u32,
+  },
+  Search {
+    engine: SearchEngine,
+    reason: String,
+  },
+  InvalidPackedSearchResult {
+    engine: SearchEngine,
+    len: usize,
+  },
+  PatternIndexOutOfRange {
+    index: usize,
+  },
+  PatternIndexNotAddressable {
+    pattern: u32,
+  },
+  UnsupportedRegexValidation {
+    pattern: u32,
+  },
+  UnsupportedStaticSlice {
+    slice: &'static str,
+  },
+  UnsupportedDenyListSource {
+    source: String,
+  },
+  MissingStaticData {
+    field: &'static str,
+  },
+  InvalidStaticData {
+    field: &'static str,
+    reason: String,
+  },
+  StaticDataLengthMismatch {
+    field: &'static str,
+    expected: usize,
+    actual: usize,
+  },
+}
+/// Renders one human-readable message per variant.
+impl fmt::Display for Error {
+  fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
+    match self {
+      Self::InvalidSpan { start, end } => {
+        write!(formatter, "Invalid entity span: {start}..{end}")
+      }
+      Self::ByteOffsetOutOfBounds { offset } => {
+        write!(formatter, "Byte offset is out of bounds: {offset}")
+      }
+      Self::ByteOffsetInsideCodepoint { offset } => {
+        write!(formatter, "Byte offset is not a UTF-8 boundary: {offset}")
+      }
+      Self::Search { engine, reason } => {
+        write!(formatter, "{engine} search failed: {reason}")
+      }
+      Self::InvalidPackedSearchResult { engine, len } => {
+        write!(
+          formatter,
+          "{engine} search returned malformed packed matches of length {len}"
+        )
+      }
+      Self::PatternIndexOutOfRange { index } => {
+        write!(formatter, "Search pattern index exceeds u32 range: {index}")
+      }
+      Self::PatternIndexNotAddressable { pattern } => {
+        write!(
+          formatter,
+          "Search pattern index is not addressable: {pattern}"
+        )
+      }
+      Self::UnsupportedRegexValidation { pattern } => {
+        write!(
+          formatter,
+          "Regex pattern {pattern} requires validation that is not available in core"
+        )
+      }
+      Self::UnsupportedStaticSlice { slice } => {
+        write!(
+          formatter,
+          "Static slice '{slice}' is configured but not supported by native core"
+        )
+      }
+      Self::UnsupportedDenyListSource { source } => {
+        write!(
+          formatter,
+          "Deny-list source '{source}' is not supported by native core"
+        )
+      }
+      Self::MissingStaticData { field } => {
+        write!(formatter, "Static data field '{field}' is required")
+      }
+      Self::InvalidStaticData { field, reason } => {
+        write!(
+          formatter,
+          "Static data field '{field}' is invalid: {reason}"
+        )
+      }
+      Self::StaticDataLengthMismatch {
+        field,
+        expected,
+        actual,
+      } => {
+        write!(
+          formatter,
+          "Static data field '{field}' has {actual} item(s), expected {expected}"
+        )
+      }
+    }
+  }
+}
+
+impl error::Error for
Error {} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum EntityKind { + Detected, + Coreference { source_text: String }, +} + +/// Source span with UTF-8 byte offsets. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Entity { + pub start: u32, + pub end: u32, + pub label: String, + pub text: String, + pub kind: EntityKind, +} + +impl Entity { + #[must_use] + pub fn detected( + start: u32, + end: u32, + label: impl Into, + text: impl Into, + ) -> Self { + Self { + start, + end, + label: label.into(), + text: text.into(), + kind: EntityKind::Detected, + } + } + + #[must_use] + pub fn coreference( + start: u32, + end: u32, + label: impl Into, + text: impl Into, + source_text: impl Into, + ) -> Self { + Self { + start, + end, + label: label.into(), + text: text.into(), + kind: EntityKind::Coreference { + source_text: source_text.into(), + }, + } + } +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub enum OperatorType { + #[default] + Replace, + Redact, +} + +#[derive(bon::Builder, Clone, Debug, Eq, PartialEq)] +pub struct OperatorConfig { + #[builder(default)] + pub operators: BTreeMap, + #[builder(default = String::from("[REDACTED]"))] + pub redact_string: String, +} + +impl Default for OperatorConfig { + fn default() -> Self { + Self { + operators: BTreeMap::new(), + redact_string: String::from("[REDACTED]"), + } + } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct PlaceholderEntry { + pub label: String, + pub text: String, + pub source_text: Option, + pub placeholder: String, +} + +/// Deterministic placeholder lookup for one document. 
+#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct PlaceholderMap { + entries: Vec, +} + +impl PlaceholderMap { + #[must_use] + pub fn entries(&self) -> &[PlaceholderEntry] { + &self.entries + } + + #[must_use] + pub fn get(&self, label: &str, text: &str) -> Option<&str> { + self.get_with_source(label, text, None).or_else(|| { + self + .entries + .iter() + .find(|entry| entry.label == label && entry.text == text) + .map(|entry| entry.placeholder.as_str()) + }) + } + + #[must_use] + pub(crate) fn get_entity(&self, entity: &Entity) -> Option<&str> { + self.get_with_source( + &entity.label, + &entity.text, + coreference_source_text(entity), + ) + } + + fn get_with_source( + &self, + label: &str, + text: &str, + source_text: Option<&str>, + ) -> Option<&str> { + self + .entries + .iter() + .find(|entry| { + entry.label == label + && entry.text == text + && entry.source_text.as_deref() == source_text + }) + .map(|entry| entry.placeholder.as_str()) + } + + pub(super) fn has_entity(&self, entity: &Entity) -> bool { + self.get_entity(entity).is_some() + } + + pub(super) fn push_entity(&mut self, entity: &Entity, placeholder: &str) { + self.entries.push(PlaceholderEntry { + label: entity.label.clone(), + text: entity.text.clone(), + source_text: coreference_source_text(entity).map(ToOwned::to_owned), + placeholder: placeholder.to_owned(), + }); + } +} + +fn coreference_source_text(entity: &Entity) -> Option<&str> { + let EntityKind::Coreference { source_text } = &entity.kind else { + return None; + }; + Some(source_text) +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct RedactionEntry { + pub placeholder: String, + pub original: String, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct OperatorEntry { + pub placeholder: String, + pub operator: OperatorType, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct RedactionResult { + pub redacted_text: String, + pub redaction_map: Vec, + pub operator_map: Vec, + pub entity_count: usize, +} + 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum SearchEngine { + Literal, + Regex, + Fuzzy, + Text, +} + +impl fmt::Display for SearchEngine { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Literal => formatter.write_str("literal"), + Self::Regex => formatter.write_str("regex"), + Self::Fuzzy => formatter.write_str("fuzzy"), + Self::Text => formatter.write_str("text-search"), + } + } +} + +/// Search match with the caller's pattern index and UTF-8 byte offsets. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum SearchMatch { + Literal { + pattern: u32, + start: u32, + end: u32, + }, + Regex { + pattern: u32, + start: u32, + end: u32, + }, + Fuzzy { + pattern: u32, + start: u32, + end: u32, + distance: u32, + }, +} + +impl SearchMatch { + #[must_use] + pub const fn engine(&self) -> SearchEngine { + match self { + Self::Literal { .. } => SearchEngine::Literal, + Self::Regex { .. } => SearchEngine::Regex, + Self::Fuzzy { .. } => SearchEngine::Fuzzy, + } + } + + #[must_use] + pub const fn pattern(&self) -> u32 { + match self { + Self::Literal { pattern, .. } + | Self::Regex { pattern, .. } + | Self::Fuzzy { pattern, .. } => *pattern, + } + } + + #[must_use] + pub const fn start(&self) -> u32 { + match self { + Self::Literal { start, .. } + | Self::Regex { start, .. } + | Self::Fuzzy { start, .. } => *start, + } + } + + #[must_use] + pub const fn end(&self) -> u32 { + match self { + Self::Literal { end, .. } + | Self::Regex { end, .. } + | Self::Fuzzy { end, .. } => *end, + } + } + + #[must_use] + pub(crate) const fn with_span(self, start: u32, end: u32) -> Self { + match self { + Self::Literal { pattern, .. } => Self::Literal { + pattern, + start, + end, + }, + Self::Regex { pattern, .. } => Self::Regex { + pattern, + start, + end, + }, + Self::Fuzzy { + pattern, distance, .. 
+ } => Self::Fuzzy { + pattern, + start, + end, + distance, + }, + } + } +} diff --git a/crates/anonymize-core/src/validators.rs b/crates/anonymize-core/src/validators.rs new file mode 100644 index 00000000..e2e3dfa0 --- /dev/null +++ b/crates/anonymize-core/src/validators.rs @@ -0,0 +1,11 @@ +pub(crate) fn validate_named_id(validator: &str, value: &str) -> bool { + stella_stdnum_core::validate_named_id(validator, value) +} + +pub(crate) fn validate_id( + validator: &str, + value: &str, + input: Option<&str>, +) -> bool { + stella_stdnum_core::validate_id(validator, value, input) +} diff --git a/crates/anonymize-core/src/zones.rs b/crates/anonymize-core/src/zones.rs new file mode 100644 index 00000000..d6c54d33 --- /dev/null +++ b/crates/anonymize-core/src/zones.rs @@ -0,0 +1,450 @@ +use regex::{Regex, RegexBuilder}; + +use crate::resolution::PipelineEntity; +use crate::types::{Error, Result}; + +const MIN_TABS_FOR_TABLE: usize = 2; + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct ZoneData { + #[serde(default)] + pub section_heading_patterns: Vec, + #[serde(default)] + pub signing_clauses: Vec, +} + +#[derive(Clone, Debug, Eq, PartialEq, serde::Deserialize, serde::Serialize)] +pub struct ZonePatternData { + pub pattern: String, + #[serde(default)] + pub flags: String, +} + +#[derive( + Clone, Debug, Default, Eq, PartialEq, serde::Deserialize, serde::Serialize, +)] +pub struct ZoneSigningClauseData { + #[serde(default)] + pub prefix: String, + #[serde(default)] + pub suffix: String, + #[serde(default)] + pub prepositions: Vec, +} + +pub(crate) struct PreparedZoneData { + section_heading_patterns: Vec, + signing_clause_patterns: Vec, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum DocumentZone { + Header, + Signature, + Body, + Table, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct ZoneSpan { + zone: DocumentZone, + start: u32, + end: u32, +} + +struct Line<'a> { + text: &'a str, + start: 
usize, +} + +pub(crate) struct ZoneAdjustmentResult { + pub(crate) entities: Vec, + pub(crate) boosted: usize, +} + +impl PreparedZoneData { + pub(crate) fn new(data: &ZoneData) -> Result { + Ok(Self { + section_heading_patterns: data + .section_heading_patterns + .iter() + .map(|pattern| { + compile_pattern("zone_data.section_heading_patterns", pattern) + }) + .collect::>>()?, + signing_clause_patterns: data + .signing_clauses + .iter() + .map(compile_signing_clause_pattern) + .collect::>>()?, + }) + } + + pub(crate) fn adjust_entities( + &self, + full_text: &str, + entities: Vec, + ) -> Result { + if entities.is_empty() { + return Ok(ZoneAdjustmentResult { + entities, + boosted: 0, + }); + } + + let zones = self.classify(full_text)?; + let mut boosted: usize = 0; + let adjusted = entities + .into_iter() + .map(|mut entity| { + let zone = zone_for_entity(&zones, &entity); + let adjustment = score_adjustment(zone); + if adjustment > 0.0 { + let score = f64::min(1.0, entity.score + adjustment); + if score > entity.score { + boosted = boosted.saturating_add(1); + entity.score = score; + } + } + entity + }) + .collect(); + + Ok(ZoneAdjustmentResult { + entities: adjusted, + boosted, + }) + } + + fn classify(&self, full_text: &str) -> Result> { + if full_text.is_empty() { + return Ok(Vec::new()); + } + + let lines = split_lines(full_text); + let header_end_line = + first_matching_line(&lines, &self.section_heading_patterns); + let signature_start_line = + last_matching_line(&lines, &self.signing_clause_patterns); + + let mut header_end_offset = header_end_line + .and_then(|line| lines.get(line)) + .map_or(0, |line| line.start); + let signature_start_offset = signature_start_line + .and_then(|line| lines.get(line)) + .map_or(full_text.len(), |line| line.start); + + let mut header_line = header_end_line; + if header_end_line.is_some_and(|line| line > 0) + && signature_start_line.is_some() + && header_end_offset > signature_start_offset + { + header_line = None; + 
header_end_offset = 0; + } + + let mut zones = Vec::new(); + if header_line.is_some_and(|line| line > 0) { + zones.push(ZoneSpan { + zone: DocumentZone::Header, + start: usize_to_u32("zone.header.start", 0)?, + end: usize_to_u32("zone.header.end", header_end_offset)?, + }); + } + + let body_start = if header_line.is_some_and(|line| line > 0) { + header_end_offset + } else { + 0 + }; + let body_end = signature_start_offset; + add_table_zones( + &mut zones, + &lines, + header_line.unwrap_or(0), + signature_start_line.unwrap_or(lines.len()), + body_end, + )?; + add_body_zones(&mut zones, body_start, body_end)?; + + if signature_start_line.is_some() { + zones.push(ZoneSpan { + zone: DocumentZone::Signature, + start: usize_to_u32("zone.signature.start", signature_start_offset)?, + end: usize_to_u32("zone.signature.end", full_text.len())?, + }); + } + + zones.sort_by_key(|zone| zone.start); + Ok(zones) + } +} + +fn first_matching_line( + lines: &[Line<'_>], + patterns: &[Regex], +) -> Option { + for (index, line) in lines.iter().enumerate() { + if patterns.iter().any(|pattern| pattern.is_match(line.text)) { + return Some(index); + } + } + None +} + +fn last_matching_line(lines: &[Line<'_>], patterns: &[Regex]) -> Option { + for (index, line) in lines.iter().enumerate().rev() { + if patterns.iter().any(|pattern| pattern.is_match(line.text)) { + return Some(index); + } + } + None +} + +fn add_table_zones( + zones: &mut Vec, + lines: &[Line<'_>], + start_line: usize, + end_line: usize, + body_end: usize, +) -> Result<()> { + let mut table_start = None; + for line in lines + .iter() + .enumerate() + .skip(start_line) + .take(end_line.saturating_sub(start_line)) + .map(|(_, line)| line) + { + if is_table_line(line.text) { + table_start.get_or_insert(line.start); + continue; + } + + if let Some(start) = table_start.take() { + zones.push(ZoneSpan { + zone: DocumentZone::Table, + start: usize_to_u32("zone.table.start", start)?, + end: usize_to_u32("zone.table.end", line.start)?, 
+ }); + } + } + + if let Some(start) = table_start { + zones.push(ZoneSpan { + zone: DocumentZone::Table, + start: usize_to_u32("zone.table.start", start)?, + end: usize_to_u32("zone.table.end", body_end)?, + }); + } + + Ok(()) +} + +fn add_body_zones( + zones: &mut Vec, + body_start: usize, + body_end: usize, +) -> Result<()> { + let mut special = zones.clone(); + special.sort_by_key(|zone| zone.start); + + let mut cursor = usize_to_u32("zone.body.start", body_start)?; + let body_end = usize_to_u32("zone.body.end", body_end)?; + for span in special { + if span.zone == DocumentZone::Header { + continue; + } + if span.start > cursor { + zones.push(ZoneSpan { + zone: DocumentZone::Body, + start: cursor, + end: span.start, + }); + } + cursor = u32::max(cursor, span.end); + } + + if cursor < body_end { + zones.push(ZoneSpan { + zone: DocumentZone::Body, + start: cursor, + end: body_end, + }); + } + + Ok(()) +} + +fn zone_for_entity( + zones: &[ZoneSpan], + entity: &PipelineEntity, +) -> DocumentZone { + let midpoint = f64::midpoint(f64::from(entity.start), f64::from(entity.end)); + for zone in zones { + if midpoint >= f64::from(zone.start) && midpoint < f64::from(zone.end) { + return zone.zone; + } + } + DocumentZone::Body +} + +const fn score_adjustment(zone: DocumentZone) -> f64 { + match zone { + DocumentZone::Header => 0.1, + DocumentZone::Signature => 0.15, + DocumentZone::Body => 0.0, + DocumentZone::Table => 0.05, + } +} + +fn split_lines(full_text: &str) -> Vec> { + let mut offset: usize = 0; + let mut lines = Vec::new(); + for line in full_text.split('\n') { + let start = offset; + let end = start.saturating_add(line.len()); + lines.push(Line { text: line, start }); + offset = end.saturating_add(1); + } + lines +} + +fn is_table_line(line: &str) -> bool { + line + .chars() + .filter(|ch| *ch == '\t') + .take(MIN_TABS_FOR_TABLE) + .count() + >= MIN_TABS_FOR_TABLE +} + +fn compile_pattern( + field: &'static str, + data: &ZonePatternData, +) -> Result { + let mut 
builder = RegexBuilder::new(&data.pattern); + for flag in data.flags.chars() { + match flag { + 'u' => {} + 'i' => { + builder.case_insensitive(true); + } + 'm' => { + builder.multi_line(true); + } + 's' => { + builder.dot_matches_new_line(true); + } + _ => { + return Err(Error::InvalidStaticData { + field, + reason: format!("unsupported regex flag '{flag}'"), + }); + } + } + } + builder.build().map_err(|error| Error::InvalidStaticData { + field, + reason: error.to_string(), + }) +} + +fn compile_signing_clause_pattern( + data: &ZoneSigningClauseData, +) -> Result { + let place = if data.prepositions.is_empty() { + String::from(r"\p{Lu}\p{Ll}+(?:[- ]\p{Lu}\p{Ll}+)*") + } else { + format!( + r"\p{{Lu}}\p{{Ll}}+(?:\s+(?:{})\s+\p{{Lu}}\p{{Ll}}+)*(?:\s+\p{{Lu}}\p{{Ll}}+)*", + data.prepositions.join("|") + ) + }; + let pattern = format!(r"^\s*(?:{}{}{})", data.prefix, place, data.suffix); + compile_pattern( + "zone_data.signing_clauses", + &ZonePatternData { + pattern, + flags: String::new(), + }, + ) +} + +fn usize_to_u32(field: &'static str, value: usize) -> Result { + u32::try_from(value).map_err(|_| Error::InvalidStaticData { + field, + reason: String::from("offset exceeds u32 range"), + }) +} + +#[cfg(test)] +mod tests { + #![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] + + use super::*; + use crate::resolution::{DetectionSource, PipelineEntity}; + + fn test_data() -> PreparedZoneData { + PreparedZoneData::new(&ZoneData { + section_heading_patterns: vec![ZonePatternData { + pattern: String::from(r"^\s*(?:Article|Článek)\s*1"), + flags: String::from("iu"), + }], + signing_clauses: vec![ZoneSigningClauseData { + prefix: String::from(r"(?:V|Ve)\s+"), + suffix: String::from(r"\s*,?\s*dne"), + prepositions: vec![String::from("nad")], + }], + }) + .unwrap() + } + + #[test] + fn classifies_header_table_and_signature_zones() { + let data = test_data(); + let text = [ + "Parties", + "Alice", + "Article 1", + "Name\tAddress\tId", + 
"Alice\tPrague\t123", + "Body", + "V Praze dne 1.1.2024", + "Alice", + ] + .join("\n"); + + let zones = data.classify(&text).unwrap(); + + assert_eq!(zones.first().unwrap().zone, DocumentZone::Header); + assert!(zones.iter().any(|zone| zone.zone == DocumentZone::Table)); + assert_eq!(zones.last().unwrap().zone, DocumentZone::Signature); + assert_eq!(zones.first().unwrap().start, 0); + assert_eq!( + zones.last().unwrap().end, + u32::try_from(text.len()).unwrap() + ); + } + + #[test] + fn boosts_scores_for_pii_dense_zones() { + let data = test_data(); + let text = ["Alice", "Article 1"].join("\n"); + let entities = vec![PipelineEntity::detected( + 0, + 5, + "person", + "Alice", + 0.45, + DetectionSource::Regex, + )]; + + let adjusted = data.adjust_entities(&text, entities).unwrap(); + + assert_eq!(adjusted.boosted, 1); + assert!((adjusted.entities[0].score - 0.55).abs() < 1e-12); + } +} diff --git a/crates/anonymize-core/tests/address_seed_parity.rs b/crates/anonymize-core/tests/address_seed_parity.rs new file mode 100644 index 00000000..2a1d67f4 --- /dev/null +++ b/crates/anonymize-core/tests/address_seed_parity.rs @@ -0,0 +1,293 @@ +#![allow(clippy::expect_used)] + +use stella_anonymize_core::{ + AddressSeedData, DenyListFilterData, DenyListMatchData, LiteralSearchOptions, + OperatorConfig, PatternSlice, PreparedSearch, PreparedSearchConfig, + PreparedSearchSlices, RegexMatchMeta, SearchOptions, SearchPattern, +}; + +fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { + PreparedSearchConfig { + regex_patterns: vec![], + custom_regex_patterns: vec![], + literal_patterns: vec![], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions::default(), + allowed_labels: vec![], + threshold: 0.0, + confidence_boost: false, + slices, + regex_meta: vec![], + custom_regex_meta: vec![], + deny_list_data: None, + false_positive_filters: None, + gazetteer_data: None, + country_data: None, + 
hotword_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + zone_data: None, + address_context_data: None, + coreference_data: None, + name_corpus_data: None, + date_data: None, + monetary_data: None, + } +} + +fn address_texts( + result: &stella_anonymize_core::StaticRedactionResult, +) -> Vec<&str> { + result + .resolved_entities + .iter() + .filter(|entity| entity.label == "address") + .map(|entity| entity.text.as_str()) + .collect() +} + +#[test] +fn detects_state_qualified_zip_plus_four_address_seed() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .expect("address seed data should prepare"); + + let result = prepared + .redact_static_entities( + "Registered office: CA 94304-1050. Notices follow.", + &OperatorConfig::default(), + ) + .expect("static redaction should succeed"); + + assert!( + address_texts(&result).contains(&"CA 94304-1050"), + "resolved address entities: {:?}", + result.resolved_entities, + ); + assert!(!result.redaction.redacted_text.contains("94304-1050")); +} + +#[test] +fn detects_cue_gated_br_cep_address_seed() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Rua"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + street_types: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + address_seed_data: Some(AddressSeedData { + boundary_words: Vec::new(), + br_cep_cue_words: vec![String::from("CEP")], + unit_abbreviations: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .expect("address seed data should prepare"); + + let result = 
prepared + .redact_static_entities( + "Enviar para CEP 01001-000, Rua Boa Vista, 100. Obrigado.", + &OperatorConfig::default(), + ) + .expect("static redaction should succeed"); + + assert!( + address_texts(&result).contains(&"CEP 01001-000, Rua Boa Vista, 100"), + "resolved address entities: {:?}", + result.resolved_entities, + ); + assert!(!result.redaction.redacted_text.contains("01001-000")); +} + +#[test] +fn keeps_date_like_street_name_in_address_seed_span() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from("May 15"))], + regex_meta: vec![RegexMatchMeta::new("date", 0.9)], + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("London"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Street"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + deny_list: PatternSlice { start: 0, end: 1 }, + street_types: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("London")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .expect("address seed data should prepare"); + + let result = prepared + .redact_static_entities( + "Notices go to May 15 Street, London 12345.", + &OperatorConfig::default(), + ) + .expect("static redaction should succeed"); + + assert!( + 
address_texts(&result).contains(&"May 15 Street, London 12345"), + "resolved address entities: {:?}; address seed entities: {:?}", + result.resolved_entities, + result.detections.address_seed_entities, + ); + assert!(!result.redaction.redacted_text.contains("May 15 Street")); +} + +#[test] +fn clusters_address_seeds_across_multibyte_text_gap() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("Springfield"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Street"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + street_types: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Springfield")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .expect("address seed data should prepare"); + let gap = "á".repeat(140); + let full_text = + format!("Send notices to Main Street, {gap} Springfield 12345."); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .expect("static redaction should succeed"); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.text.contains("Main Street") + && entity.text.contains("Springfield 12345")), + "resolved address entities: {:?}; address seed entities: {:?}", + result.resolved_entities, + 
result.detections.address_seed_entities, + ); +} + +#[test] +fn preserves_unit_abbreviation_inside_address_seed_span() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("Springfield"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Street"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + street_types: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Springfield")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData { + boundary_words: Vec::new(), + br_cep_cue_words: Vec::new(), + unit_abbreviations: vec![String::from("apt.")], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .expect("address seed data should prepare"); + + let suffix = "á".repeat(97); + let full_text = format!( + "Notices go to 10 Main Street, Springfield 12345 Apt. 5 {suffix}. Thank you." + ); + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .expect("static redaction should succeed"); + let expected = format!("10 Main Street, Springfield 12345 Apt. 5 {suffix}"); + + assert!( + address_texts(&result).contains(&expected.as_str()), + "resolved address entities: {:?}; address seed entities: {:?}", + result.resolved_entities, + result.detections.address_seed_entities, + ); + assert!(!result.redaction.redacted_text.contains("Apt. 
5")); + assert!(!result.redaction.redacted_text.contains(&suffix)); +} diff --git a/crates/anonymize-core/tests/builders.rs b/crates/anonymize-core/tests/builders.rs new file mode 100644 index 00000000..601ea8f2 --- /dev/null +++ b/crates/anonymize-core/tests/builders.rs @@ -0,0 +1,39 @@ +use stella_anonymize_core::{ + FuzzySearchOptions, LiteralSearchOptions, OperatorConfig, RegexSearchOptions, + SearchOptions, +}; + +#[test] +fn search_options_builder_preserves_defaults() { + let options = SearchOptions::builder() + .literal( + LiteralSearchOptions::builder() + .case_insensitive(true) + .build(), + ) + .build(); + + assert!(options.literal.case_insensitive); + assert!(!options.literal.whole_words); + assert_eq!(options.regex, RegexSearchOptions::default()); + assert_eq!(options.fuzzy, FuzzySearchOptions::default()); +} + +#[test] +fn fuzzy_options_builder_preserves_whole_word_default() { + let options = FuzzySearchOptions::builder() + .normalize_diacritics(true) + .build(); + + assert!(!options.case_insensitive); + assert!(options.whole_words); + assert!(options.normalize_diacritics); +} + +#[test] +fn operator_config_builder_preserves_redaction_default() { + let config = OperatorConfig::builder().build(); + + assert!(config.operators.is_empty()); + assert_eq!(config.redact_string, "[REDACTED]"); +} diff --git a/crates/anonymize-core/tests/false_positive_parity.rs b/crates/anonymize-core/tests/false_positive_parity.rs new file mode 100644 index 00000000..83e621f8 --- /dev/null +++ b/crates/anonymize-core/tests/false_positive_parity.rs @@ -0,0 +1,229 @@ +#![allow(clippy::expect_used, clippy::unwrap_used)] + +use std::collections::BTreeSet; + +use stella_anonymize_core::{ + DenyListFilterData, DenyListMatchData, OperatorConfig, PatternSlice, + PreparedSearch, PreparedSearchConfig, PreparedSearchSlices, RegexMatchMeta, + SearchOptions, SearchPattern, TriggerData, TriggerRule, TriggerStrategy, +}; + +fn empty_config(slices: PreparedSearchSlices) -> 
PreparedSearchConfig { + PreparedSearchConfig { + regex_patterns: vec![], + custom_regex_patterns: vec![], + literal_patterns: vec![], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions::default(), + allowed_labels: vec![], + threshold: 0.0, + confidence_boost: false, + slices, + regex_meta: vec![], + custom_regex_meta: vec![], + deny_list_data: None, + false_positive_filters: None, + gazetteer_data: None, + country_data: None, + hotword_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + zone_data: None, + address_context_data: None, + coreference_data: None, + name_corpus_data: None, + date_data: None, + monetary_data: None, + } +} + +fn empty_deny_list_data(filters: DenyListFilterData) -> DenyListMatchData { + DenyListMatchData { + labels: Vec::>::new().into(), + custom_labels: Vec::>::new().into(), + originals: vec![], + sources: Vec::>::new().into(), + filters: Some(filters), + } +} + +fn set(values: [&str; N]) -> BTreeSet { + values.into_iter().map(String::from).collect() +} + +fn resolved_texts(prepared: &PreparedSearch, text: &str) -> Vec { + prepared + .redact_static_entities(text, &OperatorConfig::default()) + .unwrap() + .resolved_entities + .into_iter() + .map(|entity| entity.text) + .collect() +} + +#[test] +fn keeps_trigger_address_with_extra_component_anchor() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("bytem"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("bytem"), + label: String::from("address"), + strategy: TriggerStrategy::Address { + max_chars: Some(80), + }, + validations: Vec::new(), + include_trigger: false, 
+ }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + deny_list_data: Some(empty_deny_list_data(DenyListFilterData { + address_component_terms: set(["sídliště"]), + ..DenyListFilterData::default() + })), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + assert_eq!( + resolved_texts(&prepared, "Trvale bytem: sídliště Barrandov."), + [String::from("sídliště Barrandov")] + ); +} + +#[test] +fn rejects_non_trigger_numbers_after_number_abbreviations() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\b\d{4}\b"))], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], + deny_list_data: Some(empty_deny_list_data(DenyListFilterData { + number_abbrev_prefixes: set(["no.", "č.", "nr."]), + ..DenyListFilterData::default() + })), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let text = "Invoice No. 1234, položka č. 5678, Akte Nr. 9012, account 7777."; + + assert_eq!(resolved_texts(&prepared, text), [String::from("7777")]); +} + +#[test] +fn rejects_document_structure_heading_organizations() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"Schedule No\. 4|Příloha č\. 2|Acme No\. 
4", + ))], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("organization", 0.9)], + deny_list_data: Some(empty_deny_list_data(DenyListFilterData { + document_heading_words: set(["schedule", "příloha"]), + document_heading_ordinal_markers: set(["no.", "č."]), + ..DenyListFilterData::default() + })), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let text = "Schedule No. 4\nPříloha č. 2\nAcme No. 4 signed."; + + assert_eq!( + resolved_texts(&prepared, text), + [String::from("Acme No. 4")] + ); +} + +#[test] +fn rejects_document_headings_without_deny_list_matching() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"Schedule No\. 4|Acme No\. 4", + ))], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("organization", 0.9)], + false_positive_filters: Some(DenyListFilterData { + document_heading_words: set(["schedule"]), + document_heading_ordinal_markers: set(["no."]), + ..DenyListFilterData::default() + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + assert_eq!( + resolved_texts(&prepared, "Schedule No. 4\nAcme No. 4 signed."), + [String::from("Acme No. 
4")] + ); +} + +#[test] +fn rejects_only_ambiguous_street_type_trigger_addresses() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("demeurant"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("demeurant"), + label: String::from("address"), + strategy: TriggerStrategy::Address { + max_chars: Some(80), + }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + deny_list_data: Some(empty_deny_list_data(DenyListFilterData { + street_types: set(["cours"]), + ambiguous_street_type_terms: set(["cours"]), + ..DenyListFilterData::default() + })), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + assert!( + resolved_texts(&prepared, "demeurant au cours du contrat.").is_empty() + ); + assert!(resolved_texts(&prepared, "demeurant Cours.").is_empty()); + assert_eq!( + resolved_texts(&prepared, "demeurant Cours Mirabeau."), + [String::from("Cours Mirabeau")] + ); +} diff --git a/crates/anonymize-core/tests/normalize.rs b/crates/anonymize-core/tests/normalize.rs new file mode 100644 index 00000000..13f5f98f --- /dev/null +++ b/crates/anonymize-core/tests/normalize.rs @@ -0,0 +1,20 @@ +use stella_anonymize_core::normalize_for_search; + +#[test] +fn normalize_for_search_matches_ts_replacements() { + assert_eq!(normalize_for_search("hello\u{00a0}world"), "hello world"); + assert_eq!(normalize_for_search("1\u{2007}000"), "1 000"); + assert_eq!(normalize_for_search("a\u{202f}b"), "a b"); + assert_eq!(normalize_for_search("2020\u{2013}2024"), 
"2020-2024"); + assert_eq!(normalize_for_search("a\u{2014}b"), "a-b"); + assert_eq!(normalize_for_search("\u{201c}hello\u{201d}"), "\"hello\""); +} + +#[test] +fn normalize_for_search_does_not_preserve_byte_width() { + let input = "a\u{00a0}\u{1f600}\u{2013}b"; + let output = normalize_for_search(input); + + assert_eq!(output, "a \u{1f600}-b"); + assert_ne!(output.len(), input.len()); +} diff --git a/crates/anonymize-core/tests/prepared.rs b/crates/anonymize-core/tests/prepared.rs new file mode 100644 index 00000000..a36a5003 --- /dev/null +++ b/crates/anonymize-core/tests/prepared.rs @@ -0,0 +1,2998 @@ +#![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] + +use std::collections::{BTreeMap, BTreeSet}; + +use stella_anonymize_core::{ + AddressContextData, AddressSeedData, AmountWordsData, CoreferenceData, + CoreferencePatternData, CountryMatchData, CurrencyData, DateData, + DenyListFilterData, DenyListMatchData, DetectionSource, DiagnosticEventKind, + DiagnosticStage, EntityKind, Error, FuzzySearchOptions, GazetteerMatchData, + HotwordRule, HotwordRuleData, LegalFormData, LiteralSearchOptions, + MagnitudeSuffixData, MonetaryData, OperatorConfig, PatternSlice, + PreparedSearch, PreparedSearchArtifacts, PreparedSearchConfig, + PreparedSearchSlices, RegexMatchMeta, RegexSearchOptions, SearchOptions, + SearchPattern, SourceDetail, TriggerData, TriggerRule, TriggerStrategy, + TriggerValidation, WrittenAmountPatternData, ZoneData, ZonePatternData, + ZoneSigningClauseData, +}; + +fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { + PreparedSearchConfig { + regex_patterns: vec![], + custom_regex_patterns: vec![], + literal_patterns: vec![], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions::default(), + allowed_labels: vec![], + threshold: 0.0, + confidence_boost: false, + slices, + regex_meta: vec![], + custom_regex_meta: vec![], + deny_list_data: 
None, + false_positive_filters: None, + gazetteer_data: None, + country_data: None, + hotword_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + zone_data: None, + address_context_data: None, + coreference_data: None, + name_corpus_data: None, + date_data: None, + monetary_data: None, + } +} + +fn legal_form_prepared_search(suffixes: Vec<&str>) -> PreparedSearch { + let suffix_strings = suffixes + .iter() + .map(|suffix| (*suffix).to_owned()) + .collect::>(); + let regex_patterns = suffixes + .into_iter() + .map(|suffix| SearchPattern::Literal(suffix.to_owned())) + .collect::>(); + + PreparedSearch::new(PreparedSearchConfig { + regex_patterns, + regex_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + legal_forms: PatternSlice { + start: 0, + end: u32::try_from(suffix_strings.len()).unwrap(), + }, + ..PreparedSearchSlices::default() + }, + legal_form_data: Some(LegalFormData { + suffixes: suffix_strings, + normalized_boundary_suffixes: vec![ + String::from("as"), + String::from("co"), + String::from("inc"), + String::from("ltd"), + String::from("llc"), + String::from("pty"), + String::from("sro"), + ], + normalized_in_name_words: vec![String::from("co")], + normalized_suffix_words: vec![ + String::from("as"), + String::from("co"), + String::from("inc"), + String::from("ltd"), + String::from("llc"), + String::from("pty"), + String::from("sro"), + ], + connector_words: vec![ + String::from("&"), + String::from("a"), + String::from("and"), + ], + and_connector_words: vec![String::from("and")], + in_name_prepositions: vec![String::from("of")], + company_suffix_words: vec![String::from("Company")], + sentence_verb_indicators: vec![ + String::from("include"), + String::from("is"), + String::from("podepsaly"), + ], + ..LegalFormData::default() + }), + ..empty_config(PreparedSearchSlices::default()) + }) + 
.unwrap() +} + +fn address_context_data() -> AddressContextData { + AddressContextData { + address_prepositions: vec![String::from("na"), String::from("mezi")], + temporal_prepositions: vec![String::from("od"), String::from("do")], + street_abbreviations: vec![String::from("ul.")], + bare_house_stopwords: vec![String::from("section")], + } +} + +fn zone_data() -> ZoneData { + ZoneData { + section_heading_patterns: vec![ZonePatternData { + pattern: String::from(r"^\s*(?:Article|Článek)\s*1"), + flags: String::from("iu"), + }], + signing_clauses: vec![ZoneSigningClauseData { + prefix: String::from(r"(?:V|Ve)\s+"), + suffix: String::from(r"\s*,?\s*dne"), + prepositions: vec![String::from("nad")], + }], + } +} + +fn coreference_data() -> CoreferenceData { + CoreferenceData { + definition_patterns: vec![CoreferencePatternData { + pattern: String::from(r#"\((?:hereinafter|the)\s+["']([^"']+)["']\)"#), + flags: String::from("gi"), + }], + role_stop_terms: vec![String::from("seller")], + legal_form_aliases: vec![String::from("LLC")], + organization_suffixes: vec![String::from("LLC")], + organization_determiners: vec![String::from( + r"the\s+(?:company|corporation|firm)", + )], + } +} + +fn legal_form_coreference_prepared_search( + suffixes: Vec<&str>, +) -> PreparedSearch { + let suffix_strings = suffixes + .iter() + .map(|suffix| (*suffix).to_owned()) + .collect::>(); + let regex_patterns = suffixes + .into_iter() + .map(|suffix| SearchPattern::Literal(suffix.to_owned())) + .collect::>(); + + PreparedSearch::new(PreparedSearchConfig { + regex_patterns, + regex_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + legal_forms: PatternSlice { + start: 0, + end: u32::try_from(suffix_strings.len()).unwrap(), + }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("organization")], + legal_form_data: 
Some(LegalFormData { + suffixes: suffix_strings.clone(), + normalized_boundary_suffixes: vec![String::from("llc")], + normalized_suffix_words: vec![String::from("llc")], + company_suffix_words: vec![String::from("Company")], + ..LegalFormData::default() + }), + coreference_data: Some(CoreferenceData { + definition_patterns: vec![CoreferencePatternData { + pattern: String::from(r#"\((?:hereinafter|the)\s+["']([^"']+)["']\)"#), + flags: String::from("gi"), + }], + role_stop_terms: vec![String::from("seller")], + legal_form_aliases: suffix_strings.clone(), + organization_suffixes: suffix_strings, + organization_determiners: vec![String::from( + r"the\s+(?:company|corporation|firm)", + )], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap() +} + +#[test] +fn prepared_search_runs_legal_form_pass_on_normalized_text() { + let prepared = legal_form_prepared_search(vec!["Pty Ltd"]); + let result = prepared + .detect_static_entities("Acme Pty\u{00a0}Ltd signed the agreement.") + .unwrap(); + + assert_eq!(result.legal_form_entities.len(), 1); + assert_eq!(result.legal_form_entities[0].text, "Acme Pty\u{00a0}Ltd"); +} + +#[test] +fn prepared_search_runs_normalized_literal_pass() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![], + custom_regex_patterns: vec![], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Acme Corp"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions::default(), + allowed_labels: vec![], + threshold: 0.0, + confidence_boost: false, + slices: PreparedSearchSlices { + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![], + custom_regex_meta: vec![], + deny_list_data: None, + false_positive_filters: None, + gazetteer_data: Some(GazetteerMatchData { + labels: 
vec![String::from("organization")], + is_fuzzy: vec![false], + }), + country_data: None, + hotword_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + zone_data: None, + address_context_data: None, + coreference_data: None, + name_corpus_data: None, + date_data: None, + monetary_data: None, + }) + .unwrap(); + + let result = prepared + .detect_static_entities("Acme\u{00a0}Corp. signed") + .unwrap(); + + assert_eq!(result.gazetteer_entities.len(), 1); + assert_eq!(result.gazetteer_entities[0].text, "Acme\u{00a0}Corp"); +} + +#[test] +fn prepared_search_adds_slash_house_number_address_context() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bPraha 2\b"))], + regex_meta: vec![RegexMatchMeta::new("address", 1.0)], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Sídlo: Praha 2, Vinohradská 2512/2a", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + entity.label == "address" && entity.text.contains("Vinohradská 2512/2a") + })); +} + +#[test] +fn prepared_search_adds_orphan_header_street_line_context() { + let full_text = format!( + "ACME s.r.o.\nEvropská 710\n160 00 Praha\n{}", + "body ".repeat(200) + ); + let prepared = PreparedSearch::new(PreparedSearchConfig { + custom_regex_patterns: vec![SearchPattern::Regex(String::from( + r"ACME s\.r\.o\.", + ))], + custom_regex_meta: vec![RegexMatchMeta::new("organization", 1.0)], + slices: PreparedSearchSlices { + custom_regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: 
vec![String::from("organization"), String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities_with_diagnostics( + &full_text, + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.result.resolved_entities.iter().any(|entity| { + entity.label == "address" && entity.text == "Evropská 710" + })); + assert!(result.diagnostics.events.iter().any(|event| { + event.stage == DiagnosticStage::EntityAddressContext + && event.kind == DiagnosticEventKind::StageSummary + && event.count == Some(1) + })); +} + +#[test] +fn prepared_search_keeps_address_context_above_threshold() { + let full_text = format!( + "ACME s.r.o.\nEvropská 710\n160 00 Praha\n{}", + "body ".repeat(200) + ); + let prepared = PreparedSearch::new(PreparedSearchConfig { + custom_regex_patterns: vec![SearchPattern::Regex(String::from( + r"ACME s\.r\.o\.", + ))], + custom_regex_meta: vec![RegexMatchMeta::new("organization", 1.0)], + slices: PreparedSearchSlices { + custom_regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.9, + allowed_labels: vec![String::from("organization"), String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + entity.label == "address" + && entity.text == "Evropská 710" + && entity.source_detail.is_none() + })); +} + +#[test] +fn prepared_search_measures_bare_house_context_in_text_offsets() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bPraha 10\b"))], + regex_meta: vec![RegexMatchMeta::new("address", 1.0)], + slices: PreparedSearchSlices { + regex: PatternSlice { 
start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + let full_text = format!("Praha 10 {} Evropská 710.", "á".repeat(40)); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.text == "Evropská 710") + ); +} + +#[test] +fn prepared_search_filters_capitalized_bare_house_stopwords() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bPraha 10\b"))], + regex_meta: vec![RegexMatchMeta::new("address", 1.0)], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Praha 10 Section 183 follows.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + !result + .resolved_entities + .iter() + .any(|entity| entity.text == "Section 183") + ); +} + +#[test] +fn prepared_search_measures_slash_address_context_in_text_offsets() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bPraha 10\b"))], + regex_meta: vec![RegexMatchMeta::new("address", 1.0)], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + let full_text = format!("Praha 10 {} Vinohradská 2512/2a.", 
"á".repeat(145)); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.text == "Vinohradská 2512/2a") + ); +} + +#[test] +fn prepared_search_finds_slash_address_context_after_long_multibyte_prefix() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bPraha 10\b"))], + regex_meta: vec![RegexMatchMeta::new("address", 1.0)], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + let full_text = format!( + "{}\nPraha 10 {} Vinohradská 2512/2a.", + "č".repeat(4_000), + "á".repeat(145) + ); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.text == "Vinohradská 2512/2a") + ); +} + +#[test] +fn prepared_search_ignores_caller_owned_addresses_for_bare_house_context() { + let mut meta = RegexMatchMeta::new("address", 1.0); + meta.source_detail = Some(SourceDetail::CustomRegex); + let prepared = PreparedSearch::new(PreparedSearchConfig { + custom_regex_patterns: vec![SearchPattern::Regex(String::from( + r"\bPraha 2\b", + ))], + custom_regex_meta: vec![meta], + slices: PreparedSearchSlices { + custom_regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Delivery area Praha 2, Evropská 710.", + &OperatorConfig::default(), + ) + 
.unwrap(); + + assert!( + !result + .resolved_entities + .iter() + .any(|entity| entity.text == "Evropská 710") + ); +} + +#[test] +fn prepared_search_measures_header_zone_in_text_offsets() { + let full_text = format!( + "{}\nACME s.r.o.\nEvropská 710\n{}", + "body ".repeat(80), + "é".repeat(2_000) + ); + let prepared = PreparedSearch::new(PreparedSearchConfig { + custom_regex_patterns: vec![SearchPattern::Regex(String::from( + r"ACME s\.r\.o\.", + ))], + custom_regex_meta: vec![RegexMatchMeta::new("organization", 1.0)], + slices: PreparedSearchSlices { + custom_regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("organization"), String::from("address")], + address_context_data: Some(address_context_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!( + !result + .resolved_entities + .iter() + .any(|entity| entity.text == "Evropská 710") + ); +} + +#[test] +fn prepared_search_adds_coreference_aliases_with_source_placeholder() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"Acme Corporation", + ))], + regex_meta: vec![RegexMatchMeta::new("organization", 1.0)], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("organization")], + coreference_data: Some(coreference_data()), + name_corpus_data: None, + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + r#"Acme Corporation (the "Acme") signed. 
Acme later paid."#, + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + entity.source == DetectionSource::Coreference && entity.text == "Acme" + })); + assert_eq!( + result.redaction.redacted_text, + r#"[ORGANIZATION_1] (the "[ORGANIZATION_1]") signed. [ORGANIZATION_1] later paid."#, + ); +} + +#[test] +fn prepared_search_propagates_bare_organization_names() { + let prepared = legal_form_coreference_prepared_search(vec!["LLC"]); + + let result = prepared + .redact_static_entities( + "Acme LLC signed. Acme paid.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + entity.source == DetectionSource::Coreference && entity.text == "Acme" + })); + assert_eq!( + result.redaction.redacted_text, + "[ORGANIZATION_1] signed. [ORGANIZATION_1] paid.", + ); +} + +#[test] +fn prepared_search_extends_propagated_organization_determiners() { + let prepared = legal_form_coreference_prepared_search(vec!["LLC"]); + + let result = prepared + .redact_static_entities( + "Acme LLC signed. The Company Acme paid.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + entity.source == DetectionSource::Coreference + && entity.text == "The Company Acme" + })); + assert_eq!( + result.redaction.redacted_text, + "[ORGANIZATION_1] signed. [ORGANIZATION_1] paid.", + ); +} + +#[test] +fn prepared_search_uses_propagated_orgs_as_defined_term_sources() { + let prepared = legal_form_coreference_prepared_search(vec!["LLC"]); + let full_text = format!( + "Acme LLC signed. {} Acme (the \"Acme Platform\") paid. 
Acme Platform renewed.", + "body ".repeat(50), + ); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + let EntityKind::Coreference { source_text } = &entity.kind else { + return false; + }; + entity.source == DetectionSource::Coreference + && entity.text == "Acme Platform" + && source_text == "Acme" + })); +} + +#[test] +fn prepared_search_does_not_seed_coreference_from_caller_owned_entities() { + let mut meta = RegexMatchMeta::new("organization", 1.0); + meta.source_detail = Some(SourceDetail::CustomRegex); + let prepared = PreparedSearch::new(PreparedSearchConfig { + custom_regex_patterns: vec![SearchPattern::Regex(String::from( + r"Acme Corporation", + ))], + custom_regex_meta: vec![meta], + slices: PreparedSearchSlices { + custom_regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("organization")], + coreference_data: Some(coreference_data()), + name_corpus_data: None, + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + r#"Acme Corporation (the "Acme") signed. 
Acme later paid."#, + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + !result + .resolved_entities + .iter() + .any(|entity| { entity.source == DetectionSource::Coreference }) + ); +} + +#[test] +fn prepared_search_rejects_role_and_legal_form_coreference_aliases() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![ + SearchPattern::Regex(String::from(r"Acme Corporation")), + SearchPattern::Regex(String::from(r"Beta LLC")), + ], + regex_meta: vec![ + RegexMatchMeta::new("organization", 1.0), + RegexMatchMeta::new("organization", 1.0), + ], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 2 }, + ..PreparedSearchSlices::default() + }, + threshold: 0.5, + allowed_labels: vec![String::from("organization")], + coreference_data: Some(coreference_data()), + name_corpus_data: None, + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + r#"Acme Corporation (the "Seller") signed. Seller paid. Beta LLC (the "LLC") joined. 
LLC remained."#, + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + !result + .resolved_entities + .iter() + .any(|entity| { entity.source == DetectionSource::Coreference }) + ); +} + +#[test] +fn prepared_search_artifacts_match_direct_prepare() { + let config = PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bID\d{3}\b"))], + custom_regex_patterns: vec![], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Acme Corp"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions::default(), + allowed_labels: vec![], + threshold: 0.0, + confidence_boost: false, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("identifier", 1.0)], + custom_regex_meta: vec![], + deny_list_data: None, + false_positive_filters: None, + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![false], + }), + country_data: None, + hotword_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + zone_data: None, + address_context_data: None, + coreference_data: None, + name_corpus_data: None, + date_data: None, + monetary_data: None, + }; + let artifacts = PreparedSearch::prepare_artifacts(config.clone()).unwrap(); + assert!( + !artifacts.literals.slots.is_empty(), + "literal index should produce prepared artifacts" + ); + + let direct = PreparedSearch::new(config.clone()).unwrap(); + let prepared = + PreparedSearch::new_with_artifacts(config.clone(), &artifacts).unwrap(); + let text = "Acme\u{00a0}Corp. 
signed ID123"; + + assert_eq!( + prepared.find_matches(text).unwrap(), + direct.find_matches(text).unwrap() + ); + + let mut missing = artifacts; + missing.literals.slots.clear(); + assert!( + PreparedSearch::new_with_artifacts(config, &missing).is_err(), + "missing literal artifacts should fail" + ); +} + +#[test] +fn prepared_search_artifacts_roundtrip_bytes() { + let config = PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bID\d{3}\b"))], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Acme Corp"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("identifier", 1.0)], + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![false], + }), + ..empty_config(PreparedSearchSlices::default()) + }; + let artifacts = PreparedSearch::prepare_artifacts(config.clone()).unwrap(); + let bytes = artifacts.to_bytes().unwrap(); + let decoded = PreparedSearchArtifacts::from_bytes(&bytes).unwrap(); + + assert_eq!(decoded, artifacts); + + let direct = PreparedSearch::new(config.clone()).unwrap(); + let prepared = PreparedSearch::new_with_artifacts(config, &decoded).unwrap(); + assert_eq!( + prepared.find_matches("Acme Corp signed ID123").unwrap(), + direct.find_matches("Acme Corp signed ID123").unwrap() + ); +} + +#[test] +fn prepared_search_artifacts_reject_invalid_bytes() { + let error = PreparedSearchArtifacts::from_bytes(b"not-valid").unwrap_err(); + + assert!( + matches!(error, Error::InvalidStaticData { .. 
}), + "invalid prepared-search artifacts should fail at the format boundary" + ); +} + +#[test] +fn prepared_search_emits_static_detector_entities() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\b[A-Z]{2}\d{4}\b", + ))], + custom_regex_patterns: vec![SearchPattern::Regex(String::from( + r"\bMAT-\d{3}\b", + ))], + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("Acme"), + case_insensitive: Some(true), + whole_words: Some(false), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Turkey"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + regex_options: SearchOptions { + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, + ..SearchOptions::default() + }, + custom_regex_options: SearchOptions { + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, + ..SearchOptions::default() + }, + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + fuzzy: FuzzySearchOptions::default(), + ..SearchOptions::default() + }, + allowed_labels: vec![], + threshold: 0.0, + confidence_boost: false, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + custom_regex: PatternSlice { start: 0, end: 1 }, + gazetteer: PatternSlice { start: 0, end: 1 }, + countries: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], + custom_regex_meta: vec![RegexMatchMeta { + label: String::from("matter id"), + score: 1.0, + source_detail: Some(SourceDetail::CustomRegex), + requires_validation: false, + validator_id: None, + validator_input: None, + min_byte_length: None, + }], + deny_list_data: None, + false_positive_filters: None, + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + 
is_fuzzy: vec![false], + }), + country_data: Some(CountryMatchData { + labels: vec![String::from("country")], + }), + hotword_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + zone_data: None, + address_context_data: None, + coreference_data: None, + name_corpus_data: None, + date_data: None, + monetary_data: None, + }) + .unwrap(); + + let result = prepared + .detect_static_entities("Acme s.r.o. filed AB1234 in Turkey under MAT-123") + .unwrap(); + + assert_eq!(result.regex_entities[0].label, "registration number"); + assert_eq!(result.custom_regex_entities[0].label, "matter id"); + assert_eq!( + result.custom_regex_entities[0].source_detail, + Some(SourceDetail::CustomRegex) + ); + assert_eq!(result.gazetteer_entities[0].text, "Acme s.r.o."); + assert_eq!(result.country_entities[0].source, DetectionSource::Country); +} + +#[test] +fn prepared_search_extends_gazetteer_suffix_in_text_offsets() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Acme"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![false], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities("Acme spółka signed.", &OperatorConfig::default()) + .unwrap(); + + assert!(result.resolved_entities.iter().any(|entity| { + entity.label == "organization" && entity.text == "Acme spółka" + })); + assert_eq!(result.redaction.redacted_text, "[ORGANIZATION_1] signed."); +} + +#[test] +fn 
prepared_search_preserves_overlapping_custom_regex_matches() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + custom_regex_patterns: vec![ + SearchPattern::Regex(String::from("Alice")), + SearchPattern::Regex(String::from("Alice Smith")), + ], + custom_regex_options: SearchOptions { + regex: RegexSearchOptions { + whole_words: false, + overlap_all: true, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + custom_regex: PatternSlice { start: 0, end: 2 }, + ..PreparedSearchSlices::default() + }, + custom_regex_meta: vec![ + RegexMatchMeta { + label: String::from("person"), + score: 1.0, + source_detail: Some(SourceDetail::CustomRegex), + requires_validation: false, + validator_id: None, + validator_input: None, + min_byte_length: None, + }, + RegexMatchMeta { + label: String::from("person"), + score: 1.0, + source_detail: Some(SourceDetail::CustomRegex), + requires_validation: false, + validator_id: None, + validator_input: None, + min_byte_length: None, + }, + ], + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities("Alice Smith signed.") + .unwrap(); + let custom_texts = result + .custom_regex_entities + .iter() + .map(|entity| entity.text.as_str()) + .collect::>(); + + assert_eq!(custom_texts, ["Alice", "Alice Smith"]); +} + +#[test] +fn prepared_search_drops_person_spans_ending_in_trailing_noun() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\bCOBRA Reimbursement Period\b", + ))], + regex_options: SearchOptions { + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("person", 0.9)], + deny_list_data: Some(DenyListMatchData { + labels: Vec::new().into(), + custom_labels: 
Vec::new().into(), + originals: Vec::new(), + sources: Vec::new().into(), + filters: Some(DenyListFilterData { + person_trailing_nouns: BTreeSet::from([String::from("period")]), + ..DenyListFilterData::default() + }), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Payments continue during the COBRA Reimbursement Period.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.resolved_entities.is_empty()); +} + +#[test] +fn prepared_search_extracts_dates_from_anchored_data() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + date_data: Some(DateData { + month_names_by_language: BTreeMap::from([ + ( + String::from("en"), + vec![ + String::from("January"), + String::from("March"), + String::from("December"), + ], + ), + ( + String::from("cs"), + vec![String::from("ledna"), String::from("únor")], + ), + ]), + year_words_by_language: BTreeMap::from([( + String::from("cs"), + vec![String::from("roce")], + )]), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities( + "Signed 7 January 2025, renewed March 9, 2026, effective 2026. únor 3., filed 1.ledna 2026 and signed 1. ledna 2026. Ends December 31, \n\n2025. Výpis v roce 2026.", + ) + .unwrap(); + let entities = result + .anchored_entities + .iter() + .map(|entity| (entity.text.as_str(), entity.label.as_str(), entity.source)) + .collect::>(); + + assert_eq!( + entities, + [ + ("7 January 2025", "date", DetectionSource::Regex), + ("March 9, 2026", "date", DetectionSource::Regex), + ("2026. únor 3.", "date", DetectionSource::Regex), + ("ledna 2026", "date", DetectionSource::Regex), + ("1. 
ledna 2026", "date", DetectionSource::Regex), + ("December 31, \n\n2025", "date", DetectionSource::Regex), + ("2026", "date", DetectionSource::Trigger), + ], + ); +} + +#[test] +fn prepared_search_extracts_uppercase_ordinal_dates() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + date_data: Some(DateData { + month_names_by_language: BTreeMap::from([( + String::from("en"), + vec![String::from("January")], + )]), + year_words_by_language: BTreeMap::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities("Filed on 1ST January 2025.") + .unwrap(); + + assert!( + result + .anchored_entities + .iter() + .any(|entity| entity.text == "1ST January 2025") + ); +} + +#[test] +fn prepared_search_extracts_written_date_of_birth_trigger() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("geboren am"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("geboren am"), + label: String::from("date of birth"), + strategy: TriggerStrategy::NWords { count: 3 }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Herr Müller, geboren am 21. März 1968, ist Geschäftsführer.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "date of birth" + && entity.text == "21. 
März 1968") + ); +} + +#[test] +fn prepared_search_honors_single_word_written_date_trigger_count() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("geboren am"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("geboren am"), + label: String::from("date of birth"), + strategy: TriggerStrategy::NWords { count: 1 }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Herr Müller, geboren am 21. 
März 1968, ist Geschäftsführer.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "date of birth" && entity.text == "21.") + ); +} + +#[test] +fn prepared_search_extracts_year_after_duplicate_year_word_noise() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: ["rok", "an", "roce"] + .into_iter() + .map(|pattern| SearchPattern::LiteralWithOptions { + pattern: String::from(pattern), + case_insensitive: Some(true), + whole_words: Some(false), + }) + .collect(), + regex_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 3 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: ["rok", "an", "roce"] + .into_iter() + .map(|trigger| TriggerRule { + trigger: String::from(trigger), + label: String::from("date"), + strategy: TriggerStrategy::NWords { count: 1 }, + validations: vec![TriggerValidation::MatchesPattern { + pattern: String::from(r"^(?:19|20)\d{2}\.?$"), + flags: None, + }], + include_trigger: false, + }) + .collect(), + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let text = "účetní uzávěrku za roky 2019, 2020, 2021, 2022, 2023 a 2024, výpis z valné hromady konané v\u{00a0}roce 2026 a to nejpozději"; + let result = prepared.detect_static_entities(text).unwrap(); + + assert!( + result + .trigger_entities + .iter() + .any(|entity| entity.label == "date" && entity.text == "2026") + ); +} + +#[test] +fn prepared_search_trigger_caps_by_characters_not_bytes() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: 
vec![SearchPattern::LiteralWithOptions { + pattern: String::from("ve výši"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("ve výši"), + label: String::from("monetary amount"), + strategy: TriggerStrategy::ToNextComma { + stop_words: Vec::new(), + max_length: None, + }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let expected = "0,2 % z Ceny Plnění dle příslušné Dílčí smlouvy za každý i započatý kalendářní den prodlení"; + let result = prepared + .detect_static_entities(&format!("Smluvní pokuta ve výši {expected}.")) + .unwrap(); + + assert!( + result.trigger_entities.iter().any(|entity| entity.label + == "monetary amount" + && entity.text == expected) + ); +} + +#[test] +fn prepared_search_trigger_validations_count_characters_not_bytes() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("jméno"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("jméno"), + label: String::from("person"), + strategy: TriggerStrategy::NWords { count: 1 }, + validations: vec![ + TriggerValidation::MinLength(5), + TriggerValidation::MaxLength(5), + ], + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + 
legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities("Smluvní jméno Áběčď bylo ověřeno.") + .unwrap(); + + assert!( + result + .trigger_entities + .iter() + .any(|entity| entity.label == "person" && entity.text == "Áběčď") + ); +} + +#[test] +fn prepared_search_rejects_lowercase_acronym_trigger_collisions() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("dni"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("DNI"), + label: String::from("national identification number"), + strategy: TriggerStrategy::CompanyIdValue, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let lower = prepared + .detect_static_entities("Cena je stanovena ke dni 6.11.2025.") + .unwrap(); + assert!(lower.trigger_entities.is_empty()); + + let upper = prepared + .detect_static_entities("Documento DNI 12345678Z.") + .unwrap(); + assert!( + upper + .trigger_entities + .iter() + .any(|entity| entity.text == "12345678Z" + && entity.label == "national identification number") + ); +} + +#[test] +fn prepared_search_trims_party_position_before_triggered_address() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("sídlo"), + case_insensitive: 
Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("sídlo"), + label: String::from("address"), + strategy: TriggerStrategy::Address { + max_chars: Some(120), + }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: vec![String::from("prodávajícího")], + legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities( + "Místem předání je sídlo prodávajícího Na Květnici 1657/16, 140 00 Praha 4.", + ) + .unwrap(); + + assert!( + result + .trigger_entities + .iter() + .any(|entity| entity.label == "address" + && entity.text == "Na Květnici 1657/16, 140 00 Praha 4") + ); +} + +#[test] +fn prepared_search_extracts_money_from_anchored_data() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + monetary_data: Some(MonetaryData { + currencies: CurrencyData { + codes: vec![String::from("USD"), String::from("EUR")], + symbols: vec![String::from("$")], + local_names: vec![String::from("Kč"), String::from("korun českých")], + }, + amount_words: AmountWordsData { + written_amount_patterns: vec![], + magnitude_suffixes: vec![MagnitudeSuffixData { + words: vec![String::from("million")], + abbreviations_case_insensitive: vec![], + abbreviations_case_sensitive: vec![], + }], + share_quantity_terms: vec![], + }, + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities( + "Fees are USD 1,250,000.00, $450,000, 25 million EUR and 275 000 Kč.", + ) + .unwrap(); + let entities = result + .anchored_entities + .iter() + .map(|entity| entity.text.as_str()) + .collect::>(); + + 
assert_eq!( + entities, + [ + "USD 1,250,000.00", + "$450,000", + "25 million EUR", + "275 000 Kč", + ], + ); +} + +#[test] +fn prepared_search_rejects_long_ungrouped_money_numbers() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + monetary_data: Some(MonetaryData { + currencies: CurrencyData { + codes: vec![String::from("USD")], + symbols: vec![String::from("$")], + local_names: vec![], + }, + amount_words: AmountWordsData { + written_amount_patterns: vec![], + magnitude_suffixes: vec![], + share_quantity_terms: vec![], + }, + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities( + "Reject USD 123456789012345 and $123456789012345. Keep USD 123456789, $123456789.00 and USD 1,234,567,890.", + ) + .unwrap(); + let entities = result + .anchored_entities + .iter() + .map(|entity| entity.text.as_str()) + .collect::>(); + + assert!(!entities.contains(&"USD 123456789012345")); + assert!(!entities.contains(&"$123456789012345")); + assert!(entities.contains(&"USD 123456789")); + assert!(entities.contains(&"$123456789.00")); + assert!(entities.contains(&"USD 1,234,567,890")); +} + +#[test] +fn prepared_search_extends_money_to_written_amount_parenthetical() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + monetary_data: Some(MonetaryData { + currencies: CurrencyData { + codes: vec![], + symbols: vec![], + local_names: vec![String::from("Kč")], + }, + amount_words: AmountWordsData { + written_amount_patterns: vec![WrittenAmountPatternData { + keywords: vec![String::from("slovy")], + }], + magnitude_suffixes: vec![], + share_quantity_terms: vec![], + }, + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .detect_static_entities( + "Smluvní pokuta je 50.000,- Kč (slovy: padesát tisíc korun českých).", + ) + .unwrap(); + let entities = result + .anchored_entities + .iter() + .map(|entity| entity.text.as_str()) + .collect::>(); + + 
assert_eq!( + entities, + ["50.000,- Kč (slovy: padesát tisíc korun českých)"], + ); +} + +#[test] +fn prepared_search_redacts_static_entities_end_to_end() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\b[A-Z]{2}\d{4}\b", + ))], + custom_regex_patterns: vec![], + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("Acme"), + case_insensitive: Some(true), + whole_words: Some(false), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Turkey"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + regex_options: SearchOptions { + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, + ..SearchOptions::default() + }, + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + allowed_labels: vec![], + threshold: 0.0, + confidence_boost: false, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + gazetteer: PatternSlice { start: 0, end: 1 }, + countries: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], + custom_regex_meta: vec![], + deny_list_data: None, + false_positive_filters: None, + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![false], + }), + country_data: Some(CountryMatchData { + labels: vec![String::from("country")], + }), + hotword_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + zone_data: None, + address_context_data: None, + coreference_data: None, + name_corpus_data: None, + date_data: None, + monetary_data: None, + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Acme s.r.o. 
filed AB1234 in Turkey.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!( + result.redaction.redacted_text, + "[ORGANIZATION_1] filed [REGISTRATION_NUMBER_1] in [COUNTRY_1]." + ); + assert_eq!(result.redaction.entity_count, 3); + assert_eq!(result.resolved_entities.len(), 3); +} + +#[test] +fn prepared_search_applies_threshold_before_merge() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![ + SearchPattern::Regex(String::from("Acme")), + SearchPattern::Regex(String::from(r"Acme s\.r\.o\.")), + ], + regex_options: SearchOptions { + regex: RegexSearchOptions { + whole_words: false, + overlap_all: true, + }, + ..SearchOptions::default() + }, + threshold: 0.5, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 2 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![ + RegexMatchMeta::new("organization", 0.9), + RegexMatchMeta::new("organization", 0.4), + ], + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities("Acme s.r.o. signed.", &OperatorConfig::default()) + .unwrap(); + + assert_eq!( + result.redaction.redacted_text, + "[ORGANIZATION_1] s.r.o. signed." 
+ ); + assert_eq!(result.resolved_entities.len(), 1); + assert_eq!(result.resolved_entities[0].text, "Acme"); +} + +#[test] +fn prepared_search_applies_header_zone_boost_before_threshold() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from("Alice"))], + regex_meta: vec![RegexMatchMeta::new("person", 0.45)], + threshold: 0.5, + allowed_labels: vec![String::from("person")], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + zone_data: Some(zone_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities_with_diagnostics( + "Parties\nAlice\nArticle 1\nBody", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!(result.result.resolved_entities.len(), 1); + assert_eq!(result.result.resolved_entities[0].text, "Alice"); + assert!((result.result.resolved_entities[0].score - 0.55).abs() < 1e-12); + assert!(result.diagnostics.events.iter().any(|event| { + event.stage == DiagnosticStage::EntityZoneAdjustment + && event.kind == DiagnosticEventKind::StageSummary + && event.count == Some(1) + })); +} + +#[test] +fn prepared_search_applies_table_zone_boost_before_threshold() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from("Alice"))], + regex_meta: vec![RegexMatchMeta::new("person", 0.46)], + threshold: 0.5, + allowed_labels: vec![String::from("person")], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + zone_data: Some(zone_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Article 1\nName\tAddress\tId\nAlice\tPrague\t123", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!(result.resolved_entities.len(), 1); + 
assert!((result.resolved_entities[0].score - 0.51).abs() < 1e-12); +} + +#[test] +fn prepared_search_applies_signature_zone_boost_before_threshold() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from("Alice"))], + regex_meta: vec![RegexMatchMeta::new("person", 0.36)], + threshold: 0.5, + allowed_labels: vec![String::from("person")], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + zone_data: Some(zone_data()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Article 1\nBody\nV Praze dne 1.1.2024\nAlice", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!(result.resolved_entities.len(), 1); + assert!((result.resolved_entities[0].score - 0.51).abs() < 1e-12); +} + +#[test] +fn prepared_search_boosts_near_miss_entities_when_enabled() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![ + SearchPattern::Regex(String::from(r"\bANCHOR-\d+\b")), + SearchPattern::Regex(String::from(r"\bNEAR-\d+\b")), + ], + threshold: 0.5, + confidence_boost: true, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 2 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![ + RegexMatchMeta::new("registration number", 0.95), + RegexMatchMeta::new("matter id", 0.45), + ], + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "ANCHOR-123 signed with NEAR-456.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!(result.resolved_entities.len(), 2); + assert_eq!(result.resolved_entities[0].text, "ANCHOR-123"); + assert_eq!(result.resolved_entities[1].text, "NEAR-456"); + assert!((result.resolved_entities[1].score - 0.5).abs() < f64::EPSILON); + assert_eq!( + result.redaction.redacted_text, + "[REGISTRATION_NUMBER_1] signed with 
[MATTER_ID_1]." + ); +} + +#[test] +fn prepared_search_boost_counts_text_offsets_not_bytes() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![ + SearchPattern::Regex(String::from(r"\bANCHOR-\d+\b")), + SearchPattern::Regex(String::from(r"\bNEAR-\d+\b")), + ], + threshold: 0.5, + confidence_boost: true, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 2 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![ + RegexMatchMeta::new("registration number", 0.95), + RegexMatchMeta::new("matter id", 0.45), + ], + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + let full_text = format!("ANCHOR-123 {} NEAR-456.", "á".repeat(120)); + + let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert_eq!(result.resolved_entities.len(), 2); + assert_eq!(result.resolved_entities[0].text, "ANCHOR-123"); + assert_eq!(result.resolved_entities[1].text, "NEAR-456"); + assert!((result.resolved_entities[1].score - 0.5).abs() < f64::EPSILON); +} + +#[test] +fn prepared_search_hotword_distance_uses_utf16_offsets() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\b\d{2}\.\d{2}\.\d{4}\b", + ))], + allowed_labels: vec![String::from("date of birth")], + threshold: 0.8, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("date", 0.7)], + hotword_data: Some(HotwordRuleData { + rules: vec![HotwordRule { + hotwords: vec![String::from("born")], + target_labels: vec![String::from("date")], + score_adjustment: 1.0, + reclassify_to: Some(String::from("date of birth")), + proximity_before: 40, + proximity_after: 40, + }], + pattern_rule_indices: vec![], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + let full_text = format!("born {} 12.03.1990", "😀".repeat(30)); + 
+ let result = prepared + .redact_static_entities(&full_text, &OperatorConfig::default()) + .unwrap(); + + assert!(result.resolved_entities.is_empty()); +} + +#[test] +fn prepared_search_hotword_searches_original_text() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\b\d{2}\.\d{2}\.\d{4}\b", + ))], + allowed_labels: vec![String::from("date")], + threshold: 0.96, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("date", 0.95)], + hotword_data: Some(HotwordRuleData { + rules: vec![HotwordRule { + hotwords: vec![String::from("tax ID")], + target_labels: vec![String::from("date")], + score_adjustment: 0.1, + reclassify_to: None, + proximity_before: 60, + proximity_after: 60, + }], + pattern_rule_indices: vec![], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "tax\u{00a0}ID 12.03.1990", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!(result.resolved_entities.is_empty()); +} + +#[test] +fn prepared_search_rejects_legacy_hotword_slice() { + let result = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("born"))], + slices: PreparedSearchSlices { + hotwords: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + hotword_data: Some(HotwordRuleData { + rules: vec![HotwordRule { + hotwords: vec![String::from("born")], + target_labels: vec![String::from("date")], + score_adjustment: 0.1, + reclassify_to: None, + proximity_before: 60, + proximity_after: 60, + }], + pattern_rule_indices: vec![0], + }), + ..empty_config(PreparedSearchSlices::default()) + }); + + assert!(matches!( + result, + Err(Error::UnsupportedStaticSlice { slice: "hotwords" }) + )); +} + +#[test] +fn 
prepared_search_applies_hotword_reclassification_before_threshold() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\b\d{2}\.\d{2}\.\d{4}\b", + ))], + allowed_labels: vec![String::from("date of birth")], + threshold: 0.8, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("date", 0.7)], + hotword_data: Some(HotwordRuleData { + rules: vec![HotwordRule { + hotwords: vec![String::from("narozen")], + target_labels: vec![String::from("date")], + score_adjustment: 0.15, + reclassify_to: Some(String::from("date of birth")), + proximity_before: 60, + proximity_after: 60, + }], + pattern_rule_indices: vec![], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "narozen dne 12.03.1990 v Praze", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!(result.resolved_entities.len(), 1); + assert_eq!(result.resolved_entities[0].label, "date of birth"); + assert_eq!(result.resolved_entities[0].text, "12.03.1990"); + assert_eq!( + result.redaction.redacted_text, + "narozen dne [DATE_OF_BIRTH_1] v Praze" + ); +} + +#[test] +fn prepared_search_applies_allowed_labels_before_redaction() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from("Alice"))], + allowed_labels: vec![String::from("date")], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("person", 1.0)], + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities("Alice signed.", &OperatorConfig::default()) + .unwrap(); + + assert_eq!(result.redaction.redacted_text, "Alice signed."); + assert!(result.resolved_entities.is_empty()); +} + 
+#[test] +fn prepared_search_keeps_person_name_particles_after_trigger() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Pan"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("Pan"), + label: String::from("person"), + strategy: TriggerStrategy::ToEndOfLine, + validations: vec![TriggerValidation::StartsUppercase], + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let with_apostrophe = prepared + .detect_static_entities("Pan Jean d'Arc přijel pozdě.") + .unwrap(); + assert!( + with_apostrophe + .trigger_entities + .iter() + .any(|entity| entity.text == "Jean d'Arc") + ); + + let with_particle = prepared + .detect_static_entities("Pan João dos Santos přijel pozdě.") + .unwrap(); + assert!( + with_particle + .trigger_entities + .iter() + .any(|entity| entity.text == "João dos Santos") + ); + + let trailing_particle = prepared + .detect_static_entities("Pan Novák von tady odešel.") + .unwrap(); + assert!( + trailing_particle + .trigger_entities + .iter() + .any(|entity| entity.text == "Novák") + ); + assert!( + trailing_particle + .trigger_entities + .iter() + .all(|entity| !entity.text.contains("von")) + ); +} + +#[test] +fn prepared_search_reports_static_redaction_diagnostics() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"\b[A-Z]{2}\d{4}\b", + ))], + custom_regex_patterns: vec![], + literal_patterns: 
vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Acme"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + regex_options: SearchOptions { + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, + ..SearchOptions::default() + }, + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + allowed_labels: vec![], + threshold: 0.0, + confidence_boost: false, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], + custom_regex_meta: vec![], + deny_list_data: None, + false_positive_filters: None, + gazetteer_data: Some(GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![false], + }), + country_data: None, + hotword_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + zone_data: None, + address_context_data: None, + coreference_data: None, + name_corpus_data: None, + date_data: None, + monetary_data: None, + }) + .unwrap(); + + let result = prepared + .redact_static_entities_with_diagnostics( + "Acme s.r.o. filed AB1234.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!( + result.result.redaction.redacted_text, + "[ORGANIZATION_1] filed [REGISTRATION_NUMBER_1]." 
+ ); + assert!(result.diagnostics.events.iter().any(|event| { + event.stage == DiagnosticStage::SearchRegex + && event.kind == DiagnosticEventKind::StageSummary + && event.count == Some(1) + })); + assert!(result.diagnostics.events.iter().any(|event| { + event.stage == DiagnosticStage::Sanitize + && event.kind == DiagnosticEventKind::Entity + && event.label.as_deref() == Some("organization") + && event.span_valid == Some(true) + })); + assert!( + result + .diagnostics + .events + .iter() + .all(|event| event.text.is_none()) + ); + assert!(result.diagnostics.events.iter().any(|event| { + event.stage == DiagnosticStage::Redaction + && event.kind == DiagnosticEventKind::StageSummary + && event.count == Some(2) + })); +} + +#[test] +fn prepared_search_redacts_custom_deny_list_entities() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![], + custom_regex_patterns: vec![], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Secret Code"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + allowed_labels: vec![], + threshold: 0.0, + confidence_boost: false, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + regex_meta: vec![], + custom_regex_meta: vec![], + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("matter")]].into(), + custom_labels: vec![vec![String::from("matter")]].into(), + originals: vec![String::from("Secret Code")], + sources: vec![vec![String::from("custom-deny-list")]].into(), + filters: None, + }), + false_positive_filters: None, + gazetteer_data: None, + country_data: None, + hotword_data: None, + trigger_data: None, + 
legal_form_data: None, + address_seed_data: None, + zone_data: None, + address_context_data: None, + coreference_data: None, + name_corpus_data: None, + date_data: None, + monetary_data: None, + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Secret Code was disclosed.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert_eq!(result.detections.deny_list_entities.len(), 1); + assert_eq!(result.redaction.redacted_text, "[MATTER_1] was disclosed."); + assert_eq!(result.redaction.entity_count, 1); +} + +#[test] +fn prepared_search_rejects_unsupported_static_slices() { + let unsupported = PatternSlice { start: 0, end: 1 }; + let error = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("Secret"))], + ..empty_config(PreparedSearchSlices { + deny_list: unsupported, + ..PreparedSearchSlices::default() + }) + }) + .err() + .expect("unsupported slice should be rejected"); + + assert_eq!(error, Error::UnsupportedStaticSlice { slice: "deny_list" }); +} + +#[test] +fn prepared_search_requires_gazetteer_metadata_for_gazetteer_slice() { + let error = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("Acme"))], + ..empty_config(PreparedSearchSlices { + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }) + }) + .err() + .expect("gazetteer slice should require metadata"); + + assert_eq!( + error, + Error::MissingStaticData { + field: "gazetteer_data" + } + ); +} + +#[test] +fn prepared_search_rejects_truncated_country_metadata() { + let error = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("Turkey"))], + country_data: Some(CountryMatchData { labels: Vec::new() }), + ..empty_config(PreparedSearchSlices { + countries: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }) + }) + .err() + .expect("truncated country metadata should be 
rejected"); + + assert_eq!( + error, + Error::StaticDataLengthMismatch { + field: "country_data.labels", + expected: 1, + actual: 0 + } + ); +} + +#[test] +fn prepared_search_rejects_missing_regex_metadata() { + let error = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from(r"\bID\d+\b"))], + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + ..empty_config(PreparedSearchSlices::default()) + }) + .err() + .expect("regex slice should require parallel metadata"); + + assert_eq!( + error, + Error::StaticDataLengthMismatch { + field: "regex_meta", + expected: 1, + actual: 0 + } + ); +} + +#[test] +fn prepared_search_rejects_literal_slices_outside_patterns() { + let error = PreparedSearch::new(empty_config(PreparedSearchSlices { + gazetteer: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + })) + .err() + .expect("slice outside the literal pattern table should be rejected"); + + assert!( + matches!( + error, + Error::InvalidStaticData { + field: "slices.gazetteer", + .. 
+ } + ), + "unexpected error: {error}" + ); +} + +#[test] +fn prepared_search_requires_address_seed_data_for_street_types() { + let error = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("Street"))], + ..empty_config(PreparedSearchSlices { + street_types: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }) + }) + .err() + .expect("street types should require address seed data"); + + assert_eq!( + error, + Error::MissingStaticData { + field: "address_seed_data" + } + ); +} + +#[test] +fn prepared_search_expands_address_seeds_from_street_type_slice() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("Boston"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Street"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + street_types: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Boston")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Send notices to 100 Main Street, Boston, MA 02101-1234.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "address" + && 
entity.text == "100 Main Street, Boston, MA 02101-1234") + ); +} + +#[test] +fn prepared_search_expands_address_seeds_from_city_and_postal_code() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Brno"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Brno")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Sídlo společnosti je Kamínky 302/16, Brno 634 00.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "address" + && entity.text == "Kamínky 302/16, Brno 634 00") + ); +} + +#[test] +fn prepared_search_expands_compound_german_street_addresses() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + /* Deny-list city literal; same city as this test's `originals` entry and input text. */ + pattern: String::from("Düsseldorf"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: 
vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Düsseldorf")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "wohnhaft Schadowstraße 11, 40212 Düsseldorf.", + &OperatorConfig::default(), + ) + .unwrap(); + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "address" + && entity.text == "Schadowstraße 11, 40212 Düsseldorf") + ); +} + +#[test] +fn prepared_search_expands_plain_postal_city_addresses() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("geboren am"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + literal_patterns: vec![SearchPattern::LiteralWithOptions { + /* Deny-list city literal; same city as this test's `originals` entry and input text. */ + pattern: String::from("Wiesbaden"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: String::from("geboren am"), + label: String::from("date of birth"), + strategy: TriggerStrategy::NWords { count: 3 }, + validations: Vec::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + post_nominals: Vec::new(), + sentence_terminal_currency_terms: Vec::new(), + }), + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + 
custom_labels: vec![vec![]].into(), + originals: vec![String::from("Wiesbaden")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "(2) Frau Karoline M. Brentano,\n geboren am 09. Juli 1982,\n wohnhaft Bismarckring 18, 65183 Wiesbaden,\n Steuer-ID: 78 123 456 789", + &OperatorConfig::default(), + ) + .unwrap(); + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "address" + && entity.text == "Bismarckring 18, 65183 Wiesbaden") + ); +} + +#[test] +fn prepared_search_stops_address_before_notice_copy_instruction() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("Wilmington"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("Street"), + case_insensitive: Some(true), + whole_words: Some(true), + }, + ], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + street_types: PatternSlice { start: 1, end: 2 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Wilmington")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData { + boundary_words: vec![String::from("with a copy")], + br_cep_cue_words: Vec::new(), + unit_abbreviations: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + 
.unwrap(); + + let result = prepared + .redact_static_entities( + "1209 Orange Street, Wilmington, DE 19801; with a copy to general counsel.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "address" + && entity.text == "1209 Orange Street, Wilmington, DE 19801") + ); + assert!( + result + .resolved_entities + .iter() + .all(|entity| !entity.text.contains("with a copy")) + ); +} + +#[test] +fn prepared_search_splits_address_seed_clusters_at_paragraph_breaks() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Brno"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Brno")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData::default()), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Kamínky 5, Brno 634 00\n\nIČ: 48511229\n\nKamínky 302/16, Brno 634 00.", + &OperatorConfig::default(), + ) + .unwrap(); + + assert!( + result + .resolved_entities + .iter() + .any(|entity| entity.label == "address" + && entity.text == "Kamínky 302/16, Brno 634 00") + ); +} + +#[test] +fn prepared_search_stops_address_seed_expansion_at_legal_prose() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: 
String::from("Liberec"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Liberec")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData { + boundary_words: vec![String::from("pokud")], + br_cep_cue_words: Vec::new(), + unit_abbreviations: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Fakturu zašlete na Náspu 5, 460 01 Liberec, pokud nebude dohodnuto jinak. 
Přílohou bude seznam.", + &OperatorConfig::default(), + ) + .unwrap(); + + /* Texts of the address-labelled entities only, for the assertions below. */ + let addresses = result + .resolved_entities + .iter() + .filter(|entity| entity.label == "address") + .map(|entity| entity.text.as_str()) + .collect::<Vec<_>>(); + assert!(addresses.contains(&"Náspu 5, 460 01 Liberec")); + assert!(!addresses.iter().any(|text| text.contains("pokud"))); + assert!(!addresses.iter().any(|text| text.contains("Přílohou"))); +} + +#[test] +fn prepared_search_does_not_cluster_address_seed_inside_register_span() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::Regex(String::from( + r"Handelsregister des Amtsgerichts Düsseldorf unter HRB \d+", + ))], + regex_meta: vec![RegexMatchMeta::new("registration number", 0.9)], + regex_options: SearchOptions { + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, + ..SearchOptions::default() + }, + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Düsseldorf"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + slices: PreparedSearchSlices { + regex: PatternSlice { start: 0, end: 1 }, + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Düsseldorf")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + address_seed_data: Some(AddressSeedData { + boundary_words: vec![String::from("eingetragen")], + br_cep_cue_words: Vec::new(), + unit_abbreviations: Vec::new(), + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .unwrap(); + + let result = prepared + .redact_static_entities( + "Sitz: 
Königsallee 27, 40212 Düsseldorf,\n eingetragen im Handelsregister des Amtsgerichts Düsseldorf unter HRB 78219.", + &OperatorConfig::default(), + ) + .unwrap(); + + let addresses = result + .resolved_entities + .iter() + .filter(|entity| entity.label == "address") + .map(|entity| entity.text.as_str()) + .collect::>(); + assert!(addresses.contains(&"Königsallee 27, 40212 Düsseldorf")); + assert!(!addresses.iter().any(|text| text.contains("Sitz:"))); + assert!( + !addresses + .iter() + .any(|text| text.contains("Handelsregister")) + ); +} + +#[test] +fn prepared_search_redacts_curated_deny_list_entities() { + let prepared = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: String::from("Prague"), + case_insensitive: Some(true), + whole_words: Some(true), + }], + literal_options: SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }, + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Prague")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }), + ..empty_config(PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }) + }) + .unwrap(); + + let result = prepared + .redact_static_entities("Prague filed.", &OperatorConfig::default()) + .unwrap(); + + assert_eq!(result.redaction.redacted_text, "[ADDRESS_1] filed."); +} + +#[test] +fn prepared_search_rejects_curated_deny_list_without_filters() { + let error = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("Prague"))], + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Prague")], + 
sources: vec![vec![String::from("city")]].into(), + filters: None, + }), + ..empty_config(PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }) + }) + .err() + .expect("curated deny-list source should be rejected"); + + assert_eq!( + error, + Error::MissingStaticData { + field: "deny_list.filters" + } + ); +} + +#[test] +fn prepared_search_rejects_truncated_deny_list_data() { + let error = PreparedSearch::new(PreparedSearchConfig { + literal_patterns: vec![SearchPattern::Literal(String::from("Secret Code"))], + deny_list_data: Some(DenyListMatchData { + labels: vec![vec![String::from("matter")]].into(), + custom_labels: vec![].into(), + originals: vec![String::from("Secret Code")], + sources: vec![vec![String::from("custom-deny-list")]].into(), + filters: None, + }), + ..empty_config(PreparedSearchSlices { + deny_list: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }) + }) + .err() + .expect("truncated deny-list data should be rejected"); + + assert_eq!( + error, + Error::StaticDataLengthMismatch { + field: "deny_list.custom_labels", + expected: 1, + actual: 0 + } + ); +} + +#[test] +fn prepared_search_detects_non_english_legal_form_entities() { + let prepared = legal_form_prepared_search(vec!["a.s.", "a. s."]); + + let result = prepared + .detect_static_entities("Smlouvu podepsaly Pražské služby, a.s. 
dnes.") + .unwrap(); + + assert_eq!(result.legal_form_entities.len(), 1); + assert_eq!(result.legal_form_entities[0].text, "Pražské služby, a.s."); + assert_eq!( + result.legal_form_entities[0].source, + DetectionSource::LegalForm + ); +} + +#[test] +fn prepared_search_keeps_indented_line_wrapped_legal_form_suffix() { + let prepared = legal_form_prepared_search(vec!["Co.", "LLC"]); + + let result = prepared + .detect_static_entities( + "The underwriter is Goldman Sachs & Co.\n LLC, joint book-runner.", + ) + .unwrap(); + + assert_eq!(result.legal_form_entities.len(), 1); + assert_eq!( + result.legal_form_entities[0].text, + "Goldman Sachs & Co.\n LLC" + ); +} + +#[test] +fn prepared_search_splits_embedded_legal_form_lists() { + let prepared = legal_form_prepared_search(vec!["LLC", "Inc."]); + + let result = prepared + .detect_static_entities( + "The parties include Acme LLC, Beta Inc. and others.", + ) + .unwrap(); + /* Detected legal-form entity texts, in the order the detector returned them. */ + let texts = result + .legal_form_entities + .iter() + .map(|entity| entity.text.as_str()) + .collect::<Vec<_>>(); + + assert_eq!(texts, vec!["Acme LLC", "Beta Inc."]); +} + +#[test] +fn prepared_search_rejects_dotted_citation_legal_form_substrings() { + let prepared = legal_form_prepared_search(vec!["S.C."]); + + let result = prepared + .detect_static_entities("See 18 U.S.C. 
Section 1833(b) for civil immunity.") + .unwrap(); + + assert!(result.legal_form_entities.is_empty()); +} diff --git a/crates/anonymize-core/tests/primitives_properties.rs b/crates/anonymize-core/tests/primitives_properties.rs new file mode 100644 index 00000000..da402273 --- /dev/null +++ b/crates/anonymize-core/tests/primitives_properties.rs @@ -0,0 +1,804 @@ +#![allow( + clippy::arithmetic_side_effects, + clippy::expect_used, + clippy::indexing_slicing, + clippy::panic, + clippy::unwrap_used +)] + +use proptest::prelude::{Just, ProptestConfig, Strategy, any}; +use proptest::{ + collection, prop_assert, prop_assert_eq, prop_assume, proptest, sample, +}; +use stella_anonymize_core::{ + DetectionSource, Entity, Error, LiteralSearchOptions, OperatorConfig, + PipelineEntity, RegexSearchOptions, SearchIndex, SearchIndexArtifacts, + SearchMatch, SearchOptions, SearchPattern, deanonymise, merge_and_dedup, + redact_text, sanitize_entities, +}; + +const PROPERTY_CASES: u32 = 128; + +fn byte_len(text: &str) -> u32 { + u32::try_from(text.len()).unwrap_or(u32::MAX) +} + +fn text_char() -> impl Strategy { + sample::select(vec![ + 'a', 'b', 'Z', '0', '9', ' ', '-', '.', ',', ':', '\u{00a0}', 'á', 'ř', + '界', '🦀', '\u{0301}', + ]) +} + +fn search_char() -> impl Strategy { + sample::select(vec!['a', 'b', 'Z', '0', '9', 'á', 'ř', '界', '🦀']) +} + +fn text_fragment(max_len: usize) -> impl Strategy { + collection::vec(text_char(), 0..max_len) + .prop_map(|chars| chars.into_iter().collect()) +} + +fn entity_text() -> impl Strategy { + collection::vec(search_char(), 1..8) + .prop_map(|chars| chars.into_iter().collect()) +} + +fn fuzzy_text() -> impl Strategy { + collection::vec(search_char(), 2..8) + .prop_map(|chars| chars.into_iter().collect()) +} + +fn trim_text() -> impl Strategy { + collection::vec( + sample::select(vec![ + ' ', '\t', '\n', ',', ';', ':', '"', '\'', '“', '”', '‘', '’', '«', '»', + '!', '?', + ]), + 0..6, + ) + .prop_map(|chars| chars.into_iter().collect()) +} + 
+fn source_strategy() -> impl Strategy { + sample::select(vec![ + DetectionSource::Trigger, + DetectionSource::Regex, + DetectionSource::DenyList, + DetectionSource::LegalForm, + DetectionSource::Gazetteer, + DetectionSource::Country, + DetectionSource::Ner, + DetectionSource::Coreference, + ]) +} + +fn label_strategy() -> impl Strategy { + sample::select(vec![ + "person", + "organization", + "address", + "date", + "registration number", + ]) +} + +fn redaction_case() -> impl Strategy)> { + collection::vec((text_fragment(8), entity_text()), 1..8).prop_map( + |segments| { + let mut text = String::new(); + let mut entities = Vec::new(); + + for (index, (prefix, value)) in segments.into_iter().enumerate() { + text.push_str(&prefix); + let start = byte_len(&text); + text.push_str(&value); + let end = byte_len(&text); + entities.push(Entity::detected( + start, + end, + format!("generated label {index}"), + value, + )); + } + text.push_str(" tail"); + + (text, entities) + }, + ) +} + +fn reserved_person_redaction_case() +-> impl Strategy, u32)> { + ( + 1_u32..8, + collection::vec((text_fragment(8), entity_text()), 1..8), + ) + .prop_map(|(reserved_count, segments)| { + let mut text = (1..=reserved_count) + .map(|index| format!("[PERSON_{index}]")) + .collect::>() + .join(" "); + text.push(' '); + + let mut entities = Vec::new(); + for (prefix, value) in segments { + text.push_str(&prefix); + let start = byte_len(&text); + text.push_str(&value); + let end = byte_len(&text); + entities.push(Entity::detected(start, end, "person", value)); + } + + (text, entities, reserved_count) + }) +} + +fn displayed_entity_case() -> impl Strategy { + ( + text_fragment(8), + entity_text(), + text_fragment(8), + entity_text(), + ) + .prop_map(|(prefix, value, suffix, display_text)| { + let start = byte_len(&prefix); + let end = start.saturating_add(byte_len(&value)); + let text = format!("{prefix}{value}{suffix}"); + let entity = Entity::detected(start, end, "person", display_text); + 
(text, entity, value) + }) +} + +fn same_alias_coreference_case() +-> impl Strategy, String, String)> { + (entity_text(), entity_text(), entity_text()).prop_map( + |(alias, first_source_seed, second_source_seed)| { + let source_a = format!("{first_source_seed} source A"); + let source_b = format!("{second_source_seed} source B"); + let text = format!("{alias} met {alias}."); + let first_start = 0; + let first_end = byte_len(&alias); + let second_start = byte_len(&format!("{alias} met ")); + let second_end = second_start.saturating_add(byte_len(&alias)); + let entities = vec![ + Entity::coreference( + first_start, + first_end, + "person", + alias.clone(), + source_a.clone(), + ), + Entity::coreference( + second_start, + second_end, + "person", + alias, + source_b.clone(), + ), + ]; + (text, entities, source_a, source_b) + }, + ) +} + +fn pipeline_entity_strategy() -> impl Strategy { + ( + 0_u32..80, + 1_u32..24, + label_strategy(), + source_strategy(), + 0.0_f64..1.0, + ) + .prop_map(|(start, len, label, source, score)| { + let end = start.saturating_add(len); + PipelineEntity::detected( + start, + end, + label, + "x".repeat(usize::try_from(len).unwrap_or(0)), + score, + source, + ) + }) +} + +fn literal_search_case() +-> impl Strategy, SearchOptions, String)> { + collection::vec(entity_text(), 1..6) + .prop_flat_map(|needles| { + let patterns = needles + .iter() + .map(|needle| SearchPattern::LiteralWithOptions { + pattern: needle.clone(), + case_insensitive: Some(false), + whole_words: Some(false), + }) + .collect::>(); + ( + Just(patterns), + collection::vec((text_fragment(8), sample::select(needles)), 1..10), + text_fragment(8), + ) + }) + .prop_map(|(patterns, segments, suffix)| { + let mut haystack = String::new(); + for (prefix, needle) in segments { + haystack.push_str(&prefix); + haystack.push_str(&needle); + } + haystack.push_str(&suffix); + + ( + patterns, + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, 
+ }, + ..SearchOptions::default() + }, + haystack, + ) + }) +} + +fn all_literal_identity_search_case() +-> impl Strategy, SearchOptions, String)> { + collection::vec(entity_text(), 1..6) + .prop_flat_map(|needles| { + let patterns = needles + .iter() + .map(|needle| SearchPattern::Literal(needle.clone())) + .collect::>(); + ( + Just(patterns), + collection::vec((text_fragment(8), sample::select(needles)), 1..10), + text_fragment(8), + ) + }) + .prop_map(|(patterns, segments, suffix)| { + let mut haystack = String::new(); + for (prefix, needle) in segments { + haystack.push_str(&prefix); + haystack.push_str(&needle); + } + haystack.push_str(&suffix); + + ( + patterns, + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }, + haystack, + ) + }) +} + +fn mixed_search_case() +-> impl Strategy, SearchOptions, String)> { + ( + entity_text(), + entity_text(), + fuzzy_text(), + text_fragment(8), + text_fragment(8), + text_fragment(8), + ) + .prop_map(|(literal, regex_literal, fuzzy, prefix, middle, suffix)| { + let patterns = vec![ + SearchPattern::LiteralWithOptions { + pattern: literal.clone(), + case_insensitive: Some(false), + whole_words: Some(false), + }, + SearchPattern::Regex(regex::escape(®ex_literal)), + SearchPattern::Fuzzy { + pattern: fuzzy.clone(), + distance: Some(1), + }, + ]; + let haystack = + format!("{prefix}{literal} {middle}{regex_literal} {fuzzy}{suffix}"); + + ( + patterns, + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + regex: RegexSearchOptions { + whole_words: false, + overlap_all: true, + }, + ..SearchOptions::default() + }, + haystack, + ) + }) +} + +fn mutated_search_patterns(patterns: &[SearchPattern]) -> Vec { + let mut result = patterns.to_vec(); + let Some(first) = result.first_mut() else { + return result; + }; + + match first { + SearchPattern::Literal(pattern) + | SearchPattern::Regex(pattern) 
+ | SearchPattern::Fuzzy { pattern, .. } + | SearchPattern::LiteralWithOptions { pattern, .. } + | SearchPattern::RegexWithOptions { pattern, .. } => pattern.push('x'), + } + + result +} + +#[derive(Clone, Copy, Debug)] +enum ArtifactCorruption { + Header, + Version, + TrailingData, + Truncated, +} + +fn artifact_corruption() -> impl Strategy { + sample::select(vec![ + ArtifactCorruption::Header, + ArtifactCorruption::Version, + ArtifactCorruption::TrailingData, + ArtifactCorruption::Truncated, + ]) +} + +fn corrupt_artifact( + mut bytes: Vec, + corruption: ArtifactCorruption, +) -> Vec { + match corruption { + ArtifactCorruption::Header => { + let first = bytes.first_mut().expect("artifact header byte"); + *first ^= 0xff; + } + ArtifactCorruption::Version => { + let version_byte = bytes.get_mut(8).expect("artifact version byte"); + *version_byte ^= 0xff; + } + ArtifactCorruption::TrailingData => bytes.push(0), + ArtifactCorruption::Truncated => { + bytes.pop(); + } + } + bytes +} + +fn search_output_is_valid( + haystack: &str, + pattern_count: usize, + matches: &[SearchMatch], +) -> bool { + let mut previous: Option<(u32, u32, u32)> = None; + + for found in matches { + if found.start() >= found.end() { + return false; + } + + let Ok(pattern) = usize::try_from(found.pattern()) else { + return false; + }; + if pattern >= pattern_count { + return false; + } + + let Ok(start) = usize::try_from(found.start()) else { + return false; + }; + let Ok(end) = usize::try_from(found.end()) else { + return false; + }; + if haystack.get(start..end).is_none() { + return false; + } + + let current = (found.start(), found.end(), found.pattern()); + if previous.is_some_and(|last| last > current) { + return false; + } + previous = Some(current); + } + + true +} + +fn person_placeholder_number(placeholder: &str) -> Option { + placeholder + .strip_prefix("[PERSON_")? + .strip_suffix(']')? + .parse::() + .ok() +} + +proptest! 
{ + #![proptest_config(ProptestConfig { + cases: PROPERTY_CASES, + ..ProptestConfig::default() + })] + + #[test] + fn generated_redactions_round_trip_on_utf8_boundaries( + (text, entities) in redaction_case(), + ) { + let result = redact_text(&text, &entities, &OperatorConfig::default()) + .unwrap(); + + prop_assert_eq!(result.entity_count, entities.len()); + let restored = deanonymise(&result.redacted_text, &result.redaction_map); + prop_assert_eq!(restored.as_str(), text.as_str()); + for entry in &result.redaction_map { + prop_assert!(!text.contains(&entry.placeholder)); + } + } + + #[test] + fn generated_redactions_skip_reserved_person_placeholders( + (text, entities, reserved_count) in reserved_person_redaction_case(), + ) { + let result = redact_text(&text, &entities, &OperatorConfig::default()) + .unwrap(); + + prop_assert_eq!(result.entity_count, entities.len()); + let restored = deanonymise(&result.redacted_text, &result.redaction_map); + prop_assert_eq!(restored.as_str(), text.as_str()); + + for entry in &result.redaction_map { + prop_assert!(!text.contains(&entry.placeholder)); + let Some(index) = person_placeholder_number(&entry.placeholder) else { + prop_assert!(false, "unexpected placeholder {}", entry.placeholder); + continue; + }; + prop_assert!(index > reserved_count); + } + } + + #[test] + fn generated_detected_originals_use_source_slice_not_display_text( + (text, entity, source_slice) in displayed_entity_case(), + ) { + let result = redact_text( + &text, + std::slice::from_ref(&entity), + &OperatorConfig::default(), + ) + .unwrap(); + + prop_assert_eq!(result.redaction_map.len(), 1); + prop_assert_eq!(result.redaction_map[0].original.as_str(), source_slice.as_str()); + let restored = deanonymise(&result.redacted_text, &result.redaction_map); + prop_assert_eq!(restored.as_str(), text.as_str()); + } + + #[test] + fn generated_same_alias_coreferences_keep_distinct_source_identity( + (text, entities, source_a, source_b) in 
same_alias_coreference_case(), + ) { + let result = redact_text(&text, &entities, &OperatorConfig::default()) + .unwrap(); + + prop_assert_eq!(result.entity_count, 2); + prop_assert_eq!(result.redaction_map.len(), 2); + prop_assert!( + result.redaction_map[0].placeholder + != result.redaction_map[1].placeholder, + ); + prop_assert_eq!(result.redaction_map[0].original.as_str(), source_a.as_str()); + prop_assert_eq!(result.redaction_map[1].original.as_str(), source_b.as_str()); + prop_assert!(result.redacted_text.contains(&result.redaction_map[0].placeholder)); + prop_assert!(result.redacted_text.contains(&result.redaction_map[1].placeholder)); + } + + #[test] + fn generated_entity_spans_fail_or_round_trip( + text in text_fragment(32), + spans in collection::vec((0_u32..80, 0_u32..80, label_strategy()), 0..16), + ) { + let entities = spans + .into_iter() + .map(|(start, end, label)| { + Entity::detected(start, end, label, String::from("generated")) + }) + .collect::>(); + + let result = redact_text(&text, &entities, &OperatorConfig::default()); + if let Ok(redacted) = result { + let restored = + deanonymise(&redacted.redacted_text, &redacted.redaction_map); + prop_assert_eq!(restored.as_str(), text.as_str()); + for entry in &redacted.redaction_map { + prop_assert!(!text.contains(&entry.placeholder)); + } + } + } + + #[test] + fn invalid_interior_utf8_offsets_are_rejected( + ch in any::().prop_filter( + "multi-byte scalar", + |candidate| candidate.len_utf8() > 1, + ), + ) { + let text = format!("a{ch}z"); + let end = 1_u32.saturating_add( + u32::try_from(ch.len_utf8()).unwrap_or(u32::MAX), + ); + let entities = vec![Entity::detected(2, end, "person", ch.to_string())]; + + let error = redact_text(&text, &entities, &OperatorConfig::default()) + .unwrap_err(); + + prop_assert_eq!(error, Error::ByteOffsetInsideCodepoint { offset: 2 }); + } + + #[test] + fn merge_and_dedup_never_leaves_partial_overlaps( + entities in collection::vec(pipeline_entity_strategy(), 0..32), + ) 
{ + let result = merge_and_dedup(&entities); + let second_pass = merge_and_dedup(&result); + prop_assert_eq!(&second_pass, &result); + + for entity in &result { + prop_assert!(entity.start < entity.end); + } + + for pair in result.windows(2) { + let left = &pair[0]; + let right = &pair[1]; + prop_assert!(left.start <= right.start); + } + + for (index, left) in result.iter().enumerate() { + for right in result.iter().skip(index.saturating_add(1)) { + let overlaps = left.end > right.start && left.start < right.end; + let same_span = left.start == right.start && left.end == right.end; + prop_assert!( + !overlaps || same_span, + "partial overlap survived: {left:?} / {right:?}", + ); + } + } + } + + #[test] + fn sanitize_entities_keeps_trimmed_spans_inside_original_span( + leading in trim_text(), + core in entity_text(), + trailing in trim_text(), + label in label_strategy(), + base_start in 0_u32..20, + ) { + let raw = format!("{leading}{core}{trailing}"); + let original = PipelineEntity::detected( + base_start, + base_start.saturating_add(byte_len(&raw)), + label, + raw, + 0.5, + DetectionSource::Ner, + ); + + let result = sanitize_entities(std::slice::from_ref(&original)); + + for entity in &result { + prop_assert!(entity.start >= original.start); + prop_assert!(entity.end <= original.end); + prop_assert!(entity.start < entity.end); + prop_assert!(byte_len(&entity.text) <= entity.end.saturating_sub(entity.start)); + prop_assert!(entity.text.chars().any(char::is_alphanumeric)); + prop_assert_eq!(entity.text.trim(), entity.text.as_str()); + } + } + + #[test] + fn literal_search_matches_are_valid_utf8_slices( + prefix in text_fragment(12), + needle in entity_text(), + suffix in text_fragment(12), + ) { + let haystack = format!("{prefix}{needle}{suffix}"); + let expected_start = byte_len(&prefix); + let expected_end = expected_start.saturating_add(byte_len(&needle)); + let index = SearchIndex::new( + vec![SearchPattern::Literal(needle.clone())], + SearchOptions { + 
literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }, + ) + .unwrap(); + + let matches = index.find_iter(&haystack).unwrap(); + + let includes_expected = matches.iter().any(|found| matches!( + found, + SearchMatch::Literal { pattern: 0, start, end } + if *start == expected_start && *end == expected_end + )); + prop_assert!(includes_expected); + for found in matches { + let SearchMatch::Literal { start, end, .. } = found else { + continue; + }; + let start = usize::try_from(start).unwrap(); + let end = usize::try_from(end).unwrap(); + let Some(slice) = haystack.get(start..end) else { + prop_assert!(false, "literal match was not a valid UTF-8 slice"); + continue; + }; + prop_assert_eq!(slice, needle.as_str()); + } + } + + #[test] + fn prepared_literal_search_artifacts_match_direct_search( + (patterns, options, haystack) in literal_search_case(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + prop_assume!(!artifacts.slots.is_empty()); + let encoded = artifacts.to_bytes().unwrap(); + let decoded = SearchIndexArtifacts::from_bytes(&encoded).unwrap(); + prop_assert_eq!(&decoded, &artifacts); + + let direct = SearchIndex::new(patterns.clone(), options).unwrap(); + let prepared = + SearchIndex::new_with_artifacts(patterns.clone(), options, &decoded) + .unwrap(); + let direct_matches = direct.find_iter(&haystack).unwrap(); + let prepared_matches = prepared.find_iter(&haystack).unwrap(); + + prop_assert_eq!(&prepared_matches, &direct_matches); + prop_assert!(search_output_is_valid( + &haystack, + patterns.len(), + &prepared_matches, + )); + } + + #[test] + fn prepared_all_literal_artifacts_load_without_original_patterns( + (patterns, options, haystack) in all_literal_identity_search_case(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + let encoded = artifacts.to_bytes().unwrap(); + let decoded = 
SearchIndexArtifacts::from_bytes(&encoded).unwrap(); + + let direct = SearchIndex::new(patterns.clone(), options).unwrap(); + let prepared = + SearchIndex::new_with_artifacts(Vec::new(), options, &decoded) + .unwrap(); + let direct_matches = direct.find_iter(&haystack).unwrap(); + let prepared_matches = prepared.find_iter(&haystack).unwrap(); + + prop_assert_eq!(prepared.len(), patterns.len()); + prop_assert_eq!(&prepared_matches, &direct_matches); + prop_assert!(search_output_is_valid( + &haystack, + patterns.len(), + &prepared_matches, + )); + } + + #[test] + fn artifact_only_literal_loader_rejects_per_pattern_literal_options( + (patterns, options, _haystack) in literal_search_case(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns, options).unwrap(); + let encoded = artifacts.to_bytes().unwrap(); + let decoded = SearchIndexArtifacts::from_bytes(&encoded).unwrap(); + + prop_assert!( + SearchIndex::new_with_artifacts(Vec::new(), options, &decoded).is_err() + ); + } + + #[test] + fn prepared_mixed_search_artifacts_match_direct_search( + (patterns, options, haystack) in mixed_search_case(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + let encoded = artifacts.to_bytes().unwrap(); + let decoded = SearchIndexArtifacts::from_bytes(&encoded).unwrap(); + + let direct = SearchIndex::new(patterns.clone(), options).unwrap(); + let prepared = + SearchIndex::new_with_artifacts(patterns.clone(), options, &decoded) + .unwrap(); + let direct_matches = direct.find_iter(&haystack).unwrap(); + let prepared_matches = prepared.find_iter(&haystack).unwrap(); + + prop_assert_eq!(&prepared_matches, &direct_matches); + prop_assert!(search_output_is_valid( + &haystack, + patterns.len(), + &prepared_matches, + )); + } + + #[test] + fn direct_mixed_search_match_presence_matches_find_iter( + (patterns, options, haystack) in mixed_search_case(), + ) { + let index = SearchIndex::new(patterns.clone(), options).unwrap(); + let 
matches = index.find_iter(&haystack).unwrap(); + + prop_assert_eq!(index.is_match(&haystack).unwrap(), !matches.is_empty()); + prop_assert!(search_output_is_valid( + &haystack, + patterns.len(), + &matches, + )); + } + + #[test] + fn prepared_mixed_search_artifacts_reject_same_shape_stale_patterns( + (patterns, options, _haystack) in mixed_search_case(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + let stale_patterns = mutated_search_patterns(&patterns); + prop_assume!(stale_patterns != patterns); + + prop_assert!( + SearchIndex::new_with_artifacts(stale_patterns, options, &artifacts) + .is_err() + ); + } + + #[test] + fn malformed_search_artifacts_fail_closed( + (patterns, options, _haystack) in literal_search_case(), + corruption in artifact_corruption(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns, options).unwrap(); + let encoded = artifacts.to_bytes().unwrap(); + let corrupted = corrupt_artifact(encoded, corruption); + + prop_assert!(SearchIndexArtifacts::from_bytes(&corrupted).is_err()); + } + + #[test] + fn search_artifacts_reject_missing_and_extra_slots( + (patterns, options, _haystack) in literal_search_case(), + ) { + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + prop_assume!(!artifacts.slots.is_empty()); + + let missing = SearchIndexArtifacts::default(); + prop_assert!( + SearchIndex::new_with_artifacts(patterns.clone(), options, &missing) + .is_err() + ); + + let mut extra = artifacts; + let first = extra.slots.first().expect("prepared slot").clone(); + extra.slots.push(first); + prop_assert!( + SearchIndex::new_with_artifacts(patterns, options, &extra).is_err() + ); + } +} diff --git a/crates/anonymize-core/tests/processors.rs b/crates/anonymize-core/tests/processors.rs new file mode 100644 index 00000000..cac14c80 --- /dev/null +++ b/crates/anonymize-core/tests/processors.rs @@ -0,0 +1,633 @@ +#![allow(clippy::expect_used, 
clippy::indexing_slicing, clippy::unwrap_used)] + +use stella_anonymize_core::{ + CountryMatchData, DenyListFilterData, DenyListMatchData, DetectionSource, + Error, GazetteerMatchData, PatternSlice, PipelineEntity, RegexMatchMeta, + SearchMatch, SigningPlaceGuardData, SourceDetail, process_country_matches, + process_deny_list_matches, process_gazetteer_matches, process_regex_matches, +}; + +#[test] +fn regex_processor_filters_slice_and_short_matches_by_meta() { + let matches = vec![ + SearchMatch::Regex { + pattern: 0, + start: 0, + end: 5, + }, + SearchMatch::Regex { + pattern: 1, + start: 13, + end: 18, + }, + SearchMatch::Regex { + pattern: 2, + start: 20, + end: 32, + }, + ]; + let meta = vec![ + RegexMatchMeta::new("person", 0.8), + RegexMatchMeta { + label: String::from("short gated"), + score: 0.8, + source_detail: None, + requires_validation: false, + validator_id: None, + validator_input: None, + min_byte_length: Some(7), + }, + ]; + + let entities = process_regex_matches( + &matches, + PatternSlice { start: 0, end: 2 }, + "Alice called 12345 then 123456789012", + &meta, + ) + .unwrap(); + + assert_eq!( + entities, + vec![PipelineEntity::detected( + 0, + 5, + "person", + "Alice", + 0.8, + DetectionSource::Regex + )] + ); +} + +#[test] +fn regex_processor_rejects_unported_validators() { + let matches = vec![SearchMatch::Regex { + pattern: 7, + start: 0, + end: 5, + }]; + let meta = vec![RegexMatchMeta { + label: String::from("tax identification number"), + score: 0.9, + source_detail: None, + requires_validation: true, + validator_id: None, + validator_input: None, + min_byte_length: None, + }]; + + let err = process_regex_matches( + &matches, + PatternSlice { start: 7, end: 8 }, + "12345", + &meta, + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Regex pattern 7 requires validation that is not available in core" + ); +} + +#[test] +fn regex_processor_applies_native_validator_ids() { + let matches = vec![ + SearchMatch::Regex { + pattern: 3, + 
start: 4, + end: 14, + }, + SearchMatch::Regex { + pattern: 3, + start: 19, + end: 29, + }, + ]; + let meta = vec![RegexMatchMeta { + label: String::from("tax identification number"), + score: 0.9, + source_detail: None, + requires_validation: true, + validator_id: Some(String::from("us.ein")), + validator_input: None, + min_byte_length: None, + }]; + + let entities = process_regex_matches( + &matches, + PatternSlice { start: 3, end: 4 }, + "EIN 87-2451993 bad 00-2451993", + &meta, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "87-2451993"); +} + +#[test] +fn regex_processor_applies_validator_input_kind() { + let matches = vec![SearchMatch::Regex { + pattern: 0, + start: 0, + end: 24, + }]; + let meta = vec![RegexMatchMeta { + label: String::from("national identification number"), + score: 0.95, + source_detail: None, + requires_validation: true, + validator_id: Some(String::from("gb.nhs")), + validator_input: Some(String::from("digits-only")), + min_byte_length: None, + }]; + + let entities = process_regex_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + "NHS number: 401 023 2137", + &meta, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "NHS number: 401 023 2137"); +} + +#[test] +fn regex_processor_preserves_custom_regex_source_detail() { + let matches = vec![SearchMatch::Regex { + pattern: 0, + start: 0, + end: 5, + }]; + let meta = vec![RegexMatchMeta { + label: String::from("matter id"), + score: 0.7, + source_detail: Some(SourceDetail::CustomRegex), + requires_validation: false, + validator_id: None, + validator_input: None, + min_byte_length: None, + }]; + + let entities = process_regex_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + "A-123", + &meta, + ) + .unwrap(); + + assert_eq!(entities[0].source_detail, Some(SourceDetail::CustomRegex)); +} + +#[test] +fn deny_list_processor_emits_custom_labels() { + let matches = vec![SearchMatch::Literal { + pattern: 3, + 
start: 0, + end: 11, + }]; + let data = DenyListMatchData { + labels: vec![vec![String::from("matter")]].into(), + custom_labels: vec![vec![String::from("matter")]].into(), + originals: vec![String::from("Secret Code")], + sources: vec![vec![String::from("custom-deny-list")]].into(), + filters: None, + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 3, end: 4 }, + "Secret Code filed", + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Secret Code"); + assert_eq!(entities[0].source, DetectionSource::DenyList); + assert_eq!( + entities[0].source_detail, + Some(SourceDetail::CustomDenyList) + ); +} + +#[test] +fn deny_list_processor_rejects_embedded_custom_word_matches() { + let matches = vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 6, + }, + SearchMatch::Literal { + pattern: 0, + start: 14, + end: 20, + }, + ]; + let data = DenyListMatchData { + labels: vec![vec![String::from("matter")]].into(), + custom_labels: vec![vec![String::from("matter")]].into(), + originals: vec![String::from("Secret")], + sources: vec![vec![String::from("custom-deny-list")]].into(), + filters: None, + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + "Secret filed xSecret.", + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Secret"); +} + +#[test] +fn deny_list_processor_emits_curated_non_person_labels() { + let matches = vec![SearchMatch::Literal { + pattern: 0, + start: 0, + end: 6, + }]; + let data = DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Prague")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData::default()), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + "Prague", + &data, + ) + .unwrap(); + + 
assert_eq!(entities.len(), 1); + assert_eq!(entities[0].label, "address"); + assert_eq!(entities[0].source_detail, None); +} + +#[test] +fn deny_list_processor_suppresses_shorter_curated_same_start_matches() { + let matches = vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 7, + }, + SearchMatch::Literal { + pattern: 1, + start: 0, + end: 17, + }, + ]; + let data = DenyListMatchData { + labels: vec![vec![String::from("address")], vec![String::from("country")]] + .into(), + custom_labels: vec![vec![], vec![]].into(), + originals: vec![String::from("Česká"), String::from("Česká republika")], + sources: vec![vec![String::from("city")], vec![String::from("deny-list")]] + .into(), + filters: Some(DenyListFilterData::default()), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 2 }, + "Česká republika", + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].label, "country"); + assert_eq!(entities[0].text, "Česká republika"); +} + +#[test] +fn deny_list_processor_suppresses_shorter_contained_curated_matches() { + let matches = vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 17, + }, + SearchMatch::Literal { + pattern: 1, + start: 10, + end: 17, + }, + ]; + let data = DenyListMatchData { + labels: vec![ + vec![String::from("organization")], + vec![String::from("address")], + ] + .into(), + custom_labels: vec![vec![], vec![]].into(), + originals: vec![String::from("Nemocnice Blansko"), String::from("Blansko")], + sources: vec![vec![String::from("deny-list")], vec![String::from("city")]] + .into(), + filters: Some(DenyListFilterData::default()), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 2 }, + "Nemocnice Blansko", + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].label, "organization"); + assert_eq!(entities[0].text, "Nemocnice Blansko"); +} + +#[test] +fn 
deny_list_processor_handles_overlapping_person_name_hits() { + let text = "John Smith Jr arrived."; + let matches = vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 10, + }, + SearchMatch::Literal { + pattern: 1, + start: 5, + end: 13, + }, + ]; + let data = DenyListMatchData { + labels: vec![vec![String::from("person")], vec![String::from("person")]] + .into(), + custom_labels: vec![vec![], vec![]].into(), + originals: vec![String::from("John Smith"), String::from("Smith Jr")], + sources: vec![ + vec![String::from("first-name")], + vec![String::from("surname")], + ] + .into(), + filters: Some(DenyListFilterData::default()), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 2 }, + text, + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "John Smith Jr"); +} + +#[test] +fn deny_list_processor_suppresses_signing_place_address() { + let text = "Podepsano V Brně dne 1. ledna 2026."; + let start = u32::try_from(text.find("Brně").unwrap()).unwrap(); + let end = start.saturating_add(u32::try_from("Brně".len()).unwrap()); + let matches = vec![SearchMatch::Literal { + pattern: 0, + start, + end, + }]; + let data = DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Brně")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData { + signing_place_guards: vec![SigningPlaceGuardData { + prefix_phrases: [String::from("v"), String::from("ve")].into(), + suffix_phrases: [String::from("dne")].into(), + }], + ..DenyListFilterData::default() + }), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + text, + &data, + ) + .unwrap(); + + assert!(entities.is_empty()); +} + +#[test] +fn deny_list_processor_keeps_real_address_city() { + let text = "Sidlo: Ulice 12, Brně 602 00."; + let start = 
u32::try_from(text.find("Brně").unwrap()).unwrap(); + let end = start.saturating_add(u32::try_from("Brně".len()).unwrap()); + let matches = vec![SearchMatch::Literal { + pattern: 0, + start, + end, + }]; + let data = DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Brně")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData { + address_stopwords: [String::from("brně")].into(), + signing_place_guards: vec![SigningPlaceGuardData { + prefix_phrases: [String::from("v"), String::from("ve")].into(), + suffix_phrases: [String::from("dne")].into(), + }], + ..DenyListFilterData::default() + }), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + text, + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Brně"); +} + +#[test] +fn deny_list_processor_keeps_address_when_signing_guards_do_not_pair() { + let text = "Company is incorporated in Delaware."; + let start = u32::try_from(text.find("Delaware").unwrap()).unwrap(); + let end = start.saturating_add(u32::try_from("Delaware".len()).unwrap()); + let matches = vec![SearchMatch::Literal { + pattern: 0, + start, + end, + }]; + let data = DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Delaware")], + sources: vec![vec![String::from("city")]].into(), + filters: Some(DenyListFilterData { + signing_place_guards: vec![ + SigningPlaceGuardData { + prefix_phrases: [String::new()].into(), + suffix_phrases: [String::from("den")].into(), + }, + SigningPlaceGuardData { + prefix_phrases: [String::from("signed in")].into(), + suffix_phrases: [String::new()].into(), + }, + ], + ..DenyListFilterData::default() + }), + }; + + let entities = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + text, + &data, 
+ ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Delaware"); +} + +#[test] +fn deny_list_processor_rejects_curated_sources_without_filters() { + let matches = vec![SearchMatch::Literal { + pattern: 0, + start: 0, + end: 6, + }]; + let data = DenyListMatchData { + labels: vec![vec![String::from("address")]].into(), + custom_labels: vec![vec![]].into(), + originals: vec![String::from("Prague")], + sources: vec![vec![String::from("city")]].into(), + filters: None, + }; + + let error = process_deny_list_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + "Prague", + &data, + ) + .unwrap_err(); + + assert_eq!( + error, + Error::MissingStaticData { + field: "deny_list.filters" + } + ); +} + +#[test] +fn gazetteer_processor_extends_exact_matches_and_drops_overlapping_fuzzy() { + let matches = vec![ + SearchMatch::Literal { + pattern: 10, + start: 0, + end: 4, + }, + SearchMatch::Fuzzy { + pattern: 11, + start: 0, + end: 4, + distance: 1, + }, + ]; + let data = GazetteerMatchData { + labels: vec![String::from("organization"), String::from("organization")], + is_fuzzy: vec![false, true], + }; + + let entities = process_gazetteer_matches( + &matches, + PatternSlice { start: 10, end: 12 }, + "Acme s.r.o. 
signed", + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Acme s.r.o."); + assert_eq!( + entities[0].source_detail, + Some(SourceDetail::GazetteerExtension) + ); +} + +#[test] +fn gazetteer_processor_emits_non_overlapping_fuzzy_matches() { + let matches = vec![SearchMatch::Fuzzy { + pattern: 2, + start: 10, + end: 15, + distance: 1, + }]; + let data = GazetteerMatchData { + labels: vec![String::from("organization")], + is_fuzzy: vec![true], + }; + + let entities = process_gazetteer_matches( + &matches, + PatternSlice { start: 2, end: 3 }, + "Signed by Akmee today", + &data, + ) + .unwrap(); + + assert_eq!(entities[0].text, "Akmee"); + assert_eq!(entities[0].score.to_bits(), 0.85_f64.to_bits()); +} + +#[test] +fn country_processor_requires_uppercase_letter_start() { + let matches = vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 6, + }, + SearchMatch::Literal { + pattern: 0, + start: 11, + end: 17, + }, + ]; + let data = CountryMatchData { + labels: vec![String::from("country")], + }; + + let entities = process_country_matches( + &matches, + PatternSlice { start: 0, end: 1 }, + "turkey and Turkey", + &data, + ) + .unwrap(); + + assert_eq!(entities.len(), 1); + assert_eq!(entities[0].text, "Turkey"); + assert_eq!(entities[0].source, DetectionSource::Country); +} diff --git a/crates/anonymize-core/tests/redaction.rs b/crates/anonymize-core/tests/redaction.rs new file mode 100644 index 00000000..3209a6bf --- /dev/null +++ b/crates/anonymize-core/tests/redaction.rs @@ -0,0 +1,438 @@ +#![allow( + clippy::expect_used, + clippy::indexing_slicing, + clippy::panic, + clippy::unwrap_used +)] + +use stella_anonymize_core::{ + Entity, Error, OperatorConfig, OperatorType, deanonymise, redact_text, +}; + +fn entity(text: &str, label: &str, value: &str) -> Entity { + entity_with_display_text(text, label, value, value) +} + +fn entity_with_display_text( + text: &str, + label: &str, + value: &str, + display_text: &str, +) 
-> Entity { + let byte_start = text + .find(value) + .unwrap_or_else(|| panic!("missing fixture value: {value}")); + let prefix = text + .get(..byte_start) + .unwrap_or_else(|| panic!("invalid fixture boundary: {byte_start}")); + let start = byte_len(prefix); + let end = start.saturating_add(byte_len(value)); + Entity::detected(start, end, label, display_text) +} + +fn byte_len(text: &str) -> u32 { + u32::try_from(text.len()).unwrap_or(u32::MAX) +} + +#[test] +fn repeated_values_share_first_non_colliding_placeholder() { + let value = "Alice Smith"; + let text = format!("Existing [PERSON_1]. {value} called. {value} signed."); + let first = text.find(value).unwrap_or(0); + let second = text + .get(first.saturating_add(1)..) + .and_then(|tail| tail.find(value)) + .map_or(first, |relative| { + first.saturating_add(1).saturating_add(relative) + }); + let entities = vec![ + Entity::detected( + u32::try_from(first).unwrap_or(u32::MAX), + u32::try_from(first.saturating_add(value.len())).unwrap_or(u32::MAX), + "person", + value, + ), + Entity::detected( + u32::try_from(second).unwrap_or(u32::MAX), + u32::try_from(second.saturating_add(value.len())).unwrap_or(u32::MAX), + "person", + value, + ), + ]; + + let result = + redact_text(&text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!( + result.redacted_text, + "Existing [PERSON_1]. [PERSON_2] called. [PERSON_2] signed." 
+ ); + assert_eq!(result.redaction_map[0].placeholder, "[PERSON_2]"); + assert_eq!( + deanonymise(&result.redacted_text, &result.redaction_map), + text + ); +} + +#[test] +fn literal_placeholders_inside_extra_brackets_are_reserved() { + let text = "Keep [[PERSON_1]]; Alice Smith signs."; + let entities = vec![entity(text, "person", "Alice Smith")]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redacted_text, "Keep [[PERSON_1]]; [PERSON_2] signs."); + assert_eq!(result.redaction_map[0].placeholder, "[PERSON_2]"); +} + +#[test] +fn normalized_identifier_values_share_placeholder() { + let text = "Mail Alice@Example.com and alice@example.com."; + let entities = vec![ + entity(text, "email address", "Alice@Example.com"), + entity(text, "email address", "alice@example.com"), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 1); + assert_eq!(result.redaction_map[0].placeholder, "[EMAIL_ADDRESS_1]"); +} + +#[test] +fn generic_identifier_cues_keep_distinct_placeholder_keys() { + let text = concat!( + "CNI: 12AB34567 was present. ", + "CNI nº 12AB34567 was repeated. ", + "CNI 12AB34567 was listed. ", + "12AB34567 was bare." 
+ ); + let bare_start = byte_len( + text + .get(..text.rfind("12AB34567").unwrap_or(0)) + .unwrap_or(""), + ); + let entities = vec![ + entity(text, "national identification number", "CNI: 12AB34567"), + entity(text, "national identification number", "CNI nº 12AB34567"), + entity(text, "national identification number", "CNI 12AB34567"), + Entity::detected( + bare_start, + bare_start.saturating_add(byte_len("12AB34567")), + "national identification number", + "12AB34567", + ), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 4); + assert_eq!(result.redacted_text.matches('[').count(), 4); +} + +#[test] +fn generic_identifier_normalization_keeps_trailing_prose_in_key() { + let text = "Reg AB12345 expires. Reg AB12345 repeats."; + let second_start = text + .rfind("AB12345") + .expect("fixture should contain repeated identifier"); + let second_start = byte_len( + text + .get(..second_start) + .expect("fixture boundary should be valid"), + ); + let entities = vec![ + entity_with_display_text( + text, + "registration number", + "AB12345 expires", + "AB12345 expires", + ), + Entity::detected( + second_start, + second_start.saturating_add(byte_len("AB12345")), + "registration number", + "AB12345", + ), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 2); +} + +#[test] +fn spaced_identifier_values_still_share_placeholder() { + let text = + "Card 4242 4242 4242 4242 was present. 
Card 4242424242424242 repeated."; + let entities = vec![ + entity(text, "credit card number", "4242 4242 4242 4242"), + entity(text, "credit card number", "4242424242424242"), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 1); + assert_eq!( + result.redaction_map[0].placeholder, + "[CREDIT_CARD_NUMBER_1]" + ); +} + +#[test] +fn coreference_alias_uses_source_placeholder_and_value() { + let text = "Acme signed. Acme Corporation countersigned."; + let alias_start = text.find("Acme").unwrap_or(0); + let source_start = text.find("Acme Corporation").unwrap_or(0); + let entities = vec![ + Entity::coreference( + u32::try_from(alias_start).unwrap_or(u32::MAX), + u32::try_from(alias_start.saturating_add("Acme".len())) + .unwrap_or(u32::MAX), + "organization", + "Acme", + "Acme Corporation", + ), + Entity::detected( + u32::try_from(source_start).unwrap_or(u32::MAX), + u32::try_from(source_start.saturating_add("Acme Corporation".len())) + .unwrap_or(u32::MAX), + "organization", + "Acme Corporation", + ), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!( + result.redacted_text, + "[ORGANIZATION_1] signed. [ORGANIZATION_1] countersigned." 
+ ); + assert_eq!(result.redaction_map[0].original, "Acme Corporation"); +} + +#[test] +fn same_alias_text_can_point_to_different_source_placeholders() { + let text = "Smith met Smith."; + let first = text.find("Smith").unwrap_or(0); + let second = text.rfind("Smith").unwrap_or(first); + let entities = vec![ + Entity::coreference( + u32::try_from(first).unwrap_or(u32::MAX), + u32::try_from(first.saturating_add("Smith".len())).unwrap_or(u32::MAX), + "person", + "Smith", + "Alice Smith", + ), + Entity::coreference( + u32::try_from(second).unwrap_or(u32::MAX), + u32::try_from(second.saturating_add("Smith".len())).unwrap_or(u32::MAX), + "person", + "Smith", + "Bob Smith", + ), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redacted_text, "[PERSON_1] met [PERSON_2]."); + assert_eq!(result.redaction_map[0].original, "Alice Smith"); + assert_eq!(result.redaction_map[1].original, "Bob Smith"); +} + +#[test] +fn redact_operator_is_not_reversible() { + let text = "Contact Alice Smith at alice@example.com."; + let mut config = OperatorConfig::default(); + config + .operators + .insert(String::from("person"), OperatorType::Redact); + config.redact_string = String::from("[GONE]"); + let entities = vec![ + entity(text, "person", "Alice Smith"), + entity(text, "email address", "alice@example.com"), + ]; + + let result = redact_text(text, &entities, &config).unwrap(); + + assert!(result.redacted_text.contains("[GONE]")); + assert!( + result + .redaction_map + .iter() + .all(|entry| entry.placeholder != "[PERSON_1]") + ); + assert!( + result + .redaction_map + .iter() + .any(|entry| entry.placeholder == "[EMAIL_ADDRESS_1]") + ); +} + +#[test] +fn byte_offsets_apply_non_ascii_spans() { + let text = "A 🦀 Bob"; + let start = byte_len("A 🦀 "); + let end = start.saturating_add(byte_len("Bob")); + let entities = vec![Entity::detected(start, end, "person", "Bob")]; + + let result = + redact_text(text, &entities, 
&OperatorConfig::default()).unwrap(); + + assert_eq!(result.redacted_text, "A 🦀 [PERSON_1]"); +} + +#[test] +fn detected_original_uses_redacted_source_span() { + let text = "Alice signed."; + let entities = vec![Entity::detected(0, 5, "person", "Bob")]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map[0].original, "Alice"); + assert_eq!( + deanonymise(&result.redacted_text, &result.redaction_map), + text + ); +} + +#[test] +fn detected_placeholder_identity_uses_sanitized_text() { + let text = "Dates: 21.\nMärz 1968 and 21. März 1968."; + let normalized = "21. März 1968"; + let entities = vec![ + entity_with_display_text(text, "date", "21.\nMärz 1968", normalized), + entity(text, "date", normalized), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redacted_text, "Dates: [DATE_1] and [DATE_1]."); + assert_eq!(result.redaction_map.len(), 1); + assert_eq!(result.redaction_map[0].original, "21.\nMärz 1968"); +} + +#[test] +fn invalid_byte_boundary_is_rejected() { + let text = "A 🦀 Bob"; + let entities = vec![Entity::detected(3, 5, "person", " Bob")]; + + let error = redact_text(text, &entities, &OperatorConfig::default()) + .expect_err("offset inside a multi-byte code point must fail"); + + assert_eq!(error, Error::ByteOffsetInsideCodepoint { offset: 3 }); +} + +#[test] +fn empty_spans_are_rejected() { + let text = "Alice"; + let entities = vec![Entity::detected(0, 0, "person", "")]; + + let error = redact_text(text, &entities, &OperatorConfig::default()) + .expect_err("empty entity spans must fail"); + + assert_eq!(error, Error::InvalidSpan { start: 0, end: 0 }); +} + +#[test] +fn overlapping_spans_keep_first_entity() { + let text = "Alice Smith"; + let entities = vec![ + Entity::detected(0, 11, "person", "Alice Smith"), + Entity::detected(6, 11, "person", "Smith"), + ]; + + let result = + redact_text(text, &entities, 
&OperatorConfig::default()).unwrap(); + + assert_eq!(result.redacted_text, "[PERSON_1]"); + assert_eq!(result.entity_count, 1); +} + +#[test] +fn equivalent_crypto_spellings_share_placeholders() { + let text = concat!( + "ETH wallet 0x742d35Cc6634C0532925a3b844Bc454e4438f44e.\n", + "ETH wallet 0x742d35cc6634c0532925a3b844bc454e4438f44e." + ); + let first = "0x742d35Cc6634C0532925a3b844Bc454e4438f44e"; + let second = "0x742d35cc6634c0532925a3b844bc454e4438f44e"; + let entities = vec![ + entity(text, "crypto", first), + entity(text, "crypto", second), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 1); + assert_eq!(result.redaction_map[0].placeholder, "[CRYPTO_1]"); +} + +#[test] +fn equivalent_nhs_cues_share_placeholders() { + let text = concat!( + "NHS number 401 023 2137 was present.\n", + "National Health Service No. 401 023 2137 was repeated." + ); + let first = "NHS number 401 023 2137"; + let second = "National Health Service No. 401 023 2137"; + let entities = vec![ + entity(text, "national identification number", first), + entity(text, "national identification number", second), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 1); + assert_eq!( + result.redaction_map[0].placeholder, + "[NATIONAL_IDENTIFICATION_NUMBER_1]" + ); +} + +#[test] +fn equivalent_passport_cues_share_placeholders() { + let text = concat!( + "US passport number X12345678 was inspected.\n", + "Passport No. X12345678 was listed." + ); + let entities = vec![ + entity(text, "passport number", "US passport number X12345678"), + entity(text, "passport number", "Passport No. 
X12345678"), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 1); + assert_eq!(result.redaction_map[0].placeholder, "[PASSPORT_NUMBER_1]"); +} + +#[test] +fn passport_prefixes_split_by_separators_stay_distinct() { + let text = + "Passport X-12345678 was inspected. Passport Y 12345678 was listed."; + let entities = vec![ + entity(text, "passport number", "X-12345678"), + entity(text, "passport number", "Y 12345678"), + ]; + + let result = + redact_text(text, &entities, &OperatorConfig::default()).unwrap(); + + assert_eq!(result.redaction_map.len(), 2); + assert_eq!(result.redaction_map[0].placeholder, "[PASSPORT_NUMBER_1]"); + assert_eq!(result.redaction_map[1].placeholder, "[PASSPORT_NUMBER_2]"); +} diff --git a/crates/anonymize-core/tests/resolution.rs b/crates/anonymize-core/tests/resolution.rs new file mode 100644 index 00000000..1eb1b4f3 --- /dev/null +++ b/crates/anonymize-core/tests/resolution.rs @@ -0,0 +1,656 @@ +#![allow(clippy::expect_used, clippy::float_cmp, clippy::unwrap_used)] + +use stella_anonymize_core::{ + DetectionSource, PipelineEntity, SourceDetail, enforce_boundary_consistency, + merge_and_dedup, sanitize_entities, +}; + +fn entity( + source: DetectionSource, + score: f64, + start: u32, + end: u32, + label: &str, +) -> PipelineEntity { + PipelineEntity::detected( + start, + end, + label, + "x".repeat(usize::try_from(end.saturating_sub(start)).unwrap_or(0)), + score, + source, + ) +} + +fn text_entity( + text: &str, + label: &str, + source: DetectionSource, +) -> PipelineEntity { + PipelineEntity::detected(0, byte_len(text), label, text, 0.9, source) +} + +fn byte_len(text: &str) -> u32 { + u32::try_from(text.len()).unwrap_or(u32::MAX) +} + +#[test] +fn non_overlapping_entities_pass_through_sorted() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Regex, 0.9, 20, 25, "person"), + entity(DetectionSource::Regex, 0.7, 0, 5, "person"), + 
entity(DetectionSource::Regex, 0.8, 10, 15, "person"), + ]); + + assert_eq!(result.len(), 3); + assert_eq!( + result.iter().map(|entry| entry.start).collect::<Vec<_>>(), + vec![0, 10, 20] + ); +} + +#[test] +fn source_priority_beats_score_for_same_span() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Ner, 0.99, 0, 10, "person"), + entity(DetectionSource::Trigger, 0.7, 0, 10, "person"), + ]); + + assert_eq!(result.len(), 1); + assert_eq!( + result.first().expect("result").source, + DetectionSource::Trigger + ); +} + +#[test] +fn gazetteer_has_highest_source_priority() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Ner, 0.99, 5, 15, "person"), + entity(DetectionSource::Trigger, 0.99, 5, 15, "person"), + entity(DetectionSource::Gazetteer, 0.8, 5, 15, "person"), + ]); + + assert_eq!(result.len(), 1); + assert_eq!( + result.first().expect("result").source, + DetectionSource::Gazetteer + ); +} + +#[test] +fn same_priority_uses_score_then_length() { + let higher_score = merge_and_dedup(&[ + entity(DetectionSource::Regex, 0.85, 0, 8, "person"), + entity(DetectionSource::Regex, 0.92, 0, 8, "person"), + ]); + assert_eq!(higher_score.len(), 1); + assert_eq!(higher_score.first().expect("result").score, 0.92); + + let longer = merge_and_dedup(&[ + entity(DetectionSource::Ner, 0.9, 0, 5, "person"), + entity(DetectionSource::Ner, 0.9, 0, 10, "person"), + ]); + assert_eq!(longer.len(), 1); + assert_eq!(longer.first().expect("result").end, 10); +} + +#[test] +fn structured_regex_span_beats_inner_trigger_fragment() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Regex, 0.95, 0, 22, "registration number"), + entity(DetectionSource::Trigger, 0.95, 7, 8, "registration number"), + ]); + + assert_eq!(result.len(), 1); + let kept = result.first().expect("result"); + assert_eq!(kept.source, DetectionSource::Regex); + assert_eq!(kept.start, 0); + assert_eq!(kept.end, 22); +} + +#[test] +fn 
structured_regex_span_beats_trigger_fragment_with_trailing_punctuation() { + let regex_text = "oddíl C, vložka 240118"; + let trigger_start = byte_len("oddíl C, vložka "); + let trigger_text = "240118,"; + let result = merge_and_dedup(&[ + PipelineEntity::detected( + 0, + byte_len(regex_text), + "registration number", + regex_text, + 0.95, + DetectionSource::Regex, + ), + PipelineEntity::detected( + trigger_start, + trigger_start + byte_len(trigger_text), + "registration number", + trigger_text, + 0.95, + DetectionSource::Trigger, + ), + ]); + + assert_eq!(result.len(), 1); + let kept = result.first().expect("result"); + assert_eq!(kept.source, DetectionSource::Regex); + assert_eq!(kept.start, 0); + assert_eq!(kept.end, byte_len(regex_text)); +} + +#[test] +fn structured_date_regex_span_beats_trigger_fragment() { + let regex_text = "21. März 1968"; + let trigger_text = "21."; + let result = merge_and_dedup(&[ + PipelineEntity::detected( + 0, + byte_len(regex_text), + "date of birth", + regex_text, + 0.9, + DetectionSource::Regex, + ), + PipelineEntity::detected( + 0, + byte_len(trigger_text), + "date of birth", + trigger_text, + 0.95, + DetectionSource::Trigger, + ), + ]); + + assert_eq!(result.len(), 1); + let kept = result.first().expect("result"); + assert_eq!(kept.source, DetectionSource::Regex); + assert_eq!(kept.text, regex_text); +} + +#[test] +fn person_regex_span_beats_inner_name_fragment() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Regex, 0.9, 0, 21, "person"), + entity(DetectionSource::Trigger, 0.95, 5, 21, "person"), + ]); + + assert_eq!(result.len(), 1); + let kept = result.first().expect("result"); + assert_eq!(kept.source, DetectionSource::Regex); + assert_eq!(kept.start, 0); + assert_eq!(kept.end, 21); +} + +#[test] +fn address_trigger_still_beats_inner_address_regex() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Trigger, 0.95, 0, 30, "address"), + entity(DetectionSource::Regex, 0.9, 10, 20, "address"), + ]); + + 
assert_eq!(result.len(), 1); + let kept = result.first().expect("result"); + assert_eq!(kept.source, DetectionSource::Trigger); + assert_eq!(kept.start, 0); + assert_eq!(kept.end, 30); +} + +#[test] +fn identical_spans_with_different_labels_are_kept() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Regex, 0.9, 0, 5, "person"), + entity(DetectionSource::Regex, 0.9, 0, 5, "project"), + ]); + + assert_eq!(result.len(), 2); + assert_eq!( + result + .iter() + .map(|entry| entry.label.as_str()) + .collect::<Vec<_>>(), + vec!["person", "project"] + ); +} + +#[test] +fn literal_container_beats_shorter_same_label_match() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Regex, 1.0, 0, 6, "postal code"), + entity(DetectionSource::DenyList, 1.0, 0, 11, "postal code"), + ]); + + assert_eq!(result.len(), 1); + let kept = result.first().expect("result"); + assert_eq!(kept.source, DetectionSource::DenyList); + assert_eq!(kept.end, 11); +} + +#[test] +fn literal_container_survives_overlapping_shorter_fragment() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Regex, 0.7, 551, 557, "address"), + entity(DetectionSource::DenyList, 0.9, 501, 518, "organization"), + entity(DetectionSource::DenyList, 1.0, 511, 518, "address"), + entity(DetectionSource::DenyList, 1.0, 543, 550, "address"), + entity(DetectionSource::DenyList, 1.0, 527, 533, "address"), + entity(DetectionSource::Trigger, 0.95, 511, 518, "organization"), + entity(DetectionSource::Trigger, 0.95, 511, 518, "organization"), + entity(DetectionSource::Trigger, 1.0, 527, 557, "address"), + entity(DetectionSource::Trigger, 1.0, 527, 557, "address"), + entity(DetectionSource::Regex, 0.9, 527, 557, "address"), + ]); + + assert!( + result + .iter() + .any(|entry| entry.source == DetectionSource::DenyList + && entry.label == "organization" + && entry.start == 501 + && entry.end == 518), + "merged entities: {result:?}", + ); + assert!( + result + .iter() + .all(|entry| !(entry.source == 
DetectionSource::Trigger + && entry.label == "organization" + && entry.start == 511 + && entry.end == 518)), + "merged entities: {result:?}", + ); +} + +#[test] +fn address_component_beats_low_confidence_name_collision() { + let result = merge_and_dedup(&[ + entity(DetectionSource::DenyList, 0.5, 510, 521, "person"), + entity(DetectionSource::DenyList, 0.9, 515, 521, "address"), + entity(DetectionSource::DenyList, 0.9, 523, 531, "address"), + ]); + + assert!( + result.iter().any(|entry| entry.label == "address" + && entry.start == 515 + && entry.end == 521), + "merged entities: {result:?}", + ); + assert!( + result.iter().all(|entry| !(entry.label == "person" + && entry.start == 510 + && entry.end == 521)), + "merged entities: {result:?}", + ); +} + +#[test] +fn caller_owned_boundaries_win_overlap_resolution() { + let mut custom = entity(DetectionSource::Regex, 0.5, 0, 8, "person"); + custom.source_detail = Some(SourceDetail::CustomRegex); + let result = merge_and_dedup(&[ + entity(DetectionSource::Trigger, 0.99, 0, 10, "person"), + custom, + ]); + + assert_eq!(result.len(), 1); + assert_eq!( + result.first().expect("result").source_detail, + Some(SourceDetail::CustomRegex) + ); +} + +#[test] +fn same_span_country_loses_to_person() { + let result = merge_and_dedup(&[ + entity(DetectionSource::Country, 0.95, 0, 5, "country"), + entity(DetectionSource::DenyList, 0.9, 0, 5, "person"), + ]); + + assert_eq!(result.len(), 1); + assert_eq!(result.first().expect("result").label, "person"); +} + +#[test] +fn sanitize_trims_punctuation_and_updates_byte_offsets() { + let mut input = + text_entity("\"Tesla Shares\"", "organization", DetectionSource::Ner); + input.start = 10; + input.end = 10_u32.saturating_add(byte_len(&input.text)); + + let result = sanitize_entities(&[input]); + assert_eq!(result.len(), 1); + let entity = result.first().expect("result"); + assert_eq!(entity.text, "Tesla Shares"); + assert_eq!(entity.start, 11); + assert_eq!(entity.end, 23); +} + +#[test] +fn 
sanitize_trims_leading_date_ellipsis() { + let mut input = text_entity("...2. 2. 2026", "date", DetectionSource::Regex); + input.start = 10; + input.end = 10_u32.saturating_add(byte_len(&input.text)); + + let result = sanitize_entities(&[input]); + assert_eq!(result.len(), 1); + let entity = result.first().expect("result"); + assert_eq!(entity.text, "2. 2. 2026"); + assert_eq!(entity.start, 13); +} + +#[test] +fn sanitize_trims_single_dot_date_filler() { + let mut input = text_entity(". 2. 2. 2026", "date", DetectionSource::Regex); + input.start = 10; + input.end = 10_u32.saturating_add(byte_len(&input.text)); + + let result = sanitize_entities(&[input]); + assert_eq!(result.len(), 1); + let entity = result.first().expect("result"); + assert_eq!(entity.text, "2. 2. 2026"); + assert_eq!(entity.start, 12); +} + +#[test] +fn sanitize_preserves_literal_dictionary_punctuation() { + let result = sanitize_entities(&[ + text_entity("Hello bank!", "organization", DetectionSource::DenyList), + text_entity( + "\"Juez y parte\"", + "organization", + DetectionSource::DenyList, + ), + ]); + + assert_eq!( + result + .iter() + .map(|entry| entry.text.as_str()) + .collect::<Vec<_>>(), + vec!["Hello bank!", "\"Juez y parte\""] + ); +} + +#[test] +fn sanitize_keeps_known_period_suffixes_from_data() { + let result = sanitize_entities(&[ + text_entity("Acme Inc.", "organization", DetectionSource::Ner), + text_entity("123 Main St.", "address", DetectionSource::Ner), + text_entity("Washington, D.C.", "location", DetectionSource::Ner), + ]); + + assert_eq!( + result + .iter() + .map(|entry| entry.text.as_str()) + .collect::<Vec<_>>(), + vec!["Acme Inc.", "123 Main St.", "Washington, D.C."] + ); +} + +#[test] +fn sanitize_preserves_single_non_breaking_space() { + let result = sanitize_entities(&[ + text_entity( + "Městským soudem v\u{00a0}Praze", + "organization", + DetectionSource::Trigger, + ), + text_entity("Acme\n Corp", "organization", DetectionSource::Trigger), + ]); + + assert_eq!( + result + 
.iter() + .map(|entry| entry.text.as_str()) + .collect::<Vec<_>>(), + vec!["Městským soudem v\u{00a0}Praze", "Acme Corp"] + ); +} + +#[test] +fn sanitize_keeps_legal_form_periods_from_legal_form_source() { + let result = sanitize_entities(&[ + text_entity("Acme INC.", "organization", DetectionSource::LegalForm), + text_entity("Eagles z.s.", "organization", DetectionSource::LegalForm), + text_entity( + "Národní agentura s. p.", + "organization", + DetectionSource::LegalForm, + ), + ]); + + assert_eq!( + result + .iter() + .map(|entry| entry.text.as_str()) + .collect::<Vec<_>>(), + vec!["Acme INC.", "Eagles z.s.", "Národní agentura s. p."] + ); +} + +#[test] +fn sanitize_drops_empty_entities() { + let result = sanitize_entities(&[text_entity( + "\";!", + "organization", + DetectionSource::Ner, + )]); + + assert!(result.is_empty()); +} + +#[test] +fn boundary_merges_adjacent_same_label_entities() { + let full_text = "Kontaktujte Jan Novák prosím."; + let jan_start = byte_len("Kontaktujte "); + let jan_end = jan_start.saturating_add(byte_len("Jan")); + let surname_start = jan_end.saturating_add(byte_len(" ")); + let surname_end = surname_start.saturating_add(byte_len("Novák")); + let result = enforce_boundary_consistency( + &[ + entity(DetectionSource::Ner, 0.8, jan_start, jan_end, "person"), + entity( + DetectionSource::Ner, + 0.95, + surname_start, + surname_end, + "person", + ), + ], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 1); + let person = result.first().expect("person"); + assert_eq!(person.text, "Jan Novák"); + assert_eq!(person.start, jan_start); + assert_eq!(person.end, surname_end); + assert_eq!(person.score, 0.95); +} + +#[test] +fn boundary_expands_partial_words() { + let full_text = "Kontaktujte Novák prosím."; + let start = byte_len("Kontaktujte "); + let partial_end = start.saturating_add(byte_len("Nová")); + let result = enforce_boundary_consistency( + &[PipelineEntity::detected( + start, + partial_end, + "person", + "Nová", + 0.9, + 
DetectionSource::Ner, + )], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 1); + let person = result.first().expect("person"); + assert_eq!(person.text, "Novák"); + assert_eq!(person.end, start.saturating_add(byte_len("Novák"))); +} + +#[test] +fn boundary_expands_inside_apostrophe_names() { + let full_text = "Kontaktujte O'Connor prosím."; + let start = byte_len("Kontaktujte O'"); + let end = start.saturating_add(byte_len("Connor")); + let result = enforce_boundary_consistency( + &[PipelineEntity::detected( + start, + end, + "person", + "Connor", + 0.9, + DetectionSource::Ner, + )], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 1); + let person = result.first().expect("person"); + assert_eq!(person.start, byte_len("Kontaktujte ")); + assert_eq!(person.text, "O'Connor"); +} + +#[test] +fn boundary_expands_across_combining_marks() { + let full_text = "Podepsal Cafe\u{0301}."; + let start = byte_len("Podepsal "); + let end = start.saturating_add(byte_len("Cafe")); + let result = enforce_boundary_consistency( + &[PipelineEntity::detected( + start, + end, + "organization", + "Cafe", + 0.9, + DetectionSource::Ner, + )], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 1); + let organization = result.first().expect("organization"); + assert_eq!(organization.text, "Cafe\u{0301}"); +} + +#[test] +fn boundary_clamps_expansion_at_cross_label_neighbors() { + let full_text = "JanPraha"; + let result = enforce_boundary_consistency( + &[ + entity(DetectionSource::Ner, 0.9, 0, 3, "person"), + entity(DetectionSource::Ner, 0.8, 3, 8, "address"), + ], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 2); + let person = result + .iter() + .find(|entry| entry.label == "person") + .expect("person"); + let address = result + .iter() + .find(|entry| entry.label == "address") + .expect("address"); + assert!(person.end <= address.start); +} + +#[test] +fn boundary_resolves_cross_label_partial_overlaps() { + let full_text = "JanXPraha"; + let 
result = enforce_boundary_consistency( + &[ + entity(DetectionSource::Ner, 0.9, 0, 3, "person"), + entity(DetectionSource::Ner, 0.8, 4, 9, "address"), + ], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 2); + let person = result + .iter() + .find(|entry| entry.label == "person") + .expect("person"); + let address = result + .iter() + .find(|entry| entry.label == "address") + .expect("address"); + assert!(person.end <= address.start); +} + +#[test] +fn boundary_removes_nested_same_label_entities() { + let full_text = "Ing. Pavel Novák"; + let result = enforce_boundary_consistency( + &[ + PipelineEntity::detected( + 0, + byte_len("Ing. Pavel Novák"), + "person", + "Ing. Pavel Novák", + 0.9, + DetectionSource::Ner, + ), + PipelineEntity::detected( + 5, + 10, + "person", + "Pavel", + 0.8, + DetectionSource::Ner, + ), + ], + full_text, + ) + .unwrap(); + + assert_eq!(result.len(), 1); + assert_eq!(result.first().expect("person").text, "Ing. Pavel Novák"); +} + +#[test] +fn boundary_does_not_merge_legal_form_orgs_across_comma() { + let full_text = "Twitter, Inc., X Corp."; + let result = enforce_boundary_consistency( + &[ + PipelineEntity::detected( + 0, + 13, + "organization", + "Twitter, Inc.", + 0.9, + DetectionSource::LegalForm, + ), + PipelineEntity::detected( + 15, + 22, + "organization", + "X Corp.", + 0.8, + DetectionSource::LegalForm, + ), + ], + full_text, + ) + .unwrap(); + + assert_eq!( + result + .iter() + .map(|entry| entry.text.as_str()) + .collect::<Vec<_>>(), + vec!["Twitter, Inc.", "X Corp."] + ); +} diff --git a/crates/anonymize-core/tests/search.rs b/crates/anonymize-core/tests/search.rs new file mode 100644 index 00000000..62cabc0d --- /dev/null +++ b/crates/anonymize-core/tests/search.rs @@ -0,0 +1,467 @@ +#![allow(clippy::expect_used, clippy::indexing_slicing, clippy::unwrap_used)] + +use stella_anonymize_core::{ + Error, FuzzySearchOptions, LiteralSearchOptions, RegexSearchOptions, + SearchIndex, SearchIndexArtifacts, SearchMatch, 
SearchOptions, SearchPattern, +}; + +#[test] +fn search_index_routes_literal_regex_and_fuzzy_patterns() { + let index = SearchIndex::new( + vec![ + SearchPattern::Literal(String::from("Alice")), + SearchPattern::Regex(String::from(r"\b[A-Z]{2}\d{4}\b")), + SearchPattern::Fuzzy { + pattern: String::from("Muller"), + distance: Some(1), + }, + ], + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: true, + }, + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, + fuzzy: FuzzySearchOptions { + case_insensitive: true, + whole_words: true, + normalize_diacritics: false, + }, + }, + ) + .unwrap(); + + let matches = index + .find_iter("Alice signed AB1234. Later, Muler countersigned.") + .unwrap(); + + assert_eq!( + matches, + vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 5, + }, + SearchMatch::Regex { + pattern: 1, + start: 13, + end: 19, + }, + SearchMatch::Fuzzy { + pattern: 2, + start: 28, + end: 33, + distance: 1, + }, + ] + ); +} + +#[test] +fn search_index_preserves_byte_offsets_from_primitive_engines() { + const SUPPLEMENTARY_SCALAR: &str = "\u{1F9EA}"; + + let index = SearchIndex::new( + vec![ + SearchPattern::Literal(String::from("Bob")), + SearchPattern::Regex(String::from(SUPPLEMENTARY_SCALAR)), + ], + SearchOptions::default(), + ) + .unwrap(); + + let haystack = format!("A {SUPPLEMENTARY_SCALAR} Bob"); + let matches = index.find_iter(&haystack).unwrap(); + + assert_eq!( + matches, + vec![ + SearchMatch::Regex { + pattern: 1, + start: 2, + end: 6, + }, + SearchMatch::Literal { + pattern: 0, + start: 7, + end: 10, + }, + ] + ); +} + +#[test] +fn search_index_preserves_case_insensitive_literal_byte_offsets() { + let index = SearchIndex::new( + vec![SearchPattern::LiteralWithOptions { + pattern: String::from("krajským soudem"), + case_insensitive: Some(true), + whole_words: Some(false), + }], + SearchOptions::default(), + ) + .unwrap(); + + let haystack = "zapsaná v obchodním 
rejstříku vedeném Krajským soudem"; + let start = haystack.find("Krajským").unwrap(); + let end = haystack.len(); + + assert_eq!( + index.find_iter(haystack).unwrap(), + vec![SearchMatch::Literal { + pattern: 0, + start: u32::try_from(start).unwrap(), + end: u32::try_from(end).unwrap(), + }] + ); +} + +#[test] +fn search_index_preserves_large_case_insensitive_literal_byte_offsets() { + let mut patterns = Vec::new(); + for index in 0..300 { + let pattern = if index == 216 { + String::from("krajským soudem") + } else { + format!("needle-{index}") + }; + patterns.push(SearchPattern::LiteralWithOptions { + pattern, + case_insensitive: Some(true), + whole_words: Some(false), + }); + } + let index = SearchIndex::new(patterns, SearchOptions::default()).unwrap(); + + let haystack = "zapsaná v obchodním rejstříku vedeném Krajským soudem v Ústí"; + let start = haystack.find("Krajským").unwrap(); + let end = start.saturating_add("Krajským soudem".len()); + + assert_eq!( + index.find_iter(haystack).unwrap(), + vec![SearchMatch::Literal { + pattern: 216, + start: u32::try_from(start).unwrap(), + end: u32::try_from(end).unwrap(), + }] + ); +} + +#[test] +fn search_index_returns_overlapping_literal_matches() { + let index = SearchIndex::new( + vec![ + SearchPattern::Literal(String::from("Alice")), + SearchPattern::Literal(String::from("Alice Smith")), + ], + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: true, + }, + ..SearchOptions::default() + }, + ) + .unwrap(); + + let matches = index.find_iter("Alice Smith signed.").unwrap(); + + assert_eq!( + matches, + vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 5, + }, + SearchMatch::Literal { + pattern: 1, + start: 0, + end: 11, + }, + ] + ); +} + +#[test] +fn search_index_can_return_overlapping_regex_matches() { + let index = SearchIndex::new( + vec![ + SearchPattern::Regex(String::from("Alice")), + SearchPattern::Regex(String::from("Alice Smith")), + ], + SearchOptions { 
+ regex: RegexSearchOptions { + whole_words: false, + overlap_all: true, + }, + ..SearchOptions::default() + }, + ) + .unwrap(); + + let matches = index.find_iter("Alice Smith signed.").unwrap(); + + assert_eq!( + matches, + vec![ + SearchMatch::Regex { + pattern: 0, + start: 0, + end: 5, + }, + SearchMatch::Regex { + pattern: 1, + start: 0, + end: 11, + }, + ] + ); +} + +#[test] +fn search_index_supports_per_pattern_literal_word_boundaries() { + let index = SearchIndex::new( + vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("he"), + case_insensitive: None, + whole_words: Some(true), + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("s.r.o."), + case_insensitive: None, + whole_words: Some(false), + }, + ], + SearchOptions::default(), + ) + .unwrap(); + + let matches = index.find_iter("shell Acme s.r.o. he").unwrap(); + + assert_eq!( + matches, + vec![ + SearchMatch::Literal { + pattern: 1, + start: 11, + end: 17, + }, + SearchMatch::Literal { + pattern: 0, + start: 18, + end: 20, + }, + ] + ); +} + +#[test] +fn search_index_supports_per_pattern_literal_case_sensitivity() { + let index = SearchIndex::new( + vec![ + SearchPattern::LiteralWithOptions { + pattern: String::from("alice"), + case_insensitive: Some(true), + whole_words: None, + }, + SearchPattern::LiteralWithOptions { + pattern: String::from("bob"), + case_insensitive: Some(false), + whole_words: None, + }, + ], + SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: true, + }, + ..SearchOptions::default() + }, + ) + .unwrap(); + + let matches = index.find_iter("Alice Bob bob").unwrap(); + + assert_eq!( + matches, + vec![ + SearchMatch::Literal { + pattern: 0, + start: 0, + end: 5, + }, + SearchMatch::Literal { + pattern: 1, + start: 10, + end: 13, + }, + ] + ); +} + +#[test] +fn search_index_reports_match_presence_across_engines() { + let index = SearchIndex::new( + vec![ + SearchPattern::Literal(String::from("Alice")), + 
SearchPattern::Regex(String::from(r"\d{4}")), + ], + SearchOptions::default(), + ) + .unwrap(); + + assert!(index.is_match("Case 2026").unwrap()); + assert!(!index.is_match("No hit").unwrap()); +} + +#[test] +fn search_index_prepared_artifacts_match_direct_index() { + let patterns = vec![ + SearchPattern::Literal(String::from("Alice")), + SearchPattern::Regex(String::from(r"\b[A-Z]{2}\d{4}\b")), + SearchPattern::Fuzzy { + pattern: String::from("Muller"), + distance: Some(1), + }, + ]; + let options = SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: true, + }, + regex: RegexSearchOptions { + whole_words: false, + overlap_all: false, + }, + fuzzy: FuzzySearchOptions { + case_insensitive: true, + whole_words: true, + normalize_diacritics: false, + }, + }; + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + assert!( + !artifacts.slots.is_empty(), + "prepared search index should record text-search slot artifacts" + ); + let direct = SearchIndex::new(patterns.clone(), options).unwrap(); + let prepared = + SearchIndex::new_with_artifacts(patterns, options, &artifacts).unwrap(); + let haystack = "Alice signed AB1234. 
Later, Muler countersigned."; + + assert_eq!( + prepared.find_iter(haystack).unwrap(), + direct.find_iter(haystack).unwrap() + ); + assert_eq!(prepared.is_match(haystack), direct.is_match(haystack)); +} + +#[test] +fn search_index_prepared_artifacts_roundtrip_bytes() { + let patterns = vec![ + SearchPattern::Literal(String::from("Alice")), + SearchPattern::Literal(String::from("Bob")), + ]; + let options = SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: true, + }, + ..SearchOptions::default() + }; + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + let bytes = artifacts.to_bytes().unwrap(); + let decoded = SearchIndexArtifacts::from_bytes(&bytes).unwrap(); + + assert_eq!(decoded, artifacts); + + let direct = SearchIndex::new(patterns.clone(), options).unwrap(); + let prepared = + SearchIndex::new_with_artifacts(patterns, options, &decoded).unwrap(); + assert_eq!( + prepared.find_iter("Alice and Bob").unwrap(), + direct.find_iter("Alice and Bob").unwrap() + ); +} + +#[test] +fn search_index_prepared_artifacts_reject_invalid_bytes() { + let error = SearchIndexArtifacts::from_bytes(b"not-valid").unwrap_err(); + + assert!( + matches!(error, Error::InvalidStaticData { .. 
}), + "invalid artifact bytes should fail at the format boundary" + ); +} + +#[test] +fn search_index_prepared_artifacts_reject_wrong_slot_count() { + let patterns = vec![SearchPattern::Literal(String::from("Alice"))]; + let options = SearchOptions::default(); + let mut artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), options).unwrap(); + artifacts.slots.clear(); + + assert!( + SearchIndex::new_with_artifacts(patterns, options, &artifacts).is_err(), + "missing prepared slot artifacts should fail" + ); +} + +#[test] +fn search_index_prepared_artifacts_reject_stale_patterns() { + let options = SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }; + let artifacts = SearchIndex::prepare_artifacts( + vec![SearchPattern::Literal(String::from("Alice"))], + options, + ) + .unwrap(); + let stale_patterns = vec![SearchPattern::Literal(String::from("Bob"))]; + + assert!( + SearchIndex::new_with_artifacts(stale_patterns, options, &artifacts) + .is_err(), + "same-count stale prepared artifacts should fail" + ); +} + +#[test] +fn search_index_prepared_artifacts_reject_stale_literal_options() { + let prepare_options = SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: false, + whole_words: false, + }, + ..SearchOptions::default() + }; + let load_options = SearchOptions { + literal: LiteralSearchOptions { + case_insensitive: true, + whole_words: false, + }, + ..SearchOptions::default() + }; + let patterns = vec![SearchPattern::Literal(String::from("Alice"))]; + let artifacts = + SearchIndex::prepare_artifacts(patterns.clone(), prepare_options).unwrap(); + + assert!( + SearchIndex::new_with_artifacts(patterns, load_options, &artifacts) + .is_err(), + "prepared artifacts should be bound to literal search options" + ); +} diff --git a/crates/anonymize-core/tests/trigger_parity.rs b/crates/anonymize-core/tests/trigger_parity.rs new file mode 100644 index 
00000000..7830b9a7 --- /dev/null +++ b/crates/anonymize-core/tests/trigger_parity.rs @@ -0,0 +1,313 @@ +#![allow(clippy::expect_used)] + +use stella_anonymize_core::{ + PatternSlice, PreparedSearch, PreparedSearchConfig, PreparedSearchSlices, + SearchOptions, SearchPattern, StaticDetectionResult, TriggerData, + TriggerRule, TriggerStrategy, TriggerValidation, +}; + +fn empty_config(slices: PreparedSearchSlices) -> PreparedSearchConfig { + PreparedSearchConfig { + regex_patterns: vec![], + custom_regex_patterns: vec![], + literal_patterns: vec![], + regex_options: SearchOptions::default(), + custom_regex_options: SearchOptions::default(), + literal_options: SearchOptions::default(), + allowed_labels: vec![], + threshold: 0.0, + confidence_boost: false, + slices, + regex_meta: vec![], + custom_regex_meta: vec![], + deny_list_data: None, + false_positive_filters: None, + gazetteer_data: None, + country_data: None, + hotword_data: None, + trigger_data: None, + legal_form_data: None, + address_seed_data: None, + zone_data: None, + address_context_data: None, + coreference_data: None, + name_corpus_data: None, + date_data: None, + monetary_data: None, + } +} + +fn prepared_for_trigger( + trigger: &str, + label: &str, + strategy: TriggerStrategy, +) -> PreparedSearch { + PreparedSearch::new(PreparedSearchConfig { + regex_patterns: vec![SearchPattern::LiteralWithOptions { + pattern: trigger.to_lowercase(), + case_insensitive: Some(true), + whole_words: Some(false), + }], + slices: PreparedSearchSlices { + triggers: PatternSlice { start: 0, end: 1 }, + ..PreparedSearchSlices::default() + }, + trigger_data: Some(TriggerData { + rules: vec![TriggerRule { + trigger: trigger.to_owned(), + label: label.to_owned(), + strategy, + validations: Vec::::new(), + include_trigger: false, + }], + address_stop_keywords: Vec::new(), + party_position_terms: Vec::new(), + legal_form_suffixes: Vec::new(), + post_nominals: vec![ + String::from("Ph.D."), + String::from("CSc."), + 
String::from("MBA"), + ], + sentence_terminal_currency_terms: vec![String::from("Kč")], + }), + ..empty_config(PreparedSearchSlices::default()) + }) + .expect("trigger config should prepare") +} + +fn trigger_texts(result: &StaticDetectionResult) -> Vec<&str> { + result + .trigger_entities + .iter() + .map(|entity| entity.text.as_str()) + .collect() +} + +#[test] +fn uppercase_configured_id_triggers_accept_lowercase_source_forms() { + for (trigger, text, expected) in [ + ("CPF", "cpf: 123.456.789-00", "123.456.789-00"), + ("CNPJ", "cnpj: 12.345.678/0001-95", "12.345.678/0001-95"), + ("DNI", "dni 12345678Z", "12345678Z"), + ("CP", "cp: 08001", "08001"), + ] { + let prepared = prepared_for_trigger( + trigger, + "tax identification number", + TriggerStrategy::CompanyIdValue, + ); + let result = prepared + .detect_static_entities(text) + .expect("static detection should succeed"); + + assert!( + trigger_texts(&result).contains(&expected), + "trigger {trigger} should extract {expected:?}; entities: {:?}", + result.trigger_entities, + ); + } +} + +#[test] +fn labelled_phone_trigger_keeps_extension_suffixes() { + let prepared = + prepared_for_trigger("PHONE", "phone number", TriggerStrategy::ToEndOfLine); + + for (text, expected) in [ + ( + "PHONE: +1 555 123 4567 ext. 89\nNext line.", + "+1 555 123 4567 ext. 
89", + ), + ( + "PHONE: +1 555 123 4567 extension 42\nNext line.", + "+1 555 123 4567 extension 42", + ), + ( + "PHONE: +1 555 123 4567 x42\nNext line.", + "+1 555 123 4567 x42", + ), + ] { + let result = prepared + .detect_static_entities(text) + .expect("static detection should succeed"); + + assert!( + trigger_texts(&result).contains(&expected), + "phone trigger should keep extension in {expected:?}; entities: {:?}", + result.trigger_entities, + ); + } +} + +#[test] +fn labelled_phone_trigger_stops_before_numbered_sentences() { + let prepared = + prepared_for_trigger("PHONE", "phone number", TriggerStrategy::ToEndOfLine); + + let result = prepared + .detect_static_entities("PHONE: +36 1 234 5678. 1. Definitions") + .expect("static detection should succeed"); + + assert_eq!(trigger_texts(&result), ["+36 1 234 5678"]); +} + +#[test] +fn person_trigger_only_skips_known_post_nominals_after_comma() { + let prepared = prepared_for_trigger( + "represented by", + "person", + TriggerStrategy::ToNextComma { + stop_words: Vec::new(), + max_length: Some(100), + }, + ); + + let prose = prepared + .detect_static_entities("represented by John Smith, and shall continue.") + .expect("static detection should succeed"); + let degree = prepared + .detect_static_entities( + "represented by John Smith, Ph.D., and shall continue.", + ) + .expect("static detection should succeed"); + + assert_eq!(trigger_texts(&prose), ["John Smith"]); + assert_eq!(trigger_texts(°ree), ["John Smith, Ph.D."]); +} + +#[test] +fn match_pattern_trigger_requires_match_at_value_start() { + let prepared = prepared_for_trigger( + "Telephone", + "phone number", + TriggerStrategy::MatchPattern { + pattern: String::from(r"\d+"), + flags: None, + }, + ); + + let rejected = prepared + .detect_static_entities("Telephone : non communique SIREN : 123456789") + .expect("static detection should succeed"); + let accepted = prepared + .detect_static_entities("Telephone : 123456789 SIREN") + .expect("static detection 
should succeed"); + + assert!(rejected.trigger_entities.is_empty()); + assert_eq!(trigger_texts(&accepted), ["123456789"]); +} + +#[test] +fn to_next_comma_stops_after_short_currency_abbreviation_sentence_tail() { + let prepared = prepared_for_trigger( + "fee", + "monetary amount", + TriggerStrategy::ToNextComma { + stop_words: Vec::new(), + max_length: Some(100), + }, + ); + + let result = prepared + .detect_static_entities("fee 100 Kč. Termin splatnosti je zítra.") + .expect("static detection should succeed"); + + assert!( + trigger_texts(&result).contains(&"100 Kč"), + "currency sentence tail should stop the capture; entities: {:?}", + result.trigger_entities, + ); +} + +#[test] +fn to_next_comma_stops_on_unicode_case_stop_words() { + let prepared = prepared_for_trigger( + "court", + "organization", + TriggerStrategy::ToNextComma { + stop_words: vec![String::from("dňa")], + max_length: Some(100), + }, + ); + + let result = prepared + .detect_static_entities("court Okresný súd DŇA 1.1.2025, other text.") + .expect("static detection should succeed"); + + assert_eq!(trigger_texts(&result), [String::from("Okresný súd")]); +} + +#[test] +fn company_id_trigger_rejects_single_digit_dotted_date() { + let prepared = prepared_for_trigger( + "DNI", + "national identification number", + TriggerStrategy::CompanyIdValue, + ); + + let result = prepared + .detect_static_entities("DNI 6.11.2025") + .expect("static detection should succeed"); + + assert!(result.trigger_entities.is_empty()); +} + +#[test] +fn company_id_trigger_caps_leading_alpha_prefixes() { + let prepared = prepared_for_trigger( + "Company No.", + "registration number", + TriggerStrategy::CompanyIdValue, + ); + + let rejected = prepared + .detect_static_entities("Company No. ReferenceCode12345") + .expect("static detection should succeed"); + let accepted = prepared + .detect_static_entities("Company No. 
AB12345") + .expect("static detection should succeed"); + + assert!(rejected.trigger_entities.is_empty()); + assert_eq!(trigger_texts(&accepted), ["AB12345"]); +} + +#[test] +fn address_trigger_stops_after_short_proper_noun_before_real_sentence() { + let prepared = prepared_for_trigger( + "office", + "address", + TriggerStrategy::Address { + max_chars: Some(120), + }, + ); + + let result = prepared + .detect_static_entities("office Brno. Section begins here.") + .expect("static detection should succeed"); + + assert!( + trigger_texts(&result).contains(&"Brno"), + "proper-noun sentence tail should stop the address; entities: {:?}", + result.trigger_entities, + ); +} + +#[test] +fn trigger_lookahead_counts_text_units_not_utf8_bytes() { + let prepared = prepared_for_trigger( + "residing at", + "address", + TriggerStrategy::Address { + max_chars: Some(120), + }, + ); + let dense_prefix = "京".repeat(90); + let expected = format!("{dense_prefix} Main Street 1"); + let text = format!("residing at {expected}\nNext line."); + + let result = prepared + .detect_static_entities(&text) + .expect("static detection should succeed"); + + assert_eq!(trigger_texts(&result), [expected]); +} diff --git a/crates/anonymize-napi/Cargo.toml b/crates/anonymize-napi/Cargo.toml new file mode 100644 index 00000000..0e757372 --- /dev/null +++ b/crates/anonymize-napi/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "stella-anonymize-napi" +version.workspace = true +edition.workspace = true +description = "Native bindings for stella anonymization core" +license.workspace = true +publish.workspace = true +repository.workspace = true + +[lib] +crate-type = ["cdylib"] + +[dependencies] +blake3 = "1" +napi = { version = "3", default-features = false, features = [ + "napi9", + "serde-json", +] } +napi-derive = "3" +serde_json = "1" +stella-anonymize-adapter-contract = { path = "../anonymize-adapter-contract" } +stella-anonymize-core = { path = "../anonymize-core" } + +[build-dependencies] +napi-build = 
"2" + +[lints] +workspace = true diff --git a/crates/anonymize-napi/build.rs b/crates/anonymize-napi/build.rs new file mode 100644 index 00000000..bbfc9e4b --- /dev/null +++ b/crates/anonymize-napi/build.rs @@ -0,0 +1,3 @@ +fn main() { + napi_build::setup(); +} diff --git a/crates/anonymize-napi/src/lib.rs b/crates/anonymize-napi/src/lib.rs new file mode 100644 index 00000000..484a752f --- /dev/null +++ b/crates/anonymize-napi/src/lib.rs @@ -0,0 +1,835 @@ +use std::{ + collections::{BTreeMap, VecDeque}, + sync::{Arc, LazyLock, Mutex}, + time::Instant, +}; + +use napi::bindgen_prelude::*; +use napi_derive::napi; +use stella_anonymize_adapter_contract::{ + BindingOperatorConfig, BindingOperatorEntry, BindingPreparedSearchConfig, + BindingRedactionResult, BindingStaticRedactionResult, ContractError, + operator_config_from_binding, prepared_search_config_from_binding, + prepared_search_core_package_to_bytes, + prepared_search_core_package_to_compressed_bytes, + prepared_search_core_package_view_from_bytes, + prepared_search_package_from_bytes, prepared_search_package_has_core_payload, + static_redaction_diagnostic_result_to_utf16_binding, + static_redaction_diagnostics_to_binding, + static_redaction_result_to_utf16_binding, +}; +use stella_anonymize_core::{ + DiagnosticEvent, DiagnosticEventKind, DiagnosticStage, PreparedSearch, + PreparedSearchArtifacts, PreparedSearchConfig, StaticRedactionDiagnostics, +}; + +const PREPARED_SEARCH_CACHE_LIMIT: usize = 8; + +static PREPARED_SEARCH_CACHE: LazyLock> = + LazyLock::new(|| Mutex::new(PreparedSearchCache::new())); + +struct PreparedSearchCache { + entries: BTreeMap<[u8; 32], Arc>, + order: VecDeque<[u8; 32]>, +} + +impl PreparedSearchCache { + const fn new() -> Self { + Self { + entries: BTreeMap::new(), + order: VecDeque::new(), + } + } + + fn get(&mut self, key: &[u8; 32]) -> Option> { + let entry = self.entries.get(key).cloned()?; + self.retain_order_without(key); + self.order.push_back(*key); + Some(entry) + } + + fn 
insert(&mut self, key: [u8; 32], value: Arc) { + self.entries.insert(key, value); + self.retain_order_without(&key); + self.order.push_back(key); + + while self.order.len() > PREPARED_SEARCH_CACHE_LIMIT { + if let Some(evicted) = self.order.pop_front() { + self.entries.remove(&evicted); + } + } + } + + fn retain_order_without(&mut self, key: &[u8; 32]) { + self.order.retain(|entry| entry != key); + } +} + +#[napi(object)] +pub struct JsSearchPattern { + pub kind: String, + pub pattern: String, + pub distance: Option, + pub case_insensitive: Option, + pub whole_words: Option, + pub lazy: Option, + pub prefilter_any: Option>, + pub prefilter_case_insensitive: Option, + pub prefilter_regex: Option, +} + +#[napi(object)] +pub struct JsSearchOptions { + pub literal_case_insensitive: Option, + pub literal_whole_words: Option, + pub regex_whole_words: Option, + pub regex_overlap_all: Option, + pub fuzzy_case_insensitive: Option, + pub fuzzy_whole_words: Option, + pub fuzzy_normalize_diacritics: Option, +} + +#[napi(object)] +pub struct JsPatternSlice { + pub start: u32, + pub end: u32, +} + +#[napi(object)] +pub struct JsPreparedSearchSlices { + pub regex: Option, + pub custom_regex: Option, + pub legal_forms: Option, + pub triggers: Option, + pub deny_list: Option, + pub street_types: Option, + pub gazetteer: Option, + pub countries: Option, +} + +#[napi(object)] +pub struct JsRegexMatchMeta { + pub label: String, + pub score: f64, + pub source_detail: Option, + pub requires_validation: Option, + pub validator_id: Option, + pub validator_input: Option, + pub min_byte_length: Option, +} + +#[napi(object)] +pub struct JsGazetteerMatchData { + pub labels: Vec, + pub is_fuzzy: Vec, +} + +#[napi(object)] +pub struct JsCountryMatchData { + pub labels: Vec, +} + +#[napi(object)] +pub struct JsDenyListMatchData { + pub labels: Vec>, + pub custom_labels: Vec>, + pub originals: Vec, + pub sources: Vec>, + pub filters: Option, +} + +#[napi(object)] +pub struct JsDenyListFilterData 
{ + pub stopwords: Vec, + pub allow_list: Vec, + pub person_stopwords: Vec, + pub person_trailing_nouns: Vec, + pub address_stopwords: Vec, + pub address_jurisdiction_prefixes: Vec, + pub street_types: Vec, + pub first_names: Vec, + pub generic_roles: Vec, + pub sentence_starters: Vec, + pub trailing_address_word_exclusions: Vec, + pub defined_term_cues: Vec, + pub signing_place_guards: Vec, +} + +#[napi(object)] +pub struct JsSigningPlaceGuardData { + pub prefix_phrases: Vec, + pub suffix_phrases: Vec, +} + +#[napi(object)] +pub struct JsPreparedSearchConfig { + pub regex_patterns: Vec, + pub custom_regex_patterns: Vec, + pub literal_patterns: Vec, + pub regex_options: Option, + pub custom_regex_options: Option, + pub literal_options: Option, + pub slices: JsPreparedSearchSlices, + pub regex_meta: Vec, + pub custom_regex_meta: Vec, + pub deny_list_data: Option, + pub gazetteer_data: Option, + pub country_data: Option, +} + +#[napi(object)] +pub struct JsOperatorConfig { + pub operators: Option>, + pub redact_string: Option, +} + +#[napi(object)] +pub struct JsRedactionEntry { + pub placeholder: String, + pub original: String, +} + +#[napi(object)] +pub struct JsOperatorEntry { + pub placeholder: String, + pub operator: String, +} + +#[napi(object)] +pub struct JsRedactionResult { + pub redacted_text: String, + pub redaction_map: Vec, + pub operator_map: Vec, + pub entity_count: u32, +} + +#[napi(object)] +pub struct JsPipelineEntity { + pub start: u32, + pub end: u32, + pub label: String, + pub text: String, + pub score: f64, + pub source: String, + pub source_detail: Option, +} + +#[napi(object)] +pub struct JsStaticRedactionResult { + pub resolved_entities: Vec, + pub redaction: JsRedactionResult, +} + +#[napi] +#[must_use] +#[allow(clippy::needless_pass_by_value)] +pub fn normalize_for_search(text: String) -> String { + stella_anonymize_core::normalize_for_search(&text) +} + +#[napi] +#[must_use] +pub fn native_package_version() -> String { + 
String::from(env!("CARGO_PKG_VERSION")) +} + +#[napi] +#[allow(clippy::needless_pass_by_value)] +pub fn redact_static_entities_json( + config_json: String, + full_text: String, + operators_json: Option, +) -> Result { + let config = + serde_json::from_str::(&config_json) + .map_err(|error| to_napi_serde_error(&error))?; + let operators = operators_json + .as_deref() + .map(serde_json::from_str::) + .transpose() + .map_err(|error| to_napi_serde_error(&error))?; + let prepared = PreparedSearch::new( + prepared_search_config_from_binding(config) + .map_err(|error| to_napi_contract_error(&error))?, + ) + .map_err(|error| to_napi_core_error(&error))?; + let result = prepared + .redact_static_entities( + &full_text, + &operator_config_from_binding(operators) + .map_err(|error| to_napi_contract_error(&error))?, + ) + .map_err(|error| to_napi_core_error(&error))?; + let result = static_redaction_result_to_utf16_binding(result, &full_text) + .map_err(|error| to_napi_contract_error(&error))?; + + serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) +} + +#[napi] +#[allow(clippy::needless_pass_by_value)] +pub fn redact_static_entities_diagnostics_json( + config_json: String, + full_text: String, + operators_json: Option, +) -> Result { + let config = + serde_json::from_str::(&config_json) + .map_err(|error| to_napi_serde_error(&error))?; + let operators = operators_json + .as_deref() + .map(serde_json::from_str::) + .transpose() + .map_err(|error| to_napi_serde_error(&error))?; + let prepared = PreparedSearch::new_with_diagnostics( + prepared_search_config_from_binding(config) + .map_err(|error| to_napi_contract_error(&error))?, + ) + .map_err(|error| to_napi_core_error(&error))?; + let mut diagnostics = prepared.diagnostics; + let mut result = prepared + .prepared + .redact_static_entities_with_diagnostics( + &full_text, + &operator_config_from_binding(operators) + .map_err(|error| to_napi_contract_error(&error))?, + ) + .map_err(|error| 
to_napi_core_error(&error))?; + diagnostics.extend(result.diagnostics); + result.diagnostics = diagnostics; + let result = + static_redaction_diagnostic_result_to_utf16_binding(result, &full_text) + .map_err(|error| to_napi_contract_error(&error))?; + + serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) +} + +#[napi(js_name = "prepareStaticSearchArtifactsBytes")] +#[allow(clippy::needless_pass_by_value)] +pub fn prepare_static_search_artifacts_bytes( + config_json: BufferSlice<'_>, +) -> Result { + let config = + serde_json::from_slice::(config_json.as_ref()) + .map_err(|error| to_napi_serde_error(&error))?; + let config = prepared_search_config_from_binding(config) + .map_err(|error| to_napi_contract_error(&error))?; + PreparedSearch::prepare_artifacts(config) + .and_then(|artifacts| artifacts.to_bytes()) + .map(Buffer::from) + .map_err(|error| to_napi_core_error(&error)) +} + +#[napi(js_name = "prepareStaticSearchPackageBytes")] +#[allow(clippy::needless_pass_by_value)] +pub fn prepare_static_search_package_bytes( + config_json: BufferSlice<'_>, +) -> Result { + prepare_static_search_package_bytes_with(config_json.as_ref(), false) +} + +#[napi(js_name = "prepareStaticSearchCompressedPackageBytes")] +#[allow(clippy::needless_pass_by_value)] +pub fn prepare_static_search_compressed_package_bytes( + config_json: BufferSlice<'_>, +) -> Result { + prepare_static_search_package_bytes_with(config_json.as_ref(), true) +} + +fn prepare_static_search_package_bytes_with( + config_json: &[u8], + compressed: bool, +) -> Result { + let binding_config = + serde_json::from_slice::(config_json) + .map_err(|error| to_napi_serde_error(&error))?; + let core_config = prepared_search_config_from_binding(binding_config) + .map_err(|error| to_napi_contract_error(&error))?; + let artifacts = PreparedSearch::prepare_artifacts(core_config.clone()) + .map_err(|error| to_napi_core_error(&error))?; + let artifact_bytes = artifacts + .to_bytes() + .map_err(|error| 
to_napi_core_error(&error))?; + let package = if compressed { + prepared_search_core_package_to_compressed_bytes( + &core_config, + &artifact_bytes, + ) + } else { + prepared_search_core_package_to_bytes(&core_config, &artifact_bytes) + }; + let package = package.map_err(|error| to_napi_contract_error(&error))?; + let prepared = PreparedSearch::new_with_artifacts(core_config, &artifacts) + .map_err(|error| to_napi_core_error(&error))?; + prepared_search_cache_insert( + prepared_search_package_cache_key(&package), + Arc::new(prepared), + ); + Ok(Buffer::from(package)) +} + +#[napi] +pub struct NativePreparedSearch { + inner: Arc, + prepare_diagnostics: StaticRedactionDiagnostics, +} + +#[derive(Clone, Copy)] +struct PrepareContext { + input_bytes_len: usize, + cache_key: [u8; 32], + cache_elapsed: u64, + parse_elapsed: u64, + parse_stage: DiagnosticStage, +} + +#[napi] +impl NativePreparedSearch { + #[napi(constructor)] + pub fn new(config_json: String) -> Result { + let config_bytes = config_json.into_bytes(); + Self::from_config_bytes(&config_bytes, None) + } + + #[napi(factory)] + #[allow(clippy::needless_pass_by_value)] + pub fn from_config_json_bytes(config_json: BufferSlice<'_>) -> Result { + Self::from_config_bytes(config_json.as_ref(), None) + } + + #[napi(factory)] + #[allow(clippy::needless_pass_by_value)] + pub fn from_config_json_and_artifact_bytes( + config_json: BufferSlice<'_>, + artifact_bytes: BufferSlice<'_>, + ) -> Result { + Self::from_config_bytes(config_json.as_ref(), Some(artifact_bytes.as_ref())) + } + + #[napi(factory)] + #[allow(clippy::needless_pass_by_value)] + pub fn from_prepared_package_bytes( + package_bytes: BufferSlice<'_>, + ) -> Result { + Self::from_package_bytes(package_bytes.as_ref()) + } + + fn from_config_bytes( + config_bytes: &[u8], + artifact_bytes: Option<&[u8]>, + ) -> Result { + let input_bytes_len = config_bytes + .len() + .saturating_add(artifact_bytes.map_or(0, <[u8]>::len)); + let cache_key = 
prepared_search_cache_key(config_bytes, artifact_bytes); + let cache_start = Instant::now(); + if let Some(inner) = prepared_search_cache_get(&cache_key) { + return Ok(Self { + inner, + prepare_diagnostics: StaticRedactionDiagnostics { + events: vec![stage_event( + DiagnosticStage::PrepareCacheHit, + Some(1), + Some(elapsed_us(cache_start)), + Some(input_bytes_len), + )], + }, + }); + } + + let cache_elapsed = elapsed_us(cache_start); + let parse_start = Instant::now(); + let config = + serde_json::from_slice::(config_bytes) + .map_err(|error| to_napi_serde_error(&error))?; + let parse_elapsed = elapsed_us(parse_start); + Self::from_binding_config( + config, + artifact_bytes, + PrepareContext { + input_bytes_len, + cache_key, + cache_elapsed, + parse_elapsed, + parse_stage: DiagnosticStage::PrepareBindingParse, + }, + ) + } + + fn from_package_bytes(package_bytes: &[u8]) -> Result { + let input_bytes_len = package_bytes.len(); + let cache_key = prepared_search_package_cache_key(package_bytes); + let cache_start = Instant::now(); + if let Some(inner) = prepared_search_cache_get(&cache_key) { + return Ok(Self { + inner, + prepare_diagnostics: StaticRedactionDiagnostics { + events: vec![stage_event( + DiagnosticStage::PrepareCacheHit, + Some(1), + Some(elapsed_us(cache_start)), + Some(input_bytes_len), + )], + }, + }); + } + + let cache_elapsed = elapsed_us(cache_start); + let parse_start = Instant::now(); + if prepared_search_package_has_core_payload(package_bytes) { + let package = prepared_search_core_package_view_from_bytes(package_bytes) + .map_err(|error| to_napi_contract_error(&error))?; + let parse_elapsed = elapsed_us(parse_start); + let config = package.config; + return Self::from_core_config( + config, + Some(package.artifacts.as_ref()), + PrepareContext { + input_bytes_len, + cache_key, + cache_elapsed, + parse_elapsed, + parse_stage: DiagnosticStage::PreparePackageDecode, + }, + None, + ); + } + + let package = 
prepared_search_package_from_bytes(package_bytes) + .map_err(|error| to_napi_contract_error(&error))?; + let parse_elapsed = elapsed_us(parse_start); + let config = package.config; + let artifacts = package.artifacts; + Self::from_binding_config( + config, + Some(&artifacts), + PrepareContext { + input_bytes_len, + cache_key, + cache_elapsed, + parse_elapsed, + parse_stage: DiagnosticStage::PreparePackageDecode, + }, + ) + } + + fn from_binding_config( + config: BindingPreparedSearchConfig, + artifact_bytes: Option<&[u8]>, + context: PrepareContext, + ) -> Result { + let convert_start = Instant::now(); + let config = prepared_search_config_from_binding(config) + .map_err(|error| to_napi_contract_error(&error))?; + let pattern_count = prepared_search_pattern_count(&config); + let convert_elapsed = elapsed_us(convert_start); + Self::from_core_config( + config, + artifact_bytes, + context, + Some((pattern_count, convert_elapsed)), + ) + } + + fn from_core_config( + config: PreparedSearchConfig, + artifact_bytes: Option<&[u8]>, + context: PrepareContext, + binding_convert: Option<(usize, u64)>, + ) -> Result { + let artifact_decode_start = Instant::now(); + let artifacts = artifact_bytes + .map(PreparedSearchArtifacts::from_bytes) + .transpose() + .map_err(|error| to_napi_core_error(&error))?; + let artifact_decode_elapsed = + artifact_bytes.map(|_| elapsed_us(artifact_decode_start)); + let result = if let Some(artifacts) = artifacts.as_ref() { + PreparedSearch::new_with_artifacts_diagnostics(config, artifacts) + } else { + PreparedSearch::new_with_diagnostics(config) + } + .map_err(|error| to_napi_core_error(&error))?; + let inner = Arc::new(result.prepared); + let mut diagnostics = StaticRedactionDiagnostics { + events: vec![ + stage_event( + DiagnosticStage::PrepareCacheMiss, + Some(0), + Some(context.cache_elapsed), + Some(context.input_bytes_len), + ), + stage_event( + context.parse_stage, + None, + Some(context.parse_elapsed), + Some(context.input_bytes_len), + 
), + ], + }; + if let Some((pattern_count, convert_elapsed)) = binding_convert { + diagnostics.events.push(stage_event( + DiagnosticStage::PrepareBindingConvert, + Some(pattern_count), + Some(convert_elapsed), + None, + )); + } + if let (Some(elapsed), Some(bytes)) = + (artifact_decode_elapsed, artifact_bytes.map(<[u8]>::len)) + { + diagnostics.events.push(stage_event( + DiagnosticStage::PrepareArtifactsDecode, + None, + Some(elapsed), + Some(bytes), + )); + } + diagnostics.extend(result.diagnostics); + prepared_search_cache_insert(context.cache_key, Arc::clone(&inner)); + Ok(Self { + inner, + prepare_diagnostics: diagnostics, + }) + } + + #[napi] + pub fn prepare_diagnostics_json(&self) -> Result { + let diagnostics = + static_redaction_diagnostics_to_binding(self.prepare_diagnostics.clone()); + + serde_json::to_string(&diagnostics) + .map_err(|error| to_napi_serde_error(&error)) + } + + #[napi] + pub fn warm_lazy_regex(&self) -> Result<()> { + self + .inner + .warm_lazy_regex() + .map_err(|error| to_napi_core_error(&error)) + } + + #[napi] + #[allow(clippy::needless_pass_by_value)] + pub fn redact_static_entities( + &self, + full_text: String, + operators: Option, + ) -> Result { + let operators = + operator_config_from_binding(operators.map(to_binding_operator_config)) + .map_err(|error| to_napi_contract_error(&error))?; + let result = self + .inner + .redact_static_entities(&full_text, &operators) + .map_err(|error| to_napi_core_error(&error))?; + static_redaction_result_to_utf16_binding(result, &full_text) + .map_err(|error| to_napi_contract_error(&error)) + .and_then(to_js_static_redaction_result) + } + + #[napi] + #[allow(clippy::needless_pass_by_value)] + pub fn redact_static_entities_json( + &self, + full_text: String, + operators: Option, + ) -> Result { + let operators = + operator_config_from_binding(operators.map(to_binding_operator_config)) + .map_err(|error| to_napi_contract_error(&error))?; + let result = self + .inner + 
.redact_static_entities(&full_text, &operators) + .map_err(|error| to_napi_core_error(&error))?; + let result = static_redaction_result_to_utf16_binding(result, &full_text) + .map_err(|error| to_napi_contract_error(&error))?; + + serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) + } + + #[napi] + #[allow(clippy::needless_pass_by_value)] + pub fn redact_static_entities_diagnostics_json( + &self, + full_text: String, + operators: Option, + ) -> Result { + let operators = + operator_config_from_binding(operators.map(to_binding_operator_config)) + .map_err(|error| to_napi_contract_error(&error))?; + let mut result = self + .inner + .redact_static_entities_with_diagnostics(&full_text, &operators) + .map_err(|error| to_napi_core_error(&error))?; + let mut diagnostics = self.prepare_diagnostics.clone(); + diagnostics.extend(result.diagnostics); + result.diagnostics = diagnostics; + let result = + static_redaction_diagnostic_result_to_utf16_binding(result, &full_text) + .map_err(|error| to_napi_contract_error(&error))?; + + serde_json::to_string(&result).map_err(|error| to_napi_serde_error(&error)) + } +} + +const fn prepared_search_pattern_count(config: &PreparedSearchConfig) -> usize { + config + .regex_patterns + .len() + .saturating_add(config.custom_regex_patterns.len()) + .saturating_add(config.literal_patterns.len()) +} + +fn prepared_search_cache_get(key: &[u8; 32]) -> Option> { + with_prepared_search_cache(|cache| cache.get(key)) +} + +fn prepared_search_cache_insert(key: [u8; 32], value: Arc) { + with_prepared_search_cache(|cache| cache.insert(key, value)); +} + +fn prepared_search_cache_key( + config_bytes: &[u8], + artifact_bytes: Option<&[u8]>, +) -> [u8; 32] { + let mut hasher = blake3::Hasher::new(); + hasher.update(b"config"); + hasher.update(config_bytes); + match artifact_bytes { + Some(bytes) => { + hasher.update(b"artifacts"); + hasher.update(bytes); + } + None => { + hasher.update(b"no-artifacts"); + } + } + 
*hasher.finalize().as_bytes() +} + +fn prepared_search_package_cache_key(package_bytes: &[u8]) -> [u8; 32] { + let mut hasher = blake3::Hasher::new(); + hasher.update(b"prepared-package"); + hasher.update(package_bytes); + *hasher.finalize().as_bytes() +} + +fn with_prepared_search_cache( + action: impl FnOnce(&mut PreparedSearchCache) -> T, +) -> T { + let mut cache = match PREPARED_SEARCH_CACHE.lock() { + Ok(cache) => cache, + Err(poisoned) => poisoned.into_inner(), + }; + action(&mut cache) +} + +fn to_binding_operator_config( + config: JsOperatorConfig, +) -> BindingOperatorConfig { + BindingOperatorConfig { + operators: config.operators, + redact_string: config.redact_string, + } +} + +fn to_js_static_redaction_result( + result: BindingStaticRedactionResult, +) -> Result { + Ok(JsStaticRedactionResult { + resolved_entities: result + .resolved_entities + .into_iter() + .map(|entity| JsPipelineEntity { + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: entity.source, + source_detail: entity.source_detail, + }) + .collect(), + redaction: to_js_redaction_result(result.redaction)?, + }) +} + +fn to_js_redaction_result( + result: BindingRedactionResult, +) -> Result { + Ok(JsRedactionResult { + redacted_text: result.redacted_text, + redaction_map: result + .redaction_map + .into_iter() + .map(|entry| JsRedactionEntry { + placeholder: entry.placeholder, + original: entry.original, + }) + .collect(), + operator_map: to_js_operator_entries(result.operator_map), + entity_count: u32::try_from(result.entity_count).map_err(|_| { + Error::from_reason(format!( + "Entity count exceeds u32 range: {}", + result.entity_count + )) + })?, + }) +} + +fn to_js_operator_entries( + entries: Vec, +) -> Vec { + entries + .into_iter() + .map(|entry| JsOperatorEntry { + placeholder: entry.placeholder, + operator: entry.operator, + }) + .collect() +} + +const fn stage_event( + stage: DiagnosticStage, + count: Option, + 
elapsed_us: Option, + input_bytes: Option, +) -> DiagnosticEvent { + DiagnosticEvent { + stage, + kind: DiagnosticEventKind::StageSummary, + count, + engine: None, + pattern: None, + source: None, + source_detail: None, + label: None, + start: None, + end: None, + text: None, + score: None, + span_valid: None, + elapsed_us, + input_bytes, + reason: None, + } +} + +fn elapsed_us(start: Instant) -> u64 { + let micros = start.elapsed().as_micros(); + u64::try_from(micros).unwrap_or(u64::MAX) +} + +fn to_napi_core_error(error: &stella_anonymize_core::Error) -> Error { + Error::from_reason(error.to_string()) +} + +fn to_napi_contract_error(error: &ContractError) -> Error { + Error::from_reason(error.to_string()) +} + +fn to_napi_serde_error(error: &serde_json::Error) -> Error { + Error::from_reason(error.to_string()) +} diff --git a/crates/anonymize-py/Cargo.toml b/crates/anonymize-py/Cargo.toml new file mode 100644 index 00000000..8ad28284 --- /dev/null +++ b/crates/anonymize-py/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "stella-anonymize-py" +version.workspace = true +edition.workspace = true +description = "Python bindings for stella anonymization core" +license.workspace = true +publish.workspace = true +repository.workspace = true + +[lib] +name = "stella_anonymize_core_py" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.29", features = ["extension-module", "abi3-py311"] } +serde_json = "1" +stella-anonymize-adapter-contract = { path = "../anonymize-adapter-contract" } +stella-anonymize-core = { path = "../anonymize-core" } + +[build-dependencies] +pyo3-build-config = { version = "0.29", features = ["extension-module"] } + +[lints] +workspace = true diff --git a/crates/anonymize-py/README.md b/crates/anonymize-py/README.md new file mode 100644 index 00000000..29ca5356 --- /dev/null +++ b/crates/anonymize-py/README.md @@ -0,0 +1,48 @@ +# stella-anonymize-core + +Python bindings for the stella anonymization Rust core. 
+ +## Install + +```bash +uv add stella-anonymize-core +``` + +## Usage + +Prepare or load the anonymizer once, then reuse it for documents. + +```py +import stella_anonymize as anonymize + +package_bytes = anonymize.prepare_search_package(config_json) +prepared = anonymize.load_prepared_package(package_bytes) +prepared.warm_lazy_regex() +result = prepared.redact_text(text, redact_string="***") + +print(result.redaction.redacted_text) +``` + +For prepared package files: + +```py +import stella_anonymize as anonymize + +prepared = anonymize.load_prepared_package_file("anonymize.stlanonpkg") +prepared.warm_lazy_regex() +result_json = prepared.redact_text_json(text) +``` + +Top-level `redact_text()` and `redact_text_json()` are available for one-off calls, but they take the full config JSON on every call and only keep a small in-process cache of recently prepared configs. Use `load_prepared_package()` or `load_prepared_package_file()` for repeated document processing, then call `warm_lazy_regex()` before the first document when startup can absorb that cost. + +## API + +- `prepare_search_package(config_json, *, compressed=True) -> bytes` +- `load_prepared_package(package_bytes) -> PreparedAnonymizer` +- `load_prepared_package_file(package_path) -> PreparedAnonymizer` +- `PreparedAnonymizer.warm_lazy_regex()` +- `PreparedAnonymizer.redact_text(full_text, operators=None, *, redact_string=None)` +- `PreparedAnonymizer.redact_text_json(full_text, operators=None, *, redact_string=None)` +- `PreparedAnonymizer.diagnostics_json(full_text, operators=None, *, redact_string=None)` + +`PreparedSearch` is an alias for `PreparedAnonymizer`. 
diff --git a/crates/anonymize-py/build.rs b/crates/anonymize-py/build.rs new file mode 100644 index 00000000..a781ce15 --- /dev/null +++ b/crates/anonymize-py/build.rs @@ -0,0 +1,3 @@ +fn main() { + pyo3_build_config::add_extension_module_link_args(); +} diff --git a/crates/anonymize-py/pyproject.toml b/crates/anonymize-py/pyproject.toml new file mode 100644 index 00000000..9ac6bc2f --- /dev/null +++ b/crates/anonymize-py/pyproject.toml @@ -0,0 +1,21 @@ +[build-system] +requires = ["maturin>=1.14,<2"] +build-backend = "maturin" + +[project] +name = "stella-anonymize-core" +dynamic = ["version"] +description = "Python bindings for stella anonymization core" +readme = "README.md" +requires-python = ">=3.11" +license = "MIT" +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Rust", + "Typing :: Typed", +] + +[tool.maturin] +manifest-path = "Cargo.toml" +module-name = "stella_anonymize._native" +python-source = "python" diff --git a/crates/anonymize-py/python/stella_anonymize/__init__.py b/crates/anonymize-py/python/stella_anonymize/__init__.py new file mode 100644 index 00000000..ef20a12d --- /dev/null +++ b/crates/anonymize-py/python/stella_anonymize/__init__.py @@ -0,0 +1,252 @@ +from __future__ import annotations + +import json +from collections.abc import Mapping +from functools import lru_cache +from os import PathLike + +from ._native import ( + OperatorEntry, + PipelineEntity, + PreparedSearch as NativePreparedSearch, + RedactionEntry, + RedactionResult, + StaticRedactionResult, + native_package_version, + normalize_for_search, + prepare_static_search_artifacts_bytes, + prepare_static_search_compressed_package_bytes, + prepare_static_search_package_bytes, + redact_static_entities_diagnostics_json, + redact_static_entities_json, +) + +__all__ = [ + "OperatorEntry", + "OperatorConfig", + "PreparedAnonymizer", + "NativePreparedSearch", + "PipelineEntity", + "PreparedSearch", + "RedactionEntry", + "RedactionResult", + 
"StaticRedactionResult", + "diagnostics_json", + "load_prepared_package", + "load_prepared_package_file", + "native_package_version", + "normalize_for_search", + "prepare_search_package", + "prepare_static_search_artifacts_bytes", + "prepare_static_search_compressed_package_bytes", + "prepare_static_search_package_bytes", + "redact_text", + "redact_text_json", + "redact_static_entities_diagnostics_json", + "redact_static_entities_json", +] + +BytesLike = bytes | bytearray | memoryview +PathLikeString = str | PathLike[str] +OperatorConfig = Mapping[str, str] | str | None + + +class PreparedAnonymizer: + def __init__(self, prepared: NativePreparedSearch) -> None: + self._prepared = prepared + + @classmethod + def from_config_json(cls, config_json: str) -> PreparedAnonymizer: + return cls(NativePreparedSearch(config_json)) + + @classmethod + def from_config_json_and_artifact_bytes( + cls, + config_json: str, + artifact_bytes: BytesLike, + ) -> PreparedAnonymizer: + return cls( + NativePreparedSearch.from_config_json_and_artifact_bytes( + config_json, + bytes(artifact_bytes), + ) + ) + + @classmethod + def from_prepared_package_bytes( + cls, + package_bytes: BytesLike, + ) -> PreparedAnonymizer: + return cls( + NativePreparedSearch.from_prepared_package_bytes(bytes(package_bytes)) + ) + + def prepare_diagnostics_json(self) -> str: + return self._prepared.prepare_diagnostics_json() + + def warm_lazy_regex(self) -> None: + self._prepared.warm_lazy_regex() + + def redact_text( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> StaticRedactionResult: + return self._prepared.redact_static_entities( + full_text, + _operator_config_json(operators, redact_string=redact_string), + ) + + def redact_text_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: + return self._prepared.redact_static_entities_json( + full_text, + _operator_config_json(operators, 
redact_string=redact_string), + ) + + def diagnostics_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: + return self._prepared.redact_static_entities_diagnostics_json( + full_text, + _operator_config_json(operators, redact_string=redact_string), + ) + + def redact_static_entities( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> StaticRedactionResult: + return self.redact_text( + full_text, + operators, + redact_string=redact_string, + ) + + def redact_static_entities_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: + return self.redact_text_json( + full_text, + operators, + redact_string=redact_string, + ) + + def redact_static_entities_diagnostics_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: + return self.diagnostics_json( + full_text, + operators, + redact_string=redact_string, + ) + + +PreparedSearch = PreparedAnonymizer + + +def prepare_search_package(config_json: str, *, compressed: bool = True) -> bytes: + if compressed: + return prepare_static_search_compressed_package_bytes(config_json) + return prepare_static_search_package_bytes(config_json) + + +def load_prepared_package(package_bytes: BytesLike) -> PreparedAnonymizer: + return _load_prepared_package(bytes(package_bytes)) + + +def load_prepared_package_file(package_path: PathLikeString) -> PreparedAnonymizer: + with open(package_path, "rb") as handle: + return load_prepared_package(handle.read()) + + +@lru_cache(maxsize=8) +def _load_prepared_package(package_bytes: bytes) -> PreparedAnonymizer: + return PreparedAnonymizer.from_prepared_package_bytes(package_bytes) + + +@lru_cache(maxsize=8) +def _prepare_from_config_json(config_json: str) -> PreparedAnonymizer: + return PreparedAnonymizer.from_config_json(config_json) + + +def 
redact_text( + config_json: str, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, +) -> StaticRedactionResult: + return _prepare_from_config_json(config_json).redact_text( + full_text, + operators, + redact_string=redact_string, + ) + + +def redact_text_json( + config_json: str, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, +) -> str: + return _prepare_from_config_json(config_json).redact_text_json( + full_text, + operators, + redact_string=redact_string, + ) + + +def diagnostics_json( + config_json: str, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, +) -> str: + return _prepare_from_config_json(config_json).diagnostics_json( + full_text, + operators, + redact_string=redact_string, + ) + + +def _operator_config_json( + operators: OperatorConfig, + *, + redact_string: str | None, +) -> str | None: + if operators is None and redact_string is None: + return None + if isinstance(operators, str): + if redact_string is not None: + raise ValueError("redact_string cannot be combined with raw JSON") + return operators + payload: dict[str, object] = {} + if operators is not None: + payload["operators"] = dict(operators) + if redact_string is not None: + payload["redactString"] = redact_string + return json.dumps(payload, separators=(",", ":")) diff --git a/crates/anonymize-py/python/stella_anonymize/__init__.pyi b/crates/anonymize-py/python/stella_anonymize/__init__.pyi new file mode 100644 index 00000000..4c07bb8d --- /dev/null +++ b/crates/anonymize-py/python/stella_anonymize/__init__.pyi @@ -0,0 +1,118 @@ +from __future__ import annotations + +from collections.abc import Mapping +from os import PathLike +from typing import TypeAlias + +from ._native import ( + PreparedSearch as NativePreparedSearch, + OperatorEntry as OperatorEntry, + PipelineEntity as PipelineEntity, + RedactionEntry as RedactionEntry, + RedactionResult as 
RedactionResult, + StaticRedactionResult as StaticRedactionResult, + native_package_version as native_package_version, + normalize_for_search as normalize_for_search, + prepare_static_search_artifacts_bytes as prepare_static_search_artifacts_bytes, + prepare_static_search_compressed_package_bytes as prepare_static_search_compressed_package_bytes, + prepare_static_search_package_bytes as prepare_static_search_package_bytes, + redact_static_entities_diagnostics_json as redact_static_entities_diagnostics_json, + redact_static_entities_json as redact_static_entities_json, +) + +BytesLike: TypeAlias = bytes | bytearray | memoryview +PathLikeString: TypeAlias = str | PathLike[str] +OperatorConfig: TypeAlias = Mapping[str, str] | str | None + +class PreparedAnonymizer: + def __init__(self, prepared: NativePreparedSearch) -> None: ... + @classmethod + def from_config_json(cls, config_json: str) -> PreparedAnonymizer: ... + @classmethod + def from_config_json_and_artifact_bytes( + cls, + config_json: str, + artifact_bytes: BytesLike, + ) -> PreparedAnonymizer: ... + @classmethod + def from_prepared_package_bytes( + cls, + package_bytes: BytesLike, + ) -> PreparedAnonymizer: ... + def prepare_diagnostics_json(self) -> str: ... + def warm_lazy_regex(self) -> None: ... + def redact_text( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> StaticRedactionResult: ... + def redact_text_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: ... + def diagnostics_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: ... + def redact_static_entities( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> StaticRedactionResult: ... 
+ def redact_static_entities_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: ... + def redact_static_entities_diagnostics_json( + self, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, + ) -> str: ... + +PreparedSearch: TypeAlias = PreparedAnonymizer + +def prepare_search_package( + config_json: str, *, compressed: bool = True +) -> bytes: ... +def load_prepared_package(package_bytes: BytesLike) -> PreparedAnonymizer: ... +def load_prepared_package_file( + package_path: PathLikeString, +) -> PreparedAnonymizer: ... +def redact_text( + config_json: str, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, +) -> StaticRedactionResult: ... +def redact_text_json( + config_json: str, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, +) -> str: ... +def diagnostics_json( + config_json: str, + full_text: str, + operators: OperatorConfig = None, + *, + redact_string: str | None = None, +) -> str: ... + +__all__: list[str] diff --git a/crates/anonymize-py/python/stella_anonymize/_native.pyi b/crates/anonymize-py/python/stella_anonymize/_native.pyi new file mode 100644 index 00000000..bbdff4d8 --- /dev/null +++ b/crates/anonymize-py/python/stella_anonymize/_native.pyi @@ -0,0 +1,94 @@ +from __future__ import annotations + +from typing import TypeAlias + +BytesLike: TypeAlias = bytes | bytearray | memoryview + +class RedactionEntry: + @property + def placeholder(self) -> str: ... + @property + def original(self) -> str: ... + +class OperatorEntry: + @property + def placeholder(self) -> str: ... + @property + def operator(self) -> str: ... + +class RedactionResult: + @property + def redacted_text(self) -> str: ... + @property + def redaction_map(self) -> list[RedactionEntry]: ... + @property + def operator_map(self) -> list[OperatorEntry]: ... 
+ @property + def entity_count(self) -> int: ... + +class PipelineEntity: + @property + def start(self) -> int: ... + @property + def end(self) -> int: ... + @property + def label(self) -> str: ... + @property + def text(self) -> str: ... + @property + def score(self) -> float: ... + @property + def source(self) -> str: ... + @property + def source_detail(self) -> str | None: ... + +class StaticRedactionResult: + @property + def resolved_entities(self) -> list[PipelineEntity]: ... + @property + def redaction(self) -> RedactionResult: ... + +class PreparedSearch: + def __init__(self, config_json: str) -> None: ... + @staticmethod + def from_config_json_and_artifact_bytes( + config_json: str, + artifact_bytes: BytesLike, + ) -> PreparedSearch: ... + @staticmethod + def from_prepared_package_bytes( + package_bytes: BytesLike, + ) -> PreparedSearch: ... + def prepare_diagnostics_json(self) -> str: ... + def warm_lazy_regex(self) -> None: ... + def redact_static_entities( + self, + full_text: str, + operators_json: str | None = None, + ) -> StaticRedactionResult: ... + def redact_static_entities_json( + self, + full_text: str, + operators_json: str | None = None, + ) -> str: ... + def redact_static_entities_diagnostics_json( + self, + full_text: str, + operators_json: str | None = None, + ) -> str: ... + +def redact_static_entities_json( + config_json: str, + full_text: str, + operators_json: str | None = None, +) -> str: ... +def prepare_static_search_artifacts_bytes(config_json: str) -> bytes: ... +def prepare_static_search_package_bytes(config_json: str) -> bytes: ... +def prepare_static_search_compressed_package_bytes(config_json: str) -> bytes: ... +def redact_static_entities_diagnostics_json( + config_json: str, + full_text: str, + operators_json: str | None = None, +) -> str: ... +def normalize_for_search(text: str) -> str: ... +def native_package_version() -> str: ... 
diff --git a/crates/anonymize-py/python/stella_anonymize/py.typed b/crates/anonymize-py/python/stella_anonymize/py.typed new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/crates/anonymize-py/python/stella_anonymize/py.typed @@ -0,0 +1 @@ + diff --git a/crates/anonymize-py/src/lib.rs b/crates/anonymize-py/src/lib.rs new file mode 100644 index 00000000..1c736fa6 --- /dev/null +++ b/crates/anonymize-py/src/lib.rs @@ -0,0 +1,487 @@ +use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; +use pyo3::types::PyBytes; +use stella_anonymize_adapter_contract::{ + BindingOperatorConfig, BindingOperatorEntry, BindingPipelineEntity, + BindingPreparedSearchConfig, BindingRedactionEntry, BindingRedactionResult, + BindingStaticRedactionResult, ContractError, operator_config_from_binding, + prepared_search_config_from_binding, prepared_search_core_package_to_bytes, + prepared_search_core_package_to_compressed_bytes, + prepared_search_core_package_view_from_bytes, + prepared_search_package_from_bytes, prepared_search_package_has_core_payload, + static_redaction_diagnostic_result_to_utf16_binding, + static_redaction_diagnostics_to_binding, static_redaction_result_to_binding, + static_redaction_result_to_utf16_binding, +}; +use stella_anonymize_core::{ + PreparedSearch as CorePreparedSearch, PreparedSearchArtifacts, + StaticRedactionDiagnostics, StaticRedactionResult, +}; + +#[pyclass(name = "RedactionEntry", get_all, skip_from_py_object)] +#[derive(Clone)] +pub struct PyRedactionEntry { + placeholder: String, + original: String, +} + +#[pyclass(name = "OperatorEntry", get_all, skip_from_py_object)] +#[derive(Clone)] +pub struct PyOperatorEntry { + placeholder: String, + operator: String, +} + +#[pyclass(name = "RedactionResult", get_all, skip_from_py_object)] +#[derive(Clone)] +pub struct PyRedactionResult { + redacted_text: String, + redaction_map: Vec, + operator_map: Vec, + entity_count: usize, +} + +#[pyclass(name = "PipelineEntity", get_all, 
skip_from_py_object)] +#[derive(Clone)] +pub struct PyPipelineEntity { + start: u32, + end: u32, + label: String, + text: String, + score: f64, + source: String, + source_detail: Option, +} + +#[pyclass(name = "StaticRedactionResult", get_all, skip_from_py_object)] +#[derive(Clone)] +pub struct PyStaticRedactionResult { + resolved_entities: Vec, + redaction: PyRedactionResult, +} + +#[pyclass(name = "PreparedSearch")] +pub struct PyPreparedSearch { + inner: CorePreparedSearch, + prepare_diagnostics: StaticRedactionDiagnostics, +} + +#[pymethods] +impl PyPreparedSearch { + #[new] + fn new(config_json: &str) -> PyResult { + let config = parse_core_prepared_search_config(config_json)?; + let result = CorePreparedSearch::new_with_diagnostics(config) + .map_err(|error| to_py_core_error(&error))?; + Ok(Self { + inner: result.prepared, + prepare_diagnostics: result.diagnostics, + }) + } + + #[staticmethod] + fn from_config_json_and_artifact_bytes( + config_json: &str, + artifact_bytes: &[u8], + ) -> PyResult { + let config = parse_core_prepared_search_config(config_json)?; + let artifacts = PreparedSearchArtifacts::from_bytes(artifact_bytes) + .map_err(|error| to_py_core_error(&error))?; + let result = + CorePreparedSearch::new_with_artifacts_diagnostics(config, &artifacts) + .map_err(|error| to_py_core_error(&error))?; + Ok(Self { + inner: result.prepared, + prepare_diagnostics: result.diagnostics, + }) + } + + #[staticmethod] + fn from_prepared_package_bytes(package_bytes: &[u8]) -> PyResult { + if prepared_search_package_has_core_payload(package_bytes) { + let package = prepared_search_core_package_view_from_bytes(package_bytes) + .map_err(|error| to_py_contract_error(&error))?; + let artifacts = + PreparedSearchArtifacts::from_bytes(package.artifacts.as_ref()) + .map_err(|error| to_py_core_error(&error))?; + let result = CorePreparedSearch::new_with_artifacts_diagnostics( + package.config, + &artifacts, + ) + .map_err(|error| to_py_core_error(&error))?; + return 
Ok(Self { + inner: result.prepared, + prepare_diagnostics: result.diagnostics, + }); + } + + let package = prepared_search_package_from_bytes(package_bytes) + .map_err(|error| to_py_contract_error(&error))?; + let config = prepared_search_config_from_binding(package.config) + .map_err(|error| to_py_contract_error(&error))?; + let artifacts = PreparedSearchArtifacts::from_bytes(&package.artifacts) + .map_err(|error| to_py_core_error(&error))?; + let result = + CorePreparedSearch::new_with_artifacts_diagnostics(config, &artifacts) + .map_err(|error| to_py_core_error(&error))?; + Ok(Self { + inner: result.prepared, + prepare_diagnostics: result.diagnostics, + }) + } + + fn prepare_diagnostics_json(&self) -> PyResult { + let diagnostics = + static_redaction_diagnostics_to_binding(self.prepare_diagnostics.clone()); + + serde_json::to_string(&diagnostics) + .map_err(|error| to_py_serde_error(&error)) + } + + fn warm_lazy_regex(&self) -> PyResult<()> { + self + .inner + .warm_lazy_regex() + .map_err(|error| to_py_core_error(&error)) + } + + fn redact_static_entities( + &self, + full_text: &str, + operators_json: Option<&str>, + ) -> PyResult { + let result = self.redact_static_entities_core(full_text, operators_json)?; + static_redaction_result_to_python_binding(result, full_text) + .map_err(|error| to_py_contract_error(&error)) + .map(to_py_static_redaction_result) + } + + fn redact_static_entities_json( + &self, + full_text: &str, + operators_json: Option<&str>, + ) -> PyResult { + let result = self.redact_static_entities_core(full_text, operators_json)?; + let result = static_redaction_result_to_utf16_binding(result, full_text) + .map_err(|error| to_py_contract_error(&error))?; + serde_json::to_string(&result).map_err(|error| to_py_serde_error(&error)) + } + + fn redact_static_entities_diagnostics_json( + &self, + full_text: &str, + operators_json: Option<&str>, + ) -> PyResult { + let operators = parse_operator_config(operators_json)?; + let mut result = self + .inner 
+ .redact_static_entities_with_diagnostics( + full_text, + &operator_config_from_binding(operators) + .map_err(|error| to_py_contract_error(&error))?, + ) + .map_err(|error| to_py_core_error(&error))?; + let mut diagnostics = self.prepare_diagnostics.clone(); + diagnostics.extend(result.diagnostics); + result.diagnostics = diagnostics; + let result = + static_redaction_diagnostic_result_to_utf16_binding(result, full_text) + .map_err(|error| to_py_contract_error(&error))?; + + serde_json::to_string(&result).map_err(|error| to_py_serde_error(&error)) + } +} + +impl PyPreparedSearch { + fn redact_static_entities_core( + &self, + full_text: &str, + operators_json: Option<&str>, + ) -> PyResult { + let operators = parse_operator_config(operators_json)?; + self + .inner + .redact_static_entities( + full_text, + &operator_config_from_binding(operators) + .map_err(|error| to_py_contract_error(&error))?, + ) + .map_err(|error| to_py_core_error(&error)) + } +} + +#[pyfunction] +fn redact_static_entities_json( + config_json: &str, + full_text: &str, + operators_json: Option<&str>, +) -> PyResult { + let prepared = PyPreparedSearch::new(config_json)?; + prepared.redact_static_entities_json(full_text, operators_json) +} + +#[pyfunction] +fn prepare_static_search_artifacts_bytes<'py>( + py: Python<'py>, + config_json: &str, +) -> PyResult> { + let config = parse_core_prepared_search_config(config_json)?; + let bytes = CorePreparedSearch::prepare_artifacts(config) + .and_then(|artifacts| artifacts.to_bytes()) + .map_err(|error| to_py_core_error(&error))?; + Ok(PyBytes::new(py, &bytes)) +} + +#[pyfunction] +fn prepare_static_search_package_bytes<'py>( + py: Python<'py>, + config_json: &str, +) -> PyResult> { + prepare_static_search_package_bytes_with(py, config_json, false) +} + +#[pyfunction] +fn prepare_static_search_compressed_package_bytes<'py>( + py: Python<'py>, + config_json: &str, +) -> PyResult> { + prepare_static_search_package_bytes_with(py, config_json, true) +} + +fn 
prepare_static_search_package_bytes_with<'py>( + py: Python<'py>, + config_json: &str, + compressed: bool, +) -> PyResult> { + let binding_config = parse_prepared_search_config(config_json)?; + let core_config = prepared_search_config_from_binding(binding_config) + .map_err(|error| to_py_contract_error(&error))?; + let artifacts = CorePreparedSearch::prepare_artifacts(core_config.clone()) + .and_then(|artifacts| artifacts.to_bytes()) + .map_err(|error| to_py_core_error(&error))?; + let package = if compressed { + prepared_search_core_package_to_compressed_bytes(&core_config, &artifacts) + } else { + prepared_search_core_package_to_bytes(&core_config, &artifacts) + }; + let bytes = package.map_err(|error| to_py_contract_error(&error))?; + Ok(PyBytes::new(py, &bytes)) +} + +#[pyfunction] +fn redact_static_entities_diagnostics_json( + config_json: &str, + full_text: &str, + operators_json: Option<&str>, +) -> PyResult { + let prepared = PyPreparedSearch::new(config_json)?; + prepared.redact_static_entities_diagnostics_json(full_text, operators_json) +} + +#[pyfunction] +fn normalize_for_search(text: &str) -> String { + stella_anonymize_core::normalize_for_search(text) +} + +#[pyfunction] +#[allow(clippy::missing_const_for_fn)] +fn native_package_version() -> &'static str { + env!("CARGO_PKG_VERSION") +} + +fn parse_prepared_search_config( + config_json: &str, +) -> PyResult { + serde_json::from_str(config_json).map_err(|error| to_py_serde_error(&error)) +} + +fn parse_core_prepared_search_config( + config_json: &str, +) -> PyResult { + prepared_search_config_from_binding(parse_prepared_search_config( + config_json, + )?) 
+ .map_err(|error| to_py_contract_error(&error)) +} + +fn parse_operator_config( + operators_json: Option<&str>, +) -> PyResult> { + operators_json + .map(serde_json::from_str::) + .transpose() + .map_err(|error| to_py_serde_error(&error)) +} + +fn static_redaction_result_to_python_binding( + result: StaticRedactionResult, + full_text: &str, +) -> std::result::Result { + let offsets = PythonOffsetMap::new(full_text)?; + let mut result = static_redaction_result_to_binding(result); + convert_pipeline_entity_offsets_to_python( + &mut result.resolved_entities, + &offsets, + )?; + Ok(result) +} + +fn convert_pipeline_entity_offsets_to_python( + entities: &mut [BindingPipelineEntity], + offsets: &PythonOffsetMap, +) -> std::result::Result<(), ContractError> { + for entity in entities { + entity.start = offsets.convert(entity.start)?; + entity.end = offsets.convert(entity.end)?; + } + Ok(()) +} + +struct PythonOffsetMap { + boundaries: Vec<(u32, u32)>, +} + +impl PythonOffsetMap { + fn new(text: &str) -> std::result::Result { + let mut boundaries = Vec::new(); + let mut code_point_offset = 0_u32; + boundaries.push((0, 0)); + + for (byte_start, ch) in text.char_indices() { + code_point_offset = + code_point_offset.checked_add(1).ok_or_else(|| { + ContractError::InvalidPreparedSearchPackage { + reason: String::from("Python offset exceeds u32 range"), + } + })?; + let byte_end = byte_start.saturating_add(ch.len_utf8()); + boundaries.push((u32_from_usize(byte_end)?, code_point_offset)); + } + + Ok(Self { boundaries }) + } + + fn convert(&self, offset: u32) -> std::result::Result { + self + .try_convert(offset) + .ok_or(ContractError::InvalidBindingOffset { offset }) + } + + fn try_convert(&self, offset: u32) -> Option { + let index = self + .boundaries + .binary_search_by_key(&offset, |(byte_offset, _)| *byte_offset) + .ok()?; + self + .boundaries + .get(index) + .map(|(_, code_point_offset)| *code_point_offset) + } +} + +fn u32_from_usize(value: usize) -> std::result::Result 
{ + u32::try_from(value).map_err(|_| { + ContractError::InvalidPreparedSearchPackage { + reason: format!("Offset exceeds u32 range: {value}"), + } + }) +} + +fn to_py_static_redaction_result( + result: BindingStaticRedactionResult, +) -> PyStaticRedactionResult { + PyStaticRedactionResult { + resolved_entities: result + .resolved_entities + .into_iter() + .map(to_py_pipeline_entity) + .collect(), + redaction: to_py_redaction_result(result.redaction), + } +} + +fn to_py_pipeline_entity(entity: BindingPipelineEntity) -> PyPipelineEntity { + PyPipelineEntity { + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: entity.source, + source_detail: entity.source_detail, + } +} + +fn to_py_redaction_result(result: BindingRedactionResult) -> PyRedactionResult { + PyRedactionResult { + redacted_text: result.redacted_text, + redaction_map: result + .redaction_map + .into_iter() + .map(to_py_redaction_entry) + .collect(), + operator_map: result + .operator_map + .into_iter() + .map(to_py_operator_entry) + .collect(), + entity_count: result.entity_count, + } +} + +fn to_py_redaction_entry(entry: BindingRedactionEntry) -> PyRedactionEntry { + PyRedactionEntry { + placeholder: entry.placeholder, + original: entry.original, + } +} + +fn to_py_operator_entry(entry: BindingOperatorEntry) -> PyOperatorEntry { + PyOperatorEntry { + placeholder: entry.placeholder, + operator: entry.operator, + } +} + +fn to_py_core_error(error: &stella_anonymize_core::Error) -> PyErr { + PyValueError::new_err(error.to_string()) +} + +fn to_py_contract_error(error: &ContractError) -> PyErr { + PyValueError::new_err(error.to_string()) +} + +fn to_py_serde_error(error: &serde_json::Error) -> PyErr { + PyValueError::new_err(error.to_string()) +} + +#[pymodule(gil_used = false)] +fn _native(module: &Bound<'_, PyModule>) -> PyResult<()> { + module.add_class::()?; + module.add_class::()?; + module.add_class::()?; + module.add_class::()?; + 
module.add_class::()?; + module.add_class::()?; + module + .add_function(wrap_pyfunction!(redact_static_entities_json, module)?)?; + module.add_function(wrap_pyfunction!( + prepare_static_search_artifacts_bytes, + module + )?)?; + module.add_function(wrap_pyfunction!( + prepare_static_search_package_bytes, + module + )?)?; + module.add_function(wrap_pyfunction!( + prepare_static_search_compressed_package_bytes, + module + )?)?; + module.add_function(wrap_pyfunction!( + redact_static_entities_diagnostics_json, + module + )?)?; + module.add_function(wrap_pyfunction!(normalize_for_search, module)?)?; + module.add_function(wrap_pyfunction!(native_package_version, module)?)?; + Ok(()) +} diff --git a/crates/anonymize-py/typecheck/sdk_usage.py b/crates/anonymize-py/typecheck/sdk_usage.py new file mode 100644 index 00000000..7fbf05f5 --- /dev/null +++ b/crates/anonymize-py/typecheck/sdk_usage.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +import stella_anonymize as anonymize + + +def redact_with_prepared_package(config_json: str, text: str) -> str: + package_bytes = anonymize.prepare_search_package(config_json) + prepared = anonymize.load_prepared_package(package_bytes) + result = prepared.redact_text(text) + return result.redaction.redacted_text + + +def redact_with_package_file(package_path: str, text: str) -> int: + prepared = anonymize.load_prepared_package_file(package_path) + result = prepared.redact_text(text, {"country": "redact"}) + return result.redaction.entity_count + + +def runtime_version() -> str: + return anonymize.native_package_version() + + +def redact_json(config_json: str, text: str) -> str: + return anonymize.redact_text_json( + config_json, + text, + {"country": "redact"}, + redact_string="***", + ) + + +def redact_object(config_json: str, text: str) -> int: + result = anonymize.redact_text( + config_json, + text, + {"country": "redact"}, + redact_string="***", + ) + return result.redaction.entity_count diff --git a/dylint.toml 
b/dylint.toml new file mode 100644 index 00000000..0966fea2 --- /dev/null +++ b/dylint.toml @@ -0,0 +1,4 @@ +[workspace.metadata.dylint] +libraries = [ + { git = "https://github.com/stella/tooling", rev = "fe7012b863ad2fcdd788a3fab759b31181bbf9c8", pattern = "rust-lints/*" }, +] diff --git a/package.json b/package.json index 12a69357..15869e9d 100644 --- a/package.json +++ b/package.json @@ -11,15 +11,23 @@ "scripts": { "build": "turbo run build", "typecheck": "turbo run typecheck", - "lint": "turbo run lint && bun run lint:oxlint", + "lint": "turbo run lint && bun run lint:oxlint && bun run check:brand-case", "lint:oxlint": "bun --bun oxlint -c oxlint.config.ts --deny-warnings --type-aware .", "lint:fix": "bun --bun oxlint -c oxlint.config.ts --type-aware --fix .", "format": "turbo run format && oxfmt . \"!packages/**\" \"!.ai/**\" \"!.agents/**\" \"!.claude/**\" \"!AGENTS.md\" \"!CLAUDE.md\" \"!GEMINI.md\"", "format:check": "turbo run format -- --check && oxfmt --check . \"!packages/**\" \"!.ai/**\" \"!.agents/**\" \"!.claude/**\" \"!AGENTS.md\" \"!CLAUDE.md\" \"!GEMINI.md\"", "test": "turbo run test", + "rust:fmt": "cargo ci-fmt", + "rust:lint": "cargo ci-clippy", + "rust:dylint": "cargo ci-dylint", + "rust:test": "cargo ci-test", + "rust:check": "bun run rust:fmt && bun run rust:lint && bun run rust:dylint && bun run rust:test", + "python:typecheck": "uvx --from ty==0.0.29 ty check --extra-search-path crates/anonymize-py/python crates/anonymize-py/typecheck", + "python:wheel": "node .github/tools/check-python-wheel.mjs", "sync:version": "node .github/tools/sync-runtime-version.mjs", "check:version": "node .github/tools/sync-runtime-version.mjs --check", "check:bun": "node .github/tools/check-bun-workflows.mjs", + "check:brand-case": "node .github/tools/check-brand-case.mjs", "secrets:check:staged": "bash scripts/check-staged-secrets.sh", "hooks:install": "lefthook install", "hooks:uninstall": "lefthook uninstall", @@ -32,7 +40,7 @@ "@stll/typescript-config": 
"^0.3.0", "lefthook": "^2.1.9", "oxfmt": "^0.54.0", - "oxlint": "^1.69.0", + "oxlint": "^1.70.0", "oxlint-tsgolint": "^0.23.0", "turbo": "^2.9.18" } diff --git a/packages/anonymize/.gitignore b/packages/anonymize/.gitignore index 896ad8ab..8f27af15 100644 --- a/packages/anonymize/.gitignore +++ b/packages/anonymize/.gitignore @@ -1 +1,2 @@ wasm/dist/ +*.node diff --git a/packages/anonymize/README.md b/packages/anonymize/README.md index f411f81d..89aa8349 100644 --- a/packages/anonymize/README.md +++ b/packages/anonymize/README.md @@ -1,5 +1,5 @@

- Stella anonymize + stella anonymize

# @stll/anonymize @@ -18,39 +18,63 @@ bun add @stll/anonymize-data For browser targets, install `@stll/anonymize-wasm` instead. It exposes the same runtime API through WebAssembly and is the supported entrypoint for Vite-based bundles. -## Usage +## Usage: Node.js native SDK ```ts -import { runPipeline } from "@stll/anonymize"; +import { getDefaultNativePipeline } from "@stll/anonymize/native-node"; -const entities = await runPipeline({ - fullText: text, - config: { - labels: [ - "person", - "organization", - "address", - "date", - "iban", - "phone number", - ], - threshold: 0.5, - enableRegex: true, - enableTriggerPhrases: true, - enableLegalForms: true, - enableNameCorpus: true, - enableDenyList: false, - enableGazetteer: false, - enableNer: false, - enableConfidenceBoost: true, - enableCoreference: true, - workspaceId: "default", - }, - gazetteerEntries: [], -}); +const anonymizer = getDefaultNativePipeline(); +const result = anonymizer.redact_text(text); + +console.log(result.redaction.redactedText); +``` + +Call `getDefaultNativePipeline()` once during service startup and reuse the returned anonymizer. The package ships with a prepared native package, so the normal request path avoids rebuilding search automata. Use `preloadDefaultNativePipeline()` or `preloadDefaultNativePipelineAsync()` when the first document should not pay lazy regex warm-up. + +If your deployment knows the document language up front, build scoped package artifacts and select them at startup: + +```bash +STELLA_ANONYMIZE_NATIVE_PACKAGE_LANGUAGES=en,cs bun run build +``` + +```ts +const anonymizer = getDefaultNativePipeline({ language: "en" }); +``` + +For build-time generated packages or caller-owned data, prepare the package before runtime and load the bytes in the process that handles documents. 
+ +```bash +bunx stella-anonymize-build-native-package \ + --config ./anonymize-native-config.mjs \ + --out ./dist/anonymize.stlanonpkg ``` -## Caller-owned deny lists and regexes +```ts +import { load_prepared_package_file } from "@stll/anonymize/native-node"; + +const anonymizer = load_prepared_package_file("./dist/anonymize.stlanonpkg"); +anonymizer.warmLazyRegex(); +const result = anonymizer.redact_text(text, { redactString: "***" }); +``` + +The config module may export a `PipelineConfig` directly or `{ config, gazetteerEntries }`. Include `@stll/anonymize-data` dictionaries there if your runtime config uses the deny-list or name-corpus layers; keep the corresponding layers enabled for caller-owned `customDenyList`, `customRegexes`, and gazetteers. Those inputs are part of the prepared package and should be regenerated when they change. + +## Python SDK + +```py +import stella_anonymize as anonymize + +package_bytes = anonymize.prepare_search_package(config_json) +prepared = anonymize.load_prepared_package(package_bytes) +prepared.warm_lazy_regex() +result = prepared.redact_text(text, redact_string="***") + +print(result.redaction.redacted_text) +``` + +The Python SDK uses the same Rust core and prepared-package contract as the Node SDK. Prefer `load_prepared_package()` or `load_prepared_package_file()` for repeated calls; top-level `redact_text()` and `redact_text_json()` prepare from config on each call. + +## Caller-Owned Deny Lists and Regexes Use `customDenyList` for exact terms and variants that you control. These are matched by the deny-list layer, so keep `enableDenyList: true`. @@ -92,6 +116,20 @@ const entities = await runPipeline({ }); ``` +## TypeScript Pipeline Compatibility + +The async TypeScript pipeline remains available for compatibility and for browser/WASM builds. 
+ +```ts +import { runPipeline } from "@stll/anonymize"; + +const entities = await runPipeline({ + fullText: text, + config, + gazetteerEntries: [], +}); +``` + ## Browser setup If you use Vite with the WASM build, exclude the bundle from dependency pre-bundling: diff --git a/packages/anonymize/index.cjs b/packages/anonymize/index.cjs new file mode 100644 index 00000000..9636dfab --- /dev/null +++ b/packages/anonymize/index.cjs @@ -0,0 +1,3 @@ +"use strict"; + +module.exports = require("./stella_anonymize_napi.node"); diff --git a/packages/anonymize/package.json b/packages/anonymize/package.json index 3e90db75..c2039652 100644 --- a/packages/anonymize/package.json +++ b/packages/anonymize/package.json @@ -13,11 +13,30 @@ "types": "./dist/constants.d.mts", "import": "./dist/constants.mjs", "default": "./dist/constants.mjs" + }, + "./native": { + "types": "./dist/native.d.mts", + "import": "./dist/native.mjs", + "default": "./dist/native.mjs" + }, + "./native-node": { + "types": "./dist/native-node.d.mts", + "import": "./dist/native-node.mjs", + "default": "./dist/native-node.mjs" } }, "types": "dist/index.d.mts", + "bin": { + "stella-anonymize-build-native-package": "./scripts/build-native-pipeline-package.mjs" + }, "files": [ - "dist" + "ATTRIBUTION.md", + "dist", + "index.cjs", + "*.node", + "native-pipeline.stlanonpkg", + "native-pipeline.*.stlanonpkg", + "scripts/build-native-pipeline-package.mjs" ], "publishConfig": { "access": "public" @@ -28,19 +47,22 @@ }, "license": "MIT", "scripts": { - "build": "tsdown", + "build": "tsdown && bun scripts/build-native-node.mjs", "prepublishOnly": "bun run build", "typecheck": "tsc --noEmit -p tsconfig.json && tsc --noEmit -p tsconfig.test.json && tsc --noEmit -p tsconfig.wasm.json", "test": "bun test --preload ./src/__test__/setup.ts --timeout 15000", "test:fast": "bun run test src/__test__/*.test.ts", "perf:contracts": "bun scripts/contract-perf.mjs", + "perf:migration-fixtures": "bun scripts/migration-fixture-perf.mjs", 
+ "perf:native-adapters": "bun scripts/native-adapter-perf.mjs", + "perf:native-package": "bun scripts/native-package-ux-perf.mjs", "smoke:dist": "bun scripts/dist-smoke.mjs", "format": "oxfmt ." }, "dependencies": { "@huggingface/tokenizers": "^0.1.3", "@stll/stdnum": "^2.1.1", - "@stll/text-search": "^1.0.6" + "@stll/text-search": "^1.0.7" }, "peerDependencies": { "@stll/anonymize-data": "^0.0.6" @@ -52,10 +74,10 @@ }, "devDependencies": { "@stll/anonymize-data": "workspace:*", - "@stll/text-search-wasm": "^1.0.6", + "@stll/text-search-wasm": "^1.0.7", "bun-types": "^1.3.14", "fast-check": "^4.8.0", - "tsdown": "^0.22.2", + "tsdown": "^0.22.3", "typescript": "^6.0.3", "vite": "^8.0.16" } diff --git a/packages/anonymize/scripts/build-native-node.mjs b/packages/anonymize/scripts/build-native-node.mjs new file mode 100644 index 00000000..b68bb82f --- /dev/null +++ b/packages/anonymize/scripts/build-native-node.mjs @@ -0,0 +1,91 @@ +import { execFileSync } from "node:child_process"; +import { copyFileSync, existsSync } from "node:fs"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +const packageRoot = dirname(dirname(fileURLToPath(import.meta.url))); +const repoRoot = dirname(dirname(packageRoot)); +const scopedPackageLanguages = languageListFromEnv( + process.env.STELLA_ANONYMIZE_NATIVE_PACKAGE_LANGUAGES, +); + +const sourceByPlatform = { + darwin: "libstella_anonymize_napi.dylib", + linux: "libstella_anonymize_napi.so", + win32: "stella_anonymize_napi.dll", +}; + +const sourceName = sourceByPlatform[process.platform]; +if (!sourceName) { + throw new Error(`Unsupported native build platform: ${process.platform}`); +} + +execFileSync( + "cargo", + ["build", "-p", "stella-anonymize-napi", "--release", "--locked"], + { + cwd: repoRoot, + stdio: "inherit", + }, +); + +const source = join(repoRoot, "target", "release", sourceName); +if (!existsSync(source)) { + throw new Error(`Native build output is missing: ${source}`); +} + 
+copyFileSync(source, join(packageRoot, "stella_anonymize_napi.node")); + +buildNativePipelinePackage([ + "--out", + join(packageRoot, "native-pipeline.stlanonpkg"), + "--default-dictionaries", +]); + +for (const language of scopedPackageLanguages) { + buildNativePipelinePackage([ + "--out", + join(packageRoot, `native-pipeline.${language}.stlanonpkg`), + "--default-dictionaries", + "--language", + language, + ]); +} + +function buildNativePipelinePackage(args) { + execFileSync( + process.execPath, + [ + join(packageRoot, "scripts", "build-native-pipeline-package.mjs"), + ...args, + ], + { + cwd: packageRoot, + stdio: "inherit", + }, + ); +} + +function languageListFromEnv(value) { + if (value === undefined || value.trim().length === 0) { + return []; + } + const languages = value + .split(",") + .map((entry) => normalizeLanguage(entry)) + .filter((entry, index, entries) => entries.indexOf(entry) === index); + if (languages.length === 0) { + throw new Error("STELLA_ANONYMIZE_NATIVE_PACKAGE_LANGUAGES is empty"); + } + return languages; +} + +function normalizeLanguage(value) { + const language = value.trim().toLowerCase(); + if (!/^[a-z0-9]+(?:-[a-z0-9]+)*$/u.test(language)) { + throw new Error( + `Invalid STELLA_ANONYMIZE_NATIVE_PACKAGE_LANGUAGES entry: ${value}`, + ); + } + return language; +} diff --git a/packages/anonymize/scripts/build-native-pipeline-package.mjs b/packages/anonymize/scripts/build-native-pipeline-package.mjs new file mode 100755 index 00000000..0c6d6270 --- /dev/null +++ b/packages/anonymize/scripts/build-native-pipeline-package.mjs @@ -0,0 +1,219 @@ +#!/usr/bin/env node +import { mkdirSync, writeFileSync } from "node:fs"; +import { dirname, resolve } from "node:path"; +import { pathToFileURL } from "node:url"; + +import { + createPipelineContext, + DEFAULT_NATIVE_PIPELINE_CONFIG, + prepareNativePipelinePackage, +} from "../dist/index.mjs"; +import { loadNativeAnonymizeBinding } from "../dist/native-node.mjs"; + +const args = 
parseArgs(process.argv.slice(2)); +const outputPath = resolve(args.out ?? "native-pipeline.stlanonpkg"); +const compressed = args.raw !== true; +const { config, gazetteerEntries } = await loadPackageInput(args); +const binding = loadNativeAnonymizeBinding(); +const packageBytes = await prepareNativePipelinePackage({ + binding, + config, + gazetteerEntries, + context: createPipelineContext(), + compressed, +}); + +mkdirSync(dirname(outputPath), { recursive: true }); +writeFileSync(outputPath, packageBytes); + +console.log( + JSON.stringify({ + event: "native-pipeline-package", + outputPath, + bytes: packageBytes.byteLength, + compressed, + nativeVersion: binding.nativePackageVersion(), + }), +); + +function parseArgs(values) { + const result = {}; + for (let index = 0; index < values.length; index += 1) { + const value = values[index]; + switch (value) { + case "--config": { + result.config = requiredValue(values, index, value); + index += 1; + break; + } + case "--export": { + result.exportName = requiredValue(values, index, value); + index += 1; + break; + } + case "--out": { + result.out = requiredValue(values, index, value); + index += 1; + break; + } + case "--raw": { + result.raw = true; + break; + } + case "--default-dictionaries": { + result.defaultDictionaries = true; + break; + } + case "--language": { + result.language = requiredValue(values, index, value); + index += 1; + break; + } + case "--languages": { + result.languages = requiredValue(values, index, value); + index += 1; + break; + } + case "--help": { + printHelp(); + process.exit(0); + } + default: + throw new Error(`Unknown option: ${value}`); + } + } + return result; +} + +function requiredValue(values, index, option) { + const value = values[index + 1]; + if (value === undefined || value.startsWith("--")) { + throw new Error(`${option} requires a value`); + } + return value; +} + +async function loadPackageInput(options) { + const input = await loadBasePackageInput(options); + const 
withDictionaries = + !options.defaultDictionaries || input.config.dictionaries !== undefined + ? input + : { + ...input, + config: { + ...input.config, + dictionaries: await loadDefaultDictionaries(), + }, + }; + return { + ...withDictionaries, + config: applyCliLanguageScope(withDictionaries.config, options), + }; +} + +async function loadBasePackageInput(options) { + if (!options.config) { + return { config: defaultNativePipelineConfig(), gazetteerEntries: [] }; + } + const moduleUrl = pathToFileURL(resolve(options.config)).href; + // eslint-disable-next-line stll/no-dynamic-import-specifier + const loaded = await import(moduleUrl); + const exportName = options.exportName ?? "default"; + const candidate = + exportName === "default" ? loaded.default : loaded[exportName]; + if (candidate === undefined) { + throw new Error(`Config module does not export ${exportName}`); + } + const value = + typeof candidate === "function" ? await candidate() : await candidate; + if (!value || typeof value !== "object") { + throw new TypeError("Native package config export must be an object"); + } + if ("config" in value) { + return { + config: value.config, + gazetteerEntries: value.gazetteerEntries ?? 
[], + }; + } + return { config: value, gazetteerEntries: [] }; +} + +function defaultNativePipelineConfig() { + return { + ...DEFAULT_NATIVE_PIPELINE_CONFIG, + labels: [...DEFAULT_NATIVE_PIPELINE_CONFIG.labels], + workspaceId: "native-pipeline-package", + }; +} + +function applyCliLanguageScope(pipelineConfig, options) { + if (options.language !== undefined && options.languages !== undefined) { + throw new Error("Use either --language or --languages, not both"); + } + if (options.language !== undefined) { + const language = normalizeLanguageOption(options.language, "--language"); + return { ...pipelineConfig, language, languages: undefined }; + } + if (options.languages === undefined) { + return pipelineConfig; + } + const languages = normalizeLanguageList(options.languages); + return { ...pipelineConfig, language: undefined, languages }; +} + +function normalizeLanguageOption(value, option) { + const language = value.trim().toLowerCase(); + if (language.length === 0) { + throw new Error(`${option} requires a non-empty language code`); + } + return language; +} + +function normalizeLanguageList(value) { + const languages = value + .split(",") + .map((entry) => normalizeLanguageOption(entry, "--languages")) + .filter((entry, index, entries) => entries.indexOf(entry) === index); + if (languages.length === 0) { + throw new Error("--languages requires at least one language code"); + } + return languages; +} + +async function loadDefaultDictionaries() { + let loaded; + try { + loaded = await import("@stll/anonymize-data/dictionaries"); + } catch (error) { + throw new Error( + `--default-dictionaries requires @stll/anonymize-data: ${formatError(error)}`, + ); + } + if (typeof loaded.loadDictionaryBundle !== "function") { + throw new TypeError( + "@stll/anonymize-data/dictionaries does not export loadDictionaryBundle", + ); + } + return loaded.loadDictionaryBundle(); +} + +function formatError(error) { + if (error instanceof Error) { + return error.message; + } + return 
String(error); +} + +function printHelp() { + console.log(`Usage: build-native-pipeline-package [options] + +Options: + --out Output package path. Defaults to native-pipeline.stlanonpkg. + --config ESM module exporting a PipelineConfig or { config, gazetteerEntries }. + --export Export name to read from the config module. Defaults to default. + --language Build a package scoped to one content language. + --languages Build a package scoped to comma-separated content languages. + --default-dictionaries Load @stll/anonymize-data into configs that do not provide dictionaries. + --raw Write an uncompressed package. +`); +} diff --git a/packages/anonymize/scripts/dist-smoke.mjs b/packages/anonymize/scripts/dist-smoke.mjs index 1beadc68..833c1ba1 100644 --- a/packages/anonymize/scripts/dist-smoke.mjs +++ b/packages/anonymize/scripts/dist-smoke.mjs @@ -10,6 +10,27 @@ * Run after `bun run build`: `bun run smoke:dist`. */ import { createPipelineContext, runPipeline } from "../dist/index.mjs"; +import { createNativeAnonymizerFromPackage } from "../dist/native.mjs"; +import { + createNativePipelineFromDefaultPackage, + createNativePipelineFromPackageFile, + loadNativeAnonymizeBinding, +} from "../dist/native-node.mjs"; + +if (typeof createNativeAnonymizerFromPackage !== "function") { + throw new TypeError("dist native entrypoint is missing its package loader"); +} +if (typeof loadNativeAnonymizeBinding !== "function") { + throw new TypeError("dist native-node entrypoint is missing its loader"); +} +if (typeof createNativePipelineFromPackageFile !== "function") { + throw new TypeError("dist native-node entrypoint is missing file loading"); +} +if (typeof createNativePipelineFromDefaultPackage !== "function") { + throw new TypeError( + "dist native-node entrypoint is missing default package loading", + ); +} const warnings = []; const originalWarn = console.warn; @@ -62,6 +83,19 @@ if (!person) { ); } +const nativePipeline = createNativePipelineFromDefaultPackage(); +const 
nativeResult = nativePipeline.redactText( + "A contract was signed by Jan Novak at Praha on 1. 1. 2025.", +); +if (nativeResult.resolvedEntities.length === 0) { + throw new Error("default native pipeline package did not detect any entity"); +} + console.log( - JSON.stringify({ event: "dist-smoke", ok: true, detected: person.text }), + JSON.stringify({ + event: "dist-smoke", + ok: true, + detected: person.text, + nativeEntityCount: nativeResult.resolvedEntities.length, + }), ); diff --git a/packages/anonymize/scripts/migration-fixture-perf.mjs b/packages/anonymize/scripts/migration-fixture-perf.mjs new file mode 100644 index 00000000..be634901 --- /dev/null +++ b/packages/anonymize/scripts/migration-fixture-perf.mjs @@ -0,0 +1,2371 @@ +import { spawnSync } from "node:child_process"; +import { createRequire } from "node:module"; +import { + copyFileSync, + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + readdirSync, + rmSync, + symlinkSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join, relative, resolve } from "node:path"; +import { fileURLToPath, pathToFileURL } from "node:url"; + +const SCRIPT_PATH = fileURLToPath(import.meta.url); +const ROOT_DIR = resolve(join(import.meta.dir, "..", "..", "..")); +const PACKAGE_DIR = join(ROOT_DIR, "packages", "anonymize"); +const FIXTURES_DIR = join( + PACKAGE_DIR, + "src", + "__test__", + "fixtures", + "contracts", +); +const BASELINE_REF = + process.env.ANONYMIZE_MIGRATION_BASELINE_REF ?? "origin/main"; +const WORKING_TREE_BASELINE_REF = "working-tree"; +const COMPARE_BASELINE = + process.env.ANONYMIZE_MIGRATION_COMPARE_BASELINE !== "0"; +const REQUIRE_NATIVE_PIPELINE = + process.env.ANONYMIZE_MIGRATION_REQUIRE_NATIVE_PIPELINE === "1"; +const CANDIDATE_RUNTIME = + process.env.ANONYMIZE_MIGRATION_CANDIDATE_RUNTIME ?? "typescript"; +const FAIL_ON_MISMATCH = + process.env.ANONYMIZE_MIGRATION_FAIL_ON_MISMATCH ?? + (CANDIDATE_RUNTIME === "typescript" ? 
"1" : "0"); +const ALLOW_ACCEPTED_MISMATCHES = + process.env.ANONYMIZE_MIGRATION_ALLOW_ACCEPTED_MISMATCHES === "1"; +const WARM_ITERATIONS = positiveIntegerEnv( + "ANONYMIZE_MIGRATION_WARM_ITERATIONS", + 2, +); +const CACHED_PREPARE_ITERATIONS = positiveIntegerEnv( + "ANONYMIZE_MIGRATION_CACHED_PREPARE_ITERATIONS", + 3, +); +const PROFILE_REGEX_LABELS = + process.env.ANONYMIZE_MIGRATION_PROFILE_REGEX_LABELS === "1"; +const PROFILE_SCOPED_PREPARE = + process.env.ANONYMIZE_MIGRATION_PROFILE_SCOPED_PREPARE === "1"; +const NATIVE_PREPARED_PACKAGE = + process.env.ANONYMIZE_MIGRATION_NATIVE_PREPARED_PACKAGE === "1"; +const NATIVE_COMPRESSED_PACKAGE = + process.env.ANONYMIZE_MIGRATION_NATIVE_COMPRESSED_PACKAGE === "1"; +const NATIVE_PREPARED_ARTIFACTS = + !NATIVE_PREPARED_PACKAGE && + process.env.ANONYMIZE_MIGRATION_NATIVE_PREPARED_ARTIFACTS === "1"; +const FIXTURE_LANGUAGE_FILTER = stringListEnv( + "ANONYMIZE_MIGRATION_FIXTURE_LANGUAGES", +); +const CONTENT_LANGUAGE = + process.env.ANONYMIZE_MIGRATION_CONTENT_LANGUAGE?.trim() ?? ""; +const NATIVE_CONFIG_PATH = + process.env.ANONYMIZE_MIGRATION_NATIVE_CONFIG_PATH?.trim() ?? ""; +const WRITE_NATIVE_CONFIG_PATH = + process.env.ANONYMIZE_MIGRATION_WRITE_NATIVE_CONFIG_PATH?.trim() ?? ""; +const NATIVE_PACKAGE_PATH = + process.env.ANONYMIZE_MIGRATION_NATIVE_PACKAGE_PATH?.trim() ?? ""; +const WRITE_NATIVE_PACKAGE_PATH = + process.env.ANONYMIZE_MIGRATION_WRITE_NATIVE_PACKAGE_PATH?.trim() ?? ""; +const USER_DATA_SCENARIO = + process.env.ANONYMIZE_MIGRATION_USER_DATA_SCENARIO?.trim() ?? 
"none"; + +const INTENTIONAL_NATIVE_STATIC_IMPROVEMENTS = new Map( + [ + { + fixture: "cs/asset-transfer-court-declensions.txt", + reason: "wider-address-span", + candidateExtra: [ + { start: 445, end: 485, label: "address", source: "regex" }, + ], + candidateMissing: [ + { start: 471, end: 485, label: "address", source: "deny-list" }, + ], + }, + { + fixture: "cs/nakit-legal-services-framework.txt", + reason: "role-heading-not-person", + candidateExtra: [], + candidateMissing: [ + { start: 49384, end: 49395, label: "person", source: "trigger" }, + ], + }, + { + fixture: "cs/vinci-donation-agreement.txt", + reason: "party-organization-retained", + candidateExtra: [ + { start: 542, end: 585, label: "organization", source: "deny-list" }, + { + start: 3226, + end: 3247, + label: "organization", + source: "coreference", + }, + ], + candidateMissing: [], + }, + { + fixture: "en/software-license-agreement.txt", + reason: "wider-notice-address-spans", + candidateExtra: [ + { start: 506, end: 541, label: "address", source: "regex" }, + { start: 1624, end: 1664, label: "address", source: "regex" }, + { start: 1813, end: 1848, label: "address", source: "regex" }, + { start: 1857, end: 1871, label: "phone number", source: "regex" }, + ], + candidateMissing: [ + { start: 515, end: 531, label: "address", source: "deny-list" }, + { start: 1629, end: 1654, label: "address", source: "deny-list" }, + { start: 1822, end: 1838, label: "address", source: "deny-list" }, + { start: 1858, end: 1871, label: "phone number", source: "trigger" }, + ], + }, + ].map((entry) => [entry.fixture, entry]), +); + +if (process.env.ANONYMIZE_MIGRATION_WORKER === "1") { + await runWorker(); +} else { + await runCoordinator(); +} + +async function runCoordinator() { + const fixtures = discoverFixtures(FIXTURES_DIR).filter((fixture) => + FIXTURE_LANGUAGE_FILTER.length === 0 + ? 
true + : FIXTURE_LANGUAGE_FILTER.includes(fixtureLanguage(fixture)), + ); + if (fixtures.length === 0) { + throw new Error(`No contract fixtures found in ${FIXTURES_DIR}`); + } + + const tempRoot = mkdtempSync(join(tmpdir(), "stella-anonymize-migration-")); + + try { + let baseline = null; + if (COMPARE_BASELINE) { + const baselineRoot = + BASELINE_REF === WORKING_TREE_BASELINE_REF + ? ROOT_DIR + : materializeBaselineRef(BASELINE_REF, tempRoot); + baseline = runVariant({ + name: `baseline:${BASELINE_REF}`, + sourceRoot: baselineRoot, + fixtures, + tempRoot, + }); + printVariantSummary(baseline); + } + + const candidate = runVariant({ + name: "candidate", + sourceRoot: ROOT_DIR, + fixtures, + tempRoot, + runtime: CANDIDATE_RUNTIME, + }); + printVariantSummary(candidate); + + if ( + REQUIRE_NATIVE_PIPELINE && + !candidate.nativeRewrite.measuredInPipeline + ) { + throw new Error( + "Native pipeline is required, but the candidate run used the TypeScript pipeline", + ); + } + + if (baseline !== null) { + const comparison = compareSnapshots(baseline, candidate); + console.log(JSON.stringify(comparisonForLog(comparison))); + const intentionalByPolicy = + ALLOW_ACCEPTED_MISMATCHES && comparison.intentionalEqual; + if ( + !comparison.equal && + !intentionalByPolicy && + FAIL_ON_MISMATCH !== "0" + ) { + throw new Error( + `Fixture parity failed for ${comparison.mismatches.length} fixture(s)`, + ); + } + } + } finally { + rmSync(tempRoot, { force: true, recursive: true }); + } +} + +function runVariant({ + name, + sourceRoot, + fixtures, + tempRoot, + runtime = "typescript", +}) { + validateRuntime(runtime); + if (runtime === "native-static") { + ensureNativeAdapterBuilt(); + } + + const resultPath = join( + tempRoot, + `${name.replaceAll(/[^a-zA-Z0-9_.-]/g, "_")}.json`, + ); + const child = spawnSync(process.execPath, [SCRIPT_PATH], { + cwd: ROOT_DIR, + env: { + ...process.env, + ANONYMIZE_MIGRATION_WORKER: "1", + ANONYMIZE_MIGRATION_SOURCE_ROOT: sourceRoot, + 
/**
 * Worker entry point: runs the fixture sweep for one variant and writes the
 * measurements and snapshots to the result path as a single JSON line.
 *
 * Inputs come from the ANONYMIZE_MIGRATION_* environment variables (source
 * root, variant name, runtime, result path, JSON-encoded fixture list). The
 * search setup is obtained in one of three ways:
 *   - prebuilt native package: bytes are read from NATIVE_PACKAGE_PATH;
 *   - prebuilt native config: JSON is read from NATIVE_CONFIG_PATH;
 *   - in-process build: the source modules are imported from `sourceRoot`,
 *     dictionaries are loaded and the search is prepared here.
 *
 * In both prebuilt modes `config` stays null, and in prebuilt-package mode
 * `search` stays null as well, so optional profiling steps that need either
 * of them are skipped instead of dereferencing null.
 */
async function runWorker() {
  const sourceRoot = requiredEnv("ANONYMIZE_MIGRATION_SOURCE_ROOT");
  const variant = requiredEnv("ANONYMIZE_MIGRATION_VARIANT");
  const runtime = requiredEnv("ANONYMIZE_MIGRATION_RUNTIME");
  const resultPath = requiredEnv("ANONYMIZE_MIGRATION_RESULT_PATH");
  const fixtures = JSON.parse(requiredEnv("ANONYMIZE_MIGRATION_FIXTURES"));
  validateRuntime(runtime);
  // A prebuilt package takes precedence over a prebuilt config.
  const usePrebuiltNativePackage =
    runtime === "native-static" && NATIVE_PACKAGE_PATH.length > 0;
  const usePrebuiltNativeConfig =
    !usePrebuiltNativePackage &&
    runtime === "native-static" &&
    NATIVE_CONFIG_PATH.length > 0;

  let indexModule = null;
  let config = null;
  let context = null;
  let search = null;
  let nativeConfigBytes = null;
  let nativePackageBuffer = null;
  let importMs = 0;
  let dictionaryMs = 0;
  let prepareMs = 0;
  let nativeConfigReadMs = 0;
  let nativeConfigParseMs = 0;
  let nativePackageReadMs = 0;
  let nativePackageCompressed = NATIVE_COMPRESSED_PACKAGE;

  if (usePrebuiltNativePackage) {
    const packageReadStart = Bun.nanoseconds();
    nativePackageBuffer = readFileSync(NATIVE_PACKAGE_PATH);
    nativePackageReadMs = elapsedMs(packageReadStart);
    nativePackageCompressed = isCompressedNativePackage(nativePackageBuffer);
  } else if (usePrebuiltNativeConfig) {
    const configReadStart = Bun.nanoseconds();
    nativeConfigBytes = readFileSync(NATIVE_CONFIG_PATH);
    nativeConfigReadMs = elapsedMs(configReadStart);
    const configParseStart = Bun.nanoseconds();
    search = {
      nativeStaticConfig: JSON.parse(nativeConfigBytes.toString("utf8")),
    };
    nativeConfigParseMs = elapsedMs(configParseStart);
  } else {
    const importStart = Bun.nanoseconds();
    const [loadedIndexModule, configModule, dictionaryModule] =
      await Promise.all([
        importSource(sourceRoot, "packages/anonymize/src/index.ts", variant),
        importSource(
          sourceRoot,
          "packages/anonymize/src/__test__/contract-config.ts",
          variant,
        ),
        importSource(
          sourceRoot,
          "packages/anonymize/src/__test__/load-dictionaries.ts",
          variant,
        ),
      ]);
    indexModule = loadedIndexModule;
    importMs = elapsedMs(importStart);

    const scope = contentLanguageScope();
    const dictionaryStart = Bun.nanoseconds();
    const dictionaries = await dictionaryModule.loadTestDictionaries(scope);
    dictionaryMs = elapsedMs(dictionaryStart);

    config = {
      ...configModule.contractTestConfig(`migration-fixtures-${variant}`),
      ...scope,
      dictionaries,
    };
    config = applyUserDataScenario(config);
    context = indexModule.createPipelineContext();

    const prepareStart = Bun.nanoseconds();
    search =
      runtime === "native-static"
        ? await prepareNativeStaticSearch({
            sourceRoot,
            variant,
            config,
            context,
          })
        : await indexModule.preparePipelineSearch({ config, context });
    prepareMs = elapsedMs(prepareStart);
    // Optionally persist the freshly built native config for later runs.
    if (
      runtime === "native-static" &&
      WRITE_NATIVE_CONFIG_PATH.length > 0 &&
      search.nativeStaticConfig
    ) {
      writeFileSync(
        WRITE_NATIVE_CONFIG_PATH,
        JSON.stringify(search.nativeStaticConfig),
      );
    }
  }

  let nativeRewrite;
  if (usePrebuiltNativePackage) {
    nativeRewrite = describeNativeRewriteFromNativePackage(runtime);
  } else if (usePrebuiltNativeConfig && search.nativeStaticConfig) {
    nativeRewrite = describeNativeRewriteFromNativeConfig(
      search.nativeStaticConfig,
      runtime,
    );
  } else {
    nativeRewrite = describeNativeRewrite(config, search, runtime);
  }

  // Pick the native runner that matches how the search setup was obtained;
  // it stays null for the TypeScript runtime.
  let runtimeRunner = null;
  if (runtime === "native-static" && nativePackageBuffer !== null) {
    runtimeRunner =
      createNativeStaticRunnerFromPackageBytes(nativePackageBuffer);
  } else if (runtime === "native-static" && nativeConfigBytes === null) {
    runtimeRunner = createNativeStaticRunner(search.nativeStaticConfig);
  } else if (runtime === "native-static") {
    runtimeRunner = createNativeStaticRunnerFromJsonBytes(nativeConfigBytes);
  }
  const nativePrepareMs = runtimeRunner?.prepareMs ?? 0;
  const nativeStringifyMs = runtimeRunner?.stringifyMs ?? 0;
  const nativeArtifactPrepareMs = runtimeRunner?.artifactPrepareMs ?? 0;
  const nativeArtifactBytes = runtimeRunner?.artifactBytes ?? 0;
  const nativePackagePrepareMs = runtimeRunner?.packagePrepareMs ?? 0;
  const nativePackageBytes = runtimeRunner?.packageBytes ?? 0;
  const nativeWarmPrepareMs = runtimeRunner?.warmPrepareMs ?? 0;
  const nativeCachedPrepareMsByIteration =
    runtimeRunner?.cachedPrepareMsByIteration ?? [];
  const nativeCachedWarmPrepareMsByIteration =
    runtimeRunner?.cachedWarmPrepareMsByIteration ?? [];
  const nativeCachedPrepareAvgMs =
    nativeCachedPrepareMsByIteration.length === 0
      ? 0
      : roundMs(
          nativeCachedPrepareMsByIteration.reduce(
            (sum, value) => sum + value,
            0,
          ) / nativeCachedPrepareMsByIteration.length,
        );
  const nativeCachedWarmPrepareAvgMs =
    nativeCachedWarmPrepareMsByIteration.length === 0
      ? 0
      : roundMs(
          nativeCachedWarmPrepareMsByIteration.reduce(
            (sum, value) => sum + value,
            0,
          ) / nativeCachedWarmPrepareMsByIteration.length,
        );

  // The first sweep is the cold run; its snapshots are the ones reported.
  const coldRun =
    runtimeRunner === null
      ? await runTypeScriptFixtureSweep({
          indexModule,
          config,
          context,
          fixtures,
        })
      : runNativeStaticFixtureSweep({ runner: runtimeRunner, fixtures });

  const warmRuns = [];
  for (let index = 0; index < WARM_ITERATIONS; index += 1) {
    warmRuns.push(
      runtimeRunner === null
        ? await runTypeScriptFixtureSweep({
            indexModule,
            config,
            context,
            fixtures,
          })
        : runNativeStaticFixtureSweep({ runner: runtimeRunner, fixtures }),
    );
  }

  const warmRunMs = roundMs(warmRuns.reduce((sum, run) => sum + run.ms, 0));
  const warmAvgMs =
    WARM_ITERATIONS === 0 ? 0 : roundMs(warmRunMs / WARM_ITERATIONS);
  const fixtureTimings = summarizeFixtureTimings(coldRun, warmRuns);
  const nativeDiagnostics =
    runtimeRunner === null
      ? null
      : collectNativeDiagnostics({ runner: runtimeRunner, fixtures });
  // Per-label regex profiling reads the static config from `search`, which
  // is null in prebuilt-package mode; skip it there instead of crashing.
  if (nativeDiagnostics !== null && PROFILE_REGEX_LABELS && search !== null) {
    nativeDiagnostics.regexPrepareByLabel = profileNativeRegexPrepare(
      search.nativeStaticConfig,
    );
  }
  // Scoped-prepare profiling is handed `config` as its base config, and both
  // prebuilt modes leave `config` null, so it only runs for in-process builds.
  if (
    nativeDiagnostics !== null &&
    PROFILE_SCOPED_PREPARE &&
    !usePrebuiltNativeConfig &&
    !usePrebuiltNativePackage
  ) {
    nativeDiagnostics.scopedPrepare = await profileScopedNativePrepare({
      sourceRoot,
      variant,
      baseConfig: config,
      fixtures,
    });
  }
  const snapshots = Object.fromEntries(
    coldRun.fixtures.map((fixture) => [fixture.fixture, fixture.snapshot]),
  );
  const nativeTimingScenario = describeNativeTimingScenario({
    runtime,
    usePrebuiltNativePackage,
    usePrebuiltNativeConfig,
    nativePackageCompressed,
    nativePackageBytes,
    nativePackageReadMs,
    nativePackagePrepareMs,
    nativePrepareMs,
    nativeWarmPrepareMs,
    nativeCachedPrepareAvgMs,
    nativeCachedWarmPrepareAvgMs,
    coldRunMs: coldRun.ms,
    warmAvgMs,
  });

  writeFileSync(
    resultPath,
    `${JSON.stringify({
      event: "fixture-migration-variant",
      variant,
      pipelineRuntime: runtime,
      nativeRewrite,
      fixtureCount: fixtures.length,
      warmIterations: WARM_ITERATIONS,
      timings: {
        importMs,
        dictionaryMs,
        prepareMs,
        nativeConfigReadMs,
        nativeConfigParseMs,
        nativePackageReadMs,
        nativeStringifyMs,
        nativeArtifactPrepareMs,
        nativeArtifactBytes,
        nativePackageCompressed,
        nativePackagePrepareMs,
        nativePackageBytes,
        nativePrepareMs,
        nativeWarmPrepareMs,
        nativeCachedPrepareMsByIteration,
        nativeCachedWarmPrepareMsByIteration,
        nativeCachedPrepareAvgMs,
        nativeCachedWarmPrepareAvgMs,
        nativeFirstTouchMs: nativeTimingScenario.firstTouchMs,
        nativeWarmClickMs: nativeTimingScenario.warmClickMs,
        coldRunMs: coldRun.ms,
        // Everything except module import time.
        coldPipelineMs: roundMs(
          dictionaryMs +
            prepareMs +
            nativeConfigReadMs +
            nativeConfigParseMs +
            nativePackageReadMs +
            nativeStringifyMs +
            nativePrepareMs +
            coldRun.ms,
        ),
        // Same as coldPipelineMs plus module import time.
        coldTotalMs: roundMs(
          importMs +
            dictionaryMs +
            prepareMs +
            nativeConfigReadMs +
            nativeConfigParseMs +
            nativePackageReadMs +
            nativeStringifyMs +
            nativePrepareMs +
            coldRun.ms,
        ),
        warmRunMsByIteration: warmRuns.map((run) => run.ms),
        warmRunMs,
        warmAvgMs,
      },
      nativeTimingScenario,
      nativeDiagnostics,
      fixtureTimings,
      // Per-fixture rows without the (large) snapshot payload.
      fixtures: coldRun.fixtures.map(
        ({ fixture, ms, entityCount, redactedTextLength }) => ({
          fixture,
          ms,
          entityCount,
          redactedTextLength,
        }),
      ),
      snapshots,
    })}\n`,
  );
}
/**
 * Imports the unified-search builder module from `sourceRoot` and uses its
 * `buildNativeStaticSearchBundle` export to build the native static search
 * bundle for `config` (with an empty second argument) and `context`.
 *
 * @throws {TypeError} when the imported module has no callable
 *   `buildNativeStaticSearchBundle` export.
 */
async function prepareNativeStaticSearch({
  sourceRoot,
  variant,
  config,
  context,
}) {
  const builderModulePath = "packages/anonymize/src/build-unified-search.ts";
  const builderModule = await importSource(
    sourceRoot,
    builderModulePath,
    variant,
  );
  // Looked up reflectively so a source tree without the export produces the
  // explicit error below.
  const buildBundle = Reflect.get(
    Object(builderModule),
    "buildNativeStaticSearchBundle",
  );
  if (typeof buildBundle === "function") {
    return buildBundle(config, [], context);
  }
  throw new TypeError("Native static search bundle builder is unavailable");
}
/**
 * Collects native diagnostics for a prepared runner.
 *
 * Every fixture is re-run through the diagnostics entry point of
 * `runner.prepared`; its stage summaries are reduced to run stages (those
 * accepted by `isRunStage`) and aggregated with `summarizeFixtureDiagnostics`.
 * The prepare-time diagnostics recorded on the runner are summarised next to
 * them.
 *
 * @returns an object with `prepare`, `cachedPrepare` (null when the runner
 *   carries no cached-prepare diagnostics) and `run` sections.
 */
function collectNativeDiagnostics({ runner, fixtures }) {
  // Summarise one prepare diagnostics record; the stage list is computed once
  // and reused for the ranking.
  const summarizePrepare = (diagnostics) => {
    const stages = diagnosticStageSummaries(diagnostics.events);
    return { stages, topStages: topDiagnosticStages(stages) };
  };

  const fixtureDiagnostics = [];
  for (const fixturePath of fixtures) {
    // Normalise line endings before handing the text to the runner.
    const fullText = readFileSync(fixturePath, "utf8").replaceAll("\r\n", "\n");
    const report = JSON.parse(
      runner.prepared.redactStaticEntitiesDiagnosticsJson(fullText, undefined),
    );
    fixtureDiagnostics.push({
      fixture: relative(FIXTURES_DIR, fixturePath),
      stages: diagnosticStageSummaries(report.diagnostics.events).filter(
        isRunStage,
      ),
    });
  }

  // Treat a missing (undefined) cached-prepare record like an explicit null
  // rather than failing on `.events`.
  const cachedPrepareDiagnostics = runner.cachedPrepareDiagnostics ?? null;

  return {
    prepare: summarizePrepare(runner.prepareDiagnostics),
    cachedPrepare:
      cachedPrepareDiagnostics === null
        ? null
        : summarizePrepare(cachedPrepareDiagnostics),
    run: summarizeFixtureDiagnostics(fixtureDiagnostics),
  };
}
/**
 * Converts raw diagnostic events into stage summaries.
 *
 * Only events whose `kind` is "stage-summary" are kept. `elapsed_us` is
 * converted to milliseconds when it is a number and reported as null
 * otherwise; a missing `count` becomes 0 and a missing `input_bytes` null.
 */
function diagnosticStageSummaries(events) {
  const summaries = [];
  for (const event of events.filter(
    (candidate) => candidate.kind === "stage-summary",
  )) {
    let elapsedMs = null;
    if (typeof event.elapsed_us === "number") {
      elapsedMs = roundMs(event.elapsed_us / 1_000);
    }
    summaries.push({
      stage: event.stage,
      count: event.count ?? 0,
      elapsedMs,
      inputBytes: event.input_bytes ?? null,
    });
  }
  return summaries;
}
/**
 * Builds the comparable snapshot for a native-runtime result.
 *
 * Entities are ordered by start, then end, then label, then text. Each entity
 * keeps its original offsets and additionally gets UTF-8 byte offsets derived
 * from `fullText`. `counts` tallies entities per label.
 */
function toNativeSnapshot(fullText, result) {
  const compareEntities = (a, b) => {
    const spanOrder = a.start - b.start || a.end - b.end;
    if (spanOrder) {
      return spanOrder;
    }
    return a.label.localeCompare(b.label) || a.text.localeCompare(b.text);
  };
  const ordered = result.resolvedEntities.toSorted(compareEntities);

  const labelTally = {};
  ordered.forEach((entity) => {
    const seen = labelTally[entity.label] ?? 0;
    labelTally[entity.label] = seen + 1;
  });

  const describedEntities = ordered.map((entity) => ({
    start: entity.start,
    end: entity.end,
    byteStart: utf16OffsetToUtf8ByteOffset(fullText, entity.start),
    byteEnd: utf16OffsetToUtf8ByteOffset(fullText, entity.end),
    label: entity.label,
    text: entity.text,
    source: entity.source,
  }));

  return {
    entityCount: ordered.length,
    counts: labelTally,
    entities: describedEntities,
    redactedText: result.redaction.redactedText,
  };
}
/**
 * Returns a copy of `entity` that is safe to log: every property except
 * `text` is kept. Null and undefined both map to null. The input object is
 * not modified.
 */
function entityForLog(entity) {
  if (entity === undefined || entity === null) {
    return null;
  }
  const loggable = { ...entity };
  delete loggable.text;
  return loggable;
}
0) + 1; + const intentional = mismatch.intentionalImprovementReason !== null; + if (intentional) { + intentionalImprovementCount += 1; + } else { + unexplainedMismatchCount += 1; + } + if (mismatch.sourceAgnosticEqual !== true) { + materialMismatchCount += 1; + if (!intentional) { + unexplainedMaterialMismatchCount += 1; + } + } + if (mismatch.redactedTextEqual === false) { + redactionMismatchCount += 1; + if (!intentional) { + unexplainedRedactionMismatchCount += 1; + } + } + if ( + mismatch.redactedTextEqual && + mismatch.sourceOnlyCount > 0 && + Object.keys(mismatch.candidateExtraByLabel ?? {}).length === 0 && + Object.keys(mismatch.candidateMissingByLabel ?? {}).length === 0 + ) { + sourceOnlyMismatchCount += 1; + } + } + + return { + strictMismatchCount: mismatches.length, + materialMismatchCount, + redactionMismatchCount, + sourceOnlyMismatchCount, + intentionalImprovementCount, + acceptedMismatchCount: intentionalImprovementCount, + unexplainedMismatchCount, + unexplainedMaterialMismatchCount, + unexplainedRedactionMismatchCount, + byCategory, + }; +} + +function snapshotsAreEquivalent(expected, actual) { + if (expected === undefined || actual === undefined) { + return false; + } + if (JSON.stringify(expected) === JSON.stringify(actual)) { + return true; + } + return ( + expected.redactedText === actual.redactedText && + JSON.stringify(byteNormalizedSnapshot(expected)) === + JSON.stringify(byteNormalizedSnapshot(actual)) + ); +} + +function describeMismatch(fixture, expected, actual) { + if (expected === undefined || actual === undefined) { + return { + fixture, + kind: expected === undefined ? 
"missing-baseline" : "missing-candidate", + }; + } + + const firstEntityDiff = firstDifferentIndex( + expected.entities, + actual.entities, + ); + const expectedByteSnapshot = byteNormalizedSnapshot(expected); + const actualByteSnapshot = byteNormalizedSnapshot(actual); + const byteNormalizedEqual = + JSON.stringify(expectedByteSnapshot) === JSON.stringify(actualByteSnapshot); + const sourceAgnosticEqual = + JSON.stringify(sourceAgnosticSnapshot(expectedByteSnapshot)) === + JSON.stringify(sourceAgnosticSnapshot(actualByteSnapshot)); + const firstByteEntityDiff = firstDifferentIndex( + expectedByteSnapshot.entities, + actualByteSnapshot.entities, + ); + const category = mismatchCategory(expected, actual); + + const mismatch = { + fixture, + kind: "snapshot-mismatch", + category: category.kind, + entityCount: { + baseline: expected.entityCount, + candidate: actual.entityCount, + }, + counts: { + baseline: expected.counts, + candidate: actual.counts, + }, + redactedTextEqual: expected.redactedText === actual.redactedText, + byteNormalizedEqual, + sourceAgnosticEqual, + sourceOnlyCount: category.sourceOnlyCount, + candidateExtraByLabel: category.candidateExtraByLabel, + candidateMissingByLabel: category.candidateMissingByLabel, + candidateExtra: category.candidateExtra, + candidateMissing: category.candidateMissing, + firstCandidateExtra: category.firstCandidateExtra, + firstCandidateMissing: category.firstCandidateMissing, + firstEntityDiff: + firstEntityDiff === -1 + ? null + : { + index: firstEntityDiff, + baseline: expected.entities.at(firstEntityDiff) ?? null, + candidate: actual.entities.at(firstEntityDiff) ?? null, + }, + firstByteEntityDiff: + firstByteEntityDiff === -1 + ? null + : { + index: firstByteEntityDiff, + baseline: + expectedByteSnapshot.entities.at(firstByteEntityDiff) ?? null, + candidate: + actualByteSnapshot.entities.at(firstByteEntityDiff) ?? 
null, + }, + }; + return { + ...mismatch, + intentionalImprovementReason: intentionalImprovementReason(mismatch), + acceptedReason: intentionalImprovementReason(mismatch), + }; +} + +function intentionalImprovementReason(mismatch) { + if (mismatch.sourceAgnosticEqual === true) { + return "source-only"; + } + const improvement = INTENTIONAL_NATIVE_STATIC_IMPROVEMENTS.get( + mismatch.fixture, + ); + if (improvement === undefined) { + return null; + } + if ( + entitySummariesEqual(mismatch.candidateExtra, improvement.candidateExtra) && + entitySummariesEqual( + mismatch.candidateMissing, + improvement.candidateMissing, + ) + ) { + return improvement.reason; + } + return null; +} + +function entitySummariesEqual(left, right) { + return JSON.stringify(left ?? []) === JSON.stringify(right ?? []); +} + +function mismatchCategory(expected, actual) { + const expectedByteEntities = byteNormalizedSnapshot(expected).entities; + const actualByteEntities = byteNormalizedSnapshot(actual).entities; + const redactedTextEqual = expected.redactedText === actual.redactedText; + const entitySetEqual = + JSON.stringify(expectedByteEntities) === JSON.stringify(actualByteEntities); + if (redactedTextEqual && entitySetEqual) { + return emptyMismatchCategory("metadata-only"); + } + + const expectedSpanLabel = countEntitiesByKey( + expectedByteEntities, + entitySpanLabelKey, + ); + const actualSpanLabel = countEntitiesByKey( + actualByteEntities, + entitySpanLabelKey, + ); + if (mapsEqual(expectedSpanLabel, actualSpanLabel)) { + return { + ...sourceDriftCategory(expectedByteEntities, actualByteEntities), + kind: redactedTextEqual ? 
/**
 * Creates a mismatch-category record of the given `kind` with no source
 * drift and no extra or missing candidate entities. Every call returns fresh
 * containers, so callers may extend the result freely.
 */
function emptyMismatchCategory(kind) {
  const extraByLabel = {};
  const missingByLabel = {};
  const extraEntities = [];
  const missingEntities = [];
  return {
    kind,
    sourceOnlyCount: 0,
    candidateExtraByLabel: extraByLabel,
    candidateMissingByLabel: missingByLabel,
    candidateExtra: extraEntities,
    candidateMissing: missingEntities,
    firstCandidateExtra: null,
    firstCandidateMissing: null,
  };
}
/**
 * Tallies entities by the key that `keyFn` derives from each of them.
 *
 * @returns a Map from key to number of occurrences, in first-seen key order.
 */
function countEntitiesByKey(entities, keyFn) {
  const tally = new Map();
  for (const item of entities) {
    const bucket = keyFn(item);
    const seenSoFar = tally.get(bucket) ?? 0;
    tally.set(bucket, seenSoFar + 1);
  }
  return tally;
}
/**
 * Reports whether two Maps hold the same keys with strictly equal values.
 * Insertion order is irrelevant.
 */
function mapsEqual(left, right) {
  return (
    left.size === right.size &&
    [...left].every(([key, value]) => right.get(key) === value)
  );
}
/**
 * Describes how much of the prepared search is covered by the native rewrite.
 *
 * Slice lengths come from the native static config when `search` carries one
 * and from the TypeScript search slices otherwise. The result reports the
 * supported/total search slot counts and their ratio, the slots and pipeline
 * stages that are not supported, and whether the whole pipeline is eligible
 * to run natively (nothing unsupported).
 *
 * @param config pipeline config, forwarded to the unsupported-stage check
 * @param search prepared search; `slices`, `regexMeta`, `denyListData` and
 *   the optional `nativeStaticConfig` are read from it
 * @param runtime runtime identifier; "native-static" marks a measured native
 *   run
 */
function describeNativeRewrite(config, search, runtime) {
  const tsSliceLengths = Object.fromEntries(
    Object.entries(search.slices).map(([name, slice]) => [
      name,
      sliceLength(slice),
    ]),
  );
  const nativeStaticConfig = search.nativeStaticConfig;
  const sliceLengths = nativeStaticConfig
    ? nativeSliceLengths(nativeStaticConfig, tsSliceLengths)
    : tsSliceLengths;
  const regexValidationSlots = countUnsupportedRegexValidationSlots(
    search.regexMeta,
    nativeStaticConfig,
  );
  const denyListSourceCounts = countDenyListSources(search.denyListData);
  // null means "no native config"; 0 is a valid native pattern count.
  const nativeSupported = nativeStaticConfig
    ? nativeStaticConfig.regex_patterns.length +
      nativeStaticConfig.custom_regex_patterns.length +
      nativeStaticConfig.literal_patterns.length
    : null;
  const unsupportedSearchSlots = [
    unsupportedSlot("regex", regexValidationSlots, "regex validators"),
    unsupportedSlot(
      "triggers",
      nativeStaticConfig ? 0 : sliceLengths.triggers,
      "trigger extraction",
    ),
    unsupportedSlot(
      "streetTypes",
      nativeStaticConfig ? 0 : tsSliceLengths.streetTypes,
      "address seeds",
    ),
  ].filter((slot) => slot.count > 0);
  const supportedSearchSlots =
    nativeSupported ??
    Math.max(0, sliceLengths.regex - regexValidationSlots) +
      sliceLengths.customRegex +
      denyListSourceCounts.customOnly +
      denyListSourceCounts.curated +
      sliceLengths.gazetteer +
      sliceLengths.countries;
  // Choose the branch the same way `supportedSearchSlots` does (null check,
  // not truthiness), so a native config with zero patterns still gets a
  // native-style total.
  const totalSearchSlots =
    nativeSupported !== null
      ? supportedSearchSlots +
        unsupportedSearchSlots.reduce((sum, slot) => sum + slot.count, 0)
      : Object.values(sliceLengths).reduce((sum, length) => sum + length, 0);
  const unsupportedPipelineStages = describeUnsupportedPipelineStages(
    config,
    search,
    runtime,
    nativeStaticConfig,
  );

  return {
    measuredInPipeline: runtime === "native-static",
    pipelineRuntime: runtime,
    fullPipelineNativeEligible:
      unsupportedSearchSlots.length === 0 &&
      unsupportedPipelineStages.length === 0,
    searchSlotCoverage: {
      supported: supportedSearchSlots,
      total: totalSearchSlots,
      // An empty search counts as fully covered.
      ratio:
        totalSearchSlots === 0
          ? 1
          : roundMs(supportedSearchSlots / totalSearchSlots),
    },
    sliceLengths,
    unsupportedSearchSlots,
    unsupportedPipelineStages,
  };
}
/**
 * Lists the pipeline stages that are reported as unsupported for a run.
 *
 * A run counts as native only when `runtime` is "native-static" AND a native
 * static config is present. Most stages are reported only for non-native
 * runs; "ner" is reported whenever it is enabled, and the name corpus is
 * reported for native runs too when the deny list is disabled.
 *
 * @param config Pipeline config; the `enable*` flags are read from it.
 * @param search Search bundle; `search.slices.streetTypes` is read only for
 *   non-native runs.
 * @param runtime Pipeline runtime name.
 * @param nativeStaticConfig Native static config, or a falsy value.
 * @returns Stage names in a fixed order.
 */
function describeUnsupportedPipelineStages(
  config,
  search,
  runtime,
  nativeStaticConfig,
) {
  const notNative = !(runtime === "native-static" && nativeStaticConfig);
  const nameCorpusStage = config.enableDenyList
    ? "name-corpus-supplemental"
    : "name-corpus";

  // Each rule is evaluated lazily so the street-type slice is only inspected
  // on the non-native path, exactly as a plain `if` chain would.
  const rules = [
    ["legal-forms-v2", () => config.enableLegalForms && notNative],
    ["triggers", () => config.enableTriggerPhrases && notNative],
    [
      nameCorpusStage,
      () => config.enableNameCorpus && (notNative || !config.enableDenyList),
    ],
    ["ner", () => config.enableNer],
    ["zone-classification", () => config.enableZoneClassification && notNative],
    ["coreference", () => config.enableCoreference && notNative],
    [
      "address-seeds",
      () => notNative && sliceLength(search.slices.streetTypes) > 0,
    ],
    ["signatures", () => notNative],
  ];

  return rules.filter(([, applies]) => applies()).map(([stage]) => stage);
}
/**
 * Returns the number of entries covered by a `{ start, end }` slice.
 *
 * Callers in this file pass slices looked up on objects that may not define
 * them (for example `slices.regex` where `slices` defaulted to `{}`, or
 * `config.slices?.legal_forms`), so a missing slice must count as empty
 * instead of throwing.
 *
 * @param slice Object with optional `start` / `end` bounds, or null/undefined.
 * @returns `end - start`, clamped to 0 when the bounds are inverted; 0 when
 *   the slice or either bound is missing.
 */
function sliceLength(slice) {
  // Optional chaining keeps a null/undefined slice from raising a TypeError.
  const start = Number(slice?.start ?? 0);
  const end = Number(slice?.end ?? 0);
  return Math.max(0, end - start);
}
/**
 * Collects every `.txt` fixture that sits one level below `fixturesDir`, i.e.
 * `<fixturesDir>/<language>/<name>.txt`.
 *
 * Regular files placed directly in `fixturesDir` (a README, `.DS_Store`, …)
 * are skipped; previously they were treated as language directories and the
 * nested `readdirSync` call failed with ENOTDIR. Anything that is not a
 * regular file (directories, symlinks) is still listed as before.
 *
 * @param fixturesDir Root directory holding one sub-directory per language.
 * @returns Fixture paths sorted with `localeCompare`.
 */
function discoverFixtures(fixturesDir) {
  const paths = [];
  for (const entry of readdirSync(fixturesDir, { withFileTypes: true })) {
    if (entry.isFile()) {
      // Not a language directory; listing it would throw.
      continue;
    }
    const languageDir = join(fixturesDir, entry.name);
    for (const file of readdirSync(languageDir)) {
      if (file.endsWith(".txt")) {
        paths.push(join(languageDir, file));
      }
    }
  }
  return paths.sort((left, right) => left.localeCompare(right));
}
/**
 * Extracts the tree of `ref` into `<tempRoot>/baseline` by piping
 * `git archive` output through `tar`, then links the workspace
 * `node_modules` into the extracted tree.
 *
 * @param ref Git ref passed to `git archive`.
 * @param tempRoot Directory under which the `baseline` folder is created.
 * @returns Path of the directory holding the extracted tree.
 * @throws Error when `git archive` or `tar` cannot be started or exits with a
 *   non-zero status.
 */
function materializeGitRef(ref, tempRoot) {
  // When the process could not be spawned (or hit maxBuffer), spawnSync sets
  // `error` and `stderr` may be null, so `.toString()` alone would throw a
  // TypeError that hides the real cause.
  const failureDetail = (result) =>
    result.error?.message ?? result.stderr?.toString() ?? "";

  const outputDir = join(tempRoot, "baseline");
  mkdirSync(outputDir, { recursive: true });

  const archive = spawnSync("git", ["archive", "--format=tar", ref], {
    cwd: ROOT_DIR,
    maxBuffer: 512 * 1024 * 1024,
  });
  if (archive.status !== 0) {
    throw new Error(`git archive failed for ${ref}: ${failureDetail(archive)}`);
  }

  // The tarball is held in memory and fed to tar on stdin.
  const extract = spawnSync("tar", ["-x", "-C", outputDir], {
    input: archive.stdout,
    maxBuffer: 512 * 1024 * 1024,
  });
  if (extract.status !== 0) {
    throw new Error(`tar extraction failed: ${failureDetail(extract)}`);
  }

  linkWorkspaceNodeModules(outputDir);
  return outputDir;
}
/**
 * Private helper: runs `prepare` once for the primary measurement and then
 * CACHED_PREPARE_ITERATIONS more times, timing each prepare together with
 * its lazy-regex warm-up.
 *
 * @param prepare Zero-argument factory returning a prepared native search.
 * @returns The first prepared search, its diagnostics, the diagnostics of
 *   the last repeat prepare (null when there were none), per-iteration
 *   timings, and the primary `prepareMs` / `warmPrepareMs`.
 */
function measureNativePrepareCycles(prepare) {
  const prepareStart = Bun.nanoseconds();
  const prepared = prepare();
  const warmPrepareMs = warmNativePreparedSearch(prepared);
  // prepareMs spans construction plus warm-up.
  const prepareMs = elapsedMs(prepareStart);
  const prepareDiagnostics = JSON.parse(prepared.prepareDiagnosticsJson());

  const cachedPrepareMsByIteration = [];
  const cachedWarmPrepareMsByIteration = [];
  let cachedPrepareDiagnostics = null;
  for (let index = 0; index < CACHED_PREPARE_ITERATIONS; index += 1) {
    const cachedPrepareStart = Bun.nanoseconds();
    const cachedPrepared = prepare();
    cachedWarmPrepareMsByIteration.push(
      warmNativePreparedSearch(cachedPrepared),
    );
    cachedPrepareMsByIteration.push(elapsedMs(cachedPrepareStart));
    cachedPrepareDiagnostics = JSON.parse(
      cachedPrepared.prepareDiagnosticsJson(),
    );
  }

  return {
    prepared,
    prepareDiagnostics,
    cachedPrepareDiagnostics,
    cachedPrepareMsByIteration,
    cachedWarmPrepareMsByIteration,
    prepareMs,
    warmPrepareMs,
  };
}

/**
 * Creates a native static runner from config JSON supplied as bytes.
 *
 * Depending on the NATIVE_PREPARED_PACKAGE / NATIVE_PREPARED_ARTIFACTS
 * switches, the prepared search is built from a prepared package, from
 * config plus prepared artifacts, or straight from the config (using the
 * `fromConfigJsonBytes` factory when the adapter exposes one).
 *
 * @param configBytes Buffer holding the native static config JSON.
 * @returns The prepared search together with size and timing measurements.
 */
function createNativeStaticRunnerFromJsonBytes(configBytes) {
  const native = loadNativeAdapter();

  const packageStart = Bun.nanoseconds();
  const packageBytes = NATIVE_PREPARED_PACKAGE
    ? prepareNativePackageBytes(native, configBytes)
    : null;
  // Read the clock before the optional disk write so that file I/O is not
  // reported as package preparation time.
  const packagePrepareMs = packageBytes === null ? 0 : elapsedMs(packageStart);
  writeNativePackageIfRequested(packageBytes);

  const artifactStart = Bun.nanoseconds();
  const artifactBytes = NATIVE_PREPARED_ARTIFACTS
    ? native.prepareStaticSearchArtifactsBytes(configBytes)
    : null;
  const artifactPrepareMs =
    artifactBytes === null ? 0 : elapsedMs(artifactStart);

  const prepare = () => {
    if (packageBytes !== null) {
      return native.NativePreparedSearch.fromPreparedPackageBytes(packageBytes);
    }
    if (artifactBytes !== null) {
      return native.NativePreparedSearch.fromConfigJsonAndArtifactBytes(
        configBytes,
        artifactBytes,
      );
    }
    // Use the bytes-based factory only if the adapter provides it.
    const factory = Reflect.get(
      native.NativePreparedSearch,
      "fromConfigJsonBytes",
    );
    if (typeof factory === "function") {
      return factory.call(native.NativePreparedSearch, configBytes);
    }
    return new native.NativePreparedSearch(configBytes.toString("utf8"));
  };

  const { prepareMs, warmPrepareMs, ...cycleSummary } =
    measureNativePrepareCycles(prepare);
  return {
    ...cycleSummary,
    configBytes: configBytes.byteLength,
    artifactBytes: artifactBytes?.byteLength ?? 0,
    artifactPrepareMs,
    packageBytes: packageBytes?.byteLength ?? 0,
    packagePrepareMs,
    stringifyMs: 0,
    prepareMs,
    warmPrepareMs,
  };
}

/**
 * Creates a native static runner from an already prepared package.
 *
 * No config, artifact or package preparation happens here, so those sizes
 * and timings are reported as 0.
 *
 * @param packageBytes Bytes of a prepared native package.
 * @returns The prepared search together with size and timing measurements.
 */
function createNativeStaticRunnerFromPackageBytes(packageBytes) {
  const native = loadNativeAdapter();
  const prepare = () =>
    native.NativePreparedSearch.fromPreparedPackageBytes(packageBytes);

  const { prepareMs, warmPrepareMs, ...cycleSummary } =
    measureNativePrepareCycles(prepare);
  return {
    ...cycleSummary,
    configBytes: 0,
    artifactBytes: 0,
    artifactPrepareMs: 0,
    packageBytes: packageBytes.byteLength,
    packagePrepareMs: 0,
    stringifyMs: 0,
    prepareMs,
    warmPrepareMs,
  };
}
/**
 * Measures, per fixture language, how long it takes to build the native
 * static search bundle for the language-scoped config and to prepare it in
 * the native adapter.
 *
 * @param options.sourceRoot Root of the source tree to import from.
 * @param options.variant Variant tag forwarded to `importSource`.
 * @param options.baseConfig Config that each language scope is layered onto.
 * @param options.fixtures Fixture paths; their languages decide the scopes.
 * @returns One entry per distinct language (sorted with `localeCompare`),
 *   carrying the language, its scope, the bundle build time and the prepare
 *   measurement fields.
 * @throws TypeError when a required export is missing from the imported
 *   source modules.
 */
async function profileScopedNativePrepare({
  sourceRoot,
  variant,
  baseConfig,
  fixtures,
}) {
  // Looks up a named export and insists that it is callable.
  const requireFunctionExport = (moduleNamespace, exportName, failureMessage) => {
    const exported = Reflect.get(Object(moduleNamespace), exportName);
    if (typeof exported !== "function") {
      throw new TypeError(failureMessage);
    }
    return exported;
  };

  const searchModule = await importSource(
    sourceRoot,
    "packages/anonymize/src/build-unified-search.ts",
    variant,
  );
  const buildBundle = requireFunctionExport(
    searchModule,
    "buildNativeStaticSearchBundle",
    "Native static search bundle builder is unavailable",
  );

  const contextModule = await importSource(
    sourceRoot,
    "packages/anonymize/src/context.ts",
    `${variant}:scoped-prepare`,
  );
  const makePipelineContext = requireFunctionExport(
    contextModule,
    "createPipelineContext",
    "Pipeline context factory is unavailable",
  );

  const { NativePreparedSearch } = loadNativeAdapter();

  const distinctLanguages = Array.from(
    new Set(fixtures.map((fixture) => fixtureLanguage(fixture))),
  );
  distinctLanguages.sort((left, right) => left.localeCompare(right));

  const scopes = [];
  // Languages are processed one at a time so each build is timed in isolation.
  for (const language of distinctLanguages) {
    const scopedConfig = applyFixtureLanguageScope(baseConfig, language);
    const buildStart = Bun.nanoseconds();
    const bundle = await buildBundle(scopedConfig, [], makePipelineContext());
    const buildMs = elapsedMs(buildStart);
    const prepareMeasurement = measureNativeConfigPrepare(
      NativePreparedSearch,
      language,
      bundle.nativeStaticConfig,
    );
    scopes.push({
      language,
      scope: fixtureLanguageScope(language),
      buildMs,
      ...prepareMeasurement,
    });
  }

  return scopes;
}
/**
 * Derives a config that keeps only the built-in regex patterns whose
 * metadata label is in `labels`, with literal patterns and the deny-list,
 * gazetteer, country, trigger and legal-form data cleared.
 *
 * Patterns and metadata are paired by index. All slices are rewritten: the
 * regex slice covers the kept patterns, the legal-form and trigger slices
 * are empty and positioned right after it, and every other slice is `0..0`.
 * Other properties of `config` are copied through unchanged.
 *
 * @param config Native static config; it is not mutated.
 * @param labels Set-like object (`has`) of regex labels to keep.
 * @returns A new config object.
 */
function nativeRegexOnlyConfig(config, labels) {
  const kept = Array.from(config.regex_meta ?? [], (meta, index) => ({
    meta,
    index,
  })).filter((entry) => labels.has(entry.meta.label));

  const keptPatterns = kept.map((entry) => config.regex_patterns[entry.index]);
  const keptMeta = kept.map((entry) => entry.meta);
  const keptCount = keptPatterns.length;
  // Each slice gets its own object so later edits cannot alias one another.
  const emptySliceAt = (offset) => ({ start: offset, end: offset });

  return {
    ...config,
    regex_patterns: keptPatterns,
    regex_meta: keptMeta,
    literal_patterns: [],
    literal_patterns_from_deny_list_data: false,
    deny_list_data: undefined,
    gazetteer_data: undefined,
    country_data: undefined,
    trigger_data: undefined,
    legal_form_data: undefined,
    slices: {
      regex: { start: 0, end: keptCount },
      custom_regex: emptySliceAt(0),
      legal_forms: emptySliceAt(keptCount),
      triggers: emptySliceAt(keptCount),
      deny_list: emptySliceAt(0),
      street_types: emptySliceAt(0),
      gazetteer: emptySliceAt(0),
      countries: emptySliceAt(0),
    },
  };
}
/**
 * Maps a fixture language code to the deny-list countries and name-corpus
 * languages used to scope the config for that language.
 *
 * @param language Language code such as "cs", "de" or "en".
 * @returns A fresh scope object for a known language, or an empty object for
 *   any other value.
 */
function fixtureLanguageScope(language) {
  // Factories (not shared objects) so every call hands out fresh arrays.
  const scopeFactories = new Map([
    [
      "cs",
      () => ({
        denyListCountries: ["CZ", "SK"],
        nameCorpusLanguages: ["cs", "sk"],
      }),
    ],
    [
      "de",
      () => ({
        denyListCountries: ["DE", "AT", "CH"],
        nameCorpusLanguages: ["de"],
      }),
    ],
    [
      "en",
      () => ({
        denyListCountries: ["US", "GB", "CA", "AU", "IE"],
        nameCorpusLanguages: ["en"],
      }),
    ],
  ]);

  const createScope = scopeFactories.get(language);
  if (createScope === undefined) {
    return {};
  }
  return createScope();
}
/**
 * Generates `count` synthetic custom regex definitions for the user-data
 * overlay scenarios.
 *
 * Entry `i` matches `USR-<i zero-padded to 4 digits>-` followed by two
 * upper-case letters and four digits. Even entries are labelled
 * "registration number", odd ones "tax identification number"; every entry
 * has score 0.92.
 *
 * @param count Number of definitions to generate.
 * @returns Array of `{ pattern, label, score }` objects.
 */
function generatedCustomRegexes(count) {
  const buildDefinition = (ordinal) => {
    const paddedOrdinal = ordinal.toString().padStart(4, "0");
    const isEven = ordinal % 2 === 0;
    return {
      pattern: `USR-${paddedOrdinal}-[A-Z]{2}\\d{4}`,
      label: isEven ? "registration number" : "tax identification number",
      score: 0.92,
    };
  };

  const ordinals = Array.from({ length: count }, (_, position) => position);
  return ordinals.map(buildDefinition);
}
/**
 * Reads an environment variable as a non-negative integer.
 *
 * Despite the function name, zero is accepted; only negative and
 * non-integer values are rejected. A variable that is unset, empty or
 * whitespace-only yields `fallback`. Previously an empty value was parsed by
 * `Number("")` and silently became 0.
 *
 * @param name Environment variable name.
 * @param fallback Value returned when the variable is unset or blank.
 * @returns The parsed integer, or `fallback`.
 * @throws Error when the value is present but not a non-negative integer.
 */
function positiveIntegerEnv(name, fallback) {
  const raw = process.env[name]?.trim();
  if (raw === undefined || raw === "") {
    return fallback;
  }
  const value = Number(raw);
  if (!Number.isInteger(value) || value < 0) {
    throw new Error(`${name} must be a non-negative integer`);
  }
  return value;
}
/**
 * Rounds a millisecond value to three decimal places.
 *
 * @param ms Duration (or any number) to round.
 * @returns `ms` rounded to the nearest thousandth using `Math.round`.
 */
function roundMs(ms) {
  const stepsPerUnit = 1_000;
  const wholeSteps = Math.round(ms * stepsPerUnit);
  return wholeSteps / stepsPerUnit;
}
+ Math.min(ITERATIONS, 10), +); + +const configJson = JSON.stringify({ + regex_patterns: [{ kind: "regex", pattern: "\\b[A-Z]{2}\\d{4}\\b" }], + custom_regex_patterns: [{ kind: "regex", pattern: "\\bMAT-\\d{3}\\b" }], + literal_patterns: [ + { + kind: "literal-with-options", + pattern: "Secret Code", + case_insensitive: true, + whole_words: true, + }, + { + kind: "literal-with-options", + pattern: "Prague", + case_insensitive: true, + whole_words: true, + }, + { + kind: "literal-with-options", + pattern: "Acme", + case_insensitive: true, + whole_words: false, + }, + { kind: "fuzzy", pattern: "Fuzztown", distance: 1 }, + { + kind: "literal-with-options", + pattern: "Turkey", + case_insensitive: true, + whole_words: true, + }, + ], + regex_options: { regex_whole_words: false }, + custom_regex_options: { regex_whole_words: false }, + literal_options: { + literal_case_insensitive: true, + literal_whole_words: false, + fuzzy_case_insensitive: true, + fuzzy_whole_words: true, + fuzzy_normalize_diacritics: true, + }, + slices: { + regex: { start: 0, end: 1 }, + custom_regex: { start: 0, end: 1 }, + deny_list: { start: 0, end: 2 }, + gazetteer: { start: 2, end: 4 }, + countries: { start: 4, end: 5 }, + }, + regex_meta: [{ label: "registration number", score: 0.9 }], + custom_regex_meta: [ + { label: "matter id", score: 1, source_detail: "custom-regex" }, + ], + deny_list_data: { + labels: [["matter"], ["address"]], + custom_labels: [["matter"], []], + originals: ["Secret Code", "Prague"], + sources: [["custom-deny-list"], ["city"]], + filters: { + stopwords: [], + allow_list: [], + person_stopwords: [], + address_stopwords: [], + street_types: [], + first_names: [], + generic_roles: [], + sentence_starters: [], + trailing_address_word_exclusions: [], + defined_term_cues: [], + }, + }, + gazetteer_data: { + labels: ["organization", "address"], + is_fuzzy: [false, true], + }, + country_data: { labels: ["country"] }, +}); + +const pythonScript = ` +import importlib.util 
+import json +import os +import pathlib +import time + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +spec = importlib.util.spec_from_file_location( + "_native", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(os.environ["STELLA_ANONYMIZE_PERF_PAYLOAD"]) +prepare_start = time.perf_counter_ns() +prepared = module.PreparedSearch(payload["config_json"]) +prepare_ms = (time.perf_counter_ns() - prepare_start) / 1_000_000 +start = time.perf_counter_ns() +for _ in range(payload["iterations"]): + for item in payload["cases"]: + prepared.redact_static_entities( + item["text"], + item.get("operators_json"), + ) +elapsed_ms = (time.perf_counter_ns() - start) / 1_000_000 +case_results = [ + json.loads( + prepared.redact_static_entities_json( + item["text"], + item.get("operators_json"), + ) + ) + for item in payload["cases"] +] +print(json.dumps({"prepareMs": prepare_ms, "runMs": elapsed_ms, "caseResults": case_results})) +`; + +const pythonSdkScript = ` +import json +import os +import pathlib +import sys +import time + +module_root = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]).parent.parent +sys.path.insert(0, str(module_root)) + +import stella_anonymize as anonymize + +payload = json.loads(os.environ["STELLA_ANONYMIZE_PERF_PAYLOAD"]) +package_start = time.perf_counter_ns() +package_bytes = anonymize.prepare_search_package(payload["config_json"]) +package_prepare_ms = (time.perf_counter_ns() - package_start) / 1_000_000 +load_start = time.perf_counter_ns() +prepared = anonymize.load_prepared_package(package_bytes) +load_ms = (time.perf_counter_ns() - load_start) / 1_000_000 +def entity_to_dict(entity): + return { + "start": entity.start, + "end": entity.end, + "label": entity.label, + "text": entity.text, + "score": entity.score, + "source": entity.source, + "source_detail": entity.source_detail, + } + +def redaction_entry_to_dict(entry): + return {"placeholder": 
entry.placeholder, "original": entry.original} + +def operator_entry_to_dict(entry): + return {"placeholder": entry.placeholder, "operator": entry.operator} + +def result_to_dict(result): + return { + "resolved_entities": [ + entity_to_dict(entity) + for entity in result.resolved_entities + ], + "redaction": { + "redacted_text": result.redaction.redacted_text, + "redaction_map": [ + redaction_entry_to_dict(entry) + for entry in result.redaction.redaction_map + ], + "operator_map": [ + operator_entry_to_dict(entry) + for entry in result.redaction.operator_map + ], + "entity_count": result.redaction.entity_count, + }, + } +start = time.perf_counter_ns() +for _ in range(payload["iterations"]): + for item in payload["cases"]: + prepared.redact_text( + item["text"], + item.get("operators"), + ) +run_ms = (time.perf_counter_ns() - start) / 1_000_000 +one_shot_start = time.perf_counter_ns() +for _ in range(payload["top_level_iterations"]): + for item in payload["cases"]: + anonymize.redact_text( + payload["config_json"], + item["text"], + item.get("operators"), + ) +one_shot_ms = (time.perf_counter_ns() - one_shot_start) / 1_000_000 +package_case_results = [ + result_to_dict( + prepared.redact_text( + item["text"], + item.get("operators"), + ) + ) + for item in payload["cases"] +] +one_shot_case_results = [ + result_to_dict( + anonymize.redact_text( + payload["config_json"], + item["text"], + item.get("operators"), + ) + ) + for item in payload["cases"] +] +print( + json.dumps( + { + "packagePrepareMs": package_prepare_ms, + "loadMs": load_ms, + "runMs": run_ms, + "oneShotMs": one_shot_ms, + "packageCaseResults": package_case_results, + "oneShotCaseResults": one_shot_case_results, + } + ) +) +`; + +runCommand("cargo", [ + "build", + "-p", + "stella-anonymize-napi", + "-p", + "stella-anonymize-py", + "--release", + "--locked", +]); + +const tempDir = mkdtempSync(join(tmpdir(), "stella-anonymize-perf-")); +const napiPath = join(tempDir, "stella_anonymize_napi.node"); +const 
pythonPackageDir = join(tempDir, "stella_anonymize"); +mkdirSync(pythonPackageDir); +const pythonModulePath = join(pythonPackageDir, "_native.so"); +copyFileSync(nativeLibraryPath("stella_anonymize_napi"), napiPath); +copyFileSync(nativeLibraryPath("stella_anonymize_core_py"), pythonModulePath); +copyFileSync( + join( + ROOT_DIR, + "crates", + "anonymize-py", + "python", + "stella_anonymize", + "__init__.py", + ), + join(pythonPackageDir, "__init__.py"), +); +copyFileSync( + join( + ROOT_DIR, + "crates", + "anonymize-py", + "python", + "stella_anonymize", + "__init__.pyi", + ), + join(pythonPackageDir, "__init__.pyi"), +); +copyFileSync( + join( + ROOT_DIR, + "crates", + "anonymize-py", + "python", + "stella_anonymize", + "_native.pyi", + ), + join(pythonPackageDir, "_native.pyi"), +); +copyFileSync( + join( + ROOT_DIR, + "crates", + "anonymize-py", + "python", + "stella_anonymize", + "py.typed", + ), + join(pythonPackageDir, "py.typed"), +); + +const native = createRequire(import.meta.url)(napiPath); +const cases = buildCases(); +const payload = { + config_json: configJson, + iterations: ITERATIONS, + top_level_iterations: TOP_LEVEL_ITERATIONS, + cases: cases.map(({ text, operatorsConfig, operatorsJson }) => ({ + text, + operators: operatorsConfig?.operators ?? 
null, + operators_json: operatorsJson, + })), +}; +const rustCoreResults = callRustCoreResults(payload, tempDir); + +const rustOutput = runCommand( + "cargo", + [ + "run", + "-p", + "stella-anonymize-adapter-contract", + "--example", + "native_adapter_perf", + "--release", + "--locked", + ], + { + STELLA_ANONYMIZE_PERF_PAYLOAD: JSON.stringify(payload), + }, +); +const rustSummary = JSON.parse(rustOutput); +printSummary("rust-core", rustSummary, cases.length, ITERATIONS); + +const tsPrepareStart = Bun.nanoseconds(); +const prepared = new native.NativePreparedSearch(configJson); +const tsPrepareMs = elapsedMs(tsPrepareStart); +const tsStart = Bun.nanoseconds(); +for (let iteration = 0; iteration < ITERATIONS; iteration += 1) { + for (const item of cases) { + prepared.redactStaticEntities( + item.text, + item.operatorsJson === undefined + ? undefined + : JSON.parse(item.operatorsJson), + ); + } +} +const tsRunMs = elapsedMs(tsStart); +assertAdapterResults( + "ts-napi", + cases.map((item) => + canonicalResult( + prepared.redactStaticEntities( + item.text, + item.operatorsJson === undefined + ? 
undefined + : JSON.parse(item.operatorsJson), + ), + ), + ), + rustCoreResults, +); +printSummary( + "ts-napi", + { prepareMs: tsPrepareMs, runMs: tsRunMs }, + cases.length, + ITERATIONS, +); + +const packageStart = Bun.nanoseconds(); +const packageBytes = prepare_search_package({ + binding: native, + config: configJson, +}); +const packagePrepareMs = elapsedMs(packageStart); +const loadStart = Bun.nanoseconds(); +const preparedSdk = load_prepared_package({ + binding: native, + packageBytes, +}); +const loadMs = elapsedMs(loadStart); +const sdkRunStart = Bun.nanoseconds(); +for (let iteration = 0; iteration < ITERATIONS; iteration += 1) { + for (const item of cases) { + preparedSdk.redact_text(item.text, item.operatorsConfig); + } +} +const sdkRunMs = elapsedMs(sdkRunStart); +assertAdapterResults( + "ts-sdk-prepared-package", + cases.map((item) => + canonicalResult(preparedSdk.redact_text(item.text, item.operatorsConfig)), + ), + rustCoreResults, +); +printSummary( + "ts-sdk-prepared-package", + { + prepareMs: packagePrepareMs + loadMs, + packagePrepareMs, + loadMs, + runMs: sdkRunMs, + }, + cases.length, + ITERATIONS, +); + +const topLevelRunStart = Bun.nanoseconds(); +for (let iteration = 0; iteration < TOP_LEVEL_ITERATIONS; iteration += 1) { + for (const item of cases) { + redactTextWithSdk({ + binding: native, + config: configJson, + fullText: item.text, + ...(item.operatorsConfig !== undefined + ? { operators: item.operatorsConfig } + : {}), + }); + } +} +const topLevelRunMs = elapsedMs(topLevelRunStart); +assertAdapterResults( + "ts-sdk-one-shot", + cases.map((item) => + canonicalResult( + redactTextWithSdk({ + binding: native, + config: configJson, + fullText: item.text, + ...(item.operatorsConfig !== undefined + ? 
{ operators: item.operatorsConfig } + : {}), + }), + ), + ), + rustCoreResults, +); +printSummary( + "ts-sdk-one-shot", + { prepareMs: 0, runMs: topLevelRunMs }, + cases.length, + TOP_LEVEL_ITERATIONS, +); + +const pyOutput = runCommand("python3", ["-c", pythonScript], { + STELLA_ANONYMIZE_PERF_PAYLOAD: JSON.stringify(payload), + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, +}); +const pySummary = JSON.parse(pyOutput); +assertAdapterResults("python-pyo3", pySummary.caseResults, rustCoreResults); +printSummary("python-pyo3", pySummary, cases.length, ITERATIONS); + +const pySdkOutput = runCommand("python3", ["-c", pythonSdkScript], { + STELLA_ANONYMIZE_PERF_PAYLOAD: JSON.stringify(payload), + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, +}); +const pySdkSummary = JSON.parse(pySdkOutput); +assertAdapterResults( + "python-sdk-prepared-package", + pySdkSummary.packageCaseResults, + rustCoreResults, +); +printSummary( + "python-sdk-prepared-package", + { + prepareMs: pySdkSummary.packagePrepareMs + pySdkSummary.loadMs, + packagePrepareMs: pySdkSummary.packagePrepareMs, + loadMs: pySdkSummary.loadMs, + runMs: pySdkSummary.runMs, + }, + cases.length, + ITERATIONS, +); +assertAdapterResults( + "python-sdk-one-shot", + pySdkSummary.oneShotCaseResults, + rustCoreResults, +); +printSummary( + "python-sdk-one-shot", + { prepareMs: 0, runMs: pySdkSummary.oneShotMs }, + cases.length, + TOP_LEVEL_ITERATIONS, +); + +function buildCases() { + const places = ["Fuzztovn", "Fuzztawn", "Fuzztowm"]; + const operators = [ + undefined, + JSON.stringify({ operators: { country: "redact" } }), + JSON.stringify({ operators: { address: "redact", country: "redact" } }), + JSON.stringify({ operators: { matter: "redact" } }), + ]; + const fixtureCases = []; + + for (let index = 0; index < 24; index += 1) { + const registration = `AB${String(index).padStart(4, "0")}`; + const matter = `MAT-${String(index % 1_000).padStart(3, "0")}`; + const place = places[index % places.length]; + const 
operatorsJson = operators[index % operators.length]; + fixtureCases.push({ + text: + `Reference ${registration} for Acme s.r.o. near ` + + `${place}, Turkey, Prague, matter ${matter}, code Secret Code.`, + operatorsConfig: + operatorsJson === undefined ? undefined : JSON.parse(operatorsJson), + operatorsJson, + }); + } + + return fixtureCases; +} + +function callRustCoreResults(perfPayload, tempDirectory) { + const parityPayloadPath = join(tempDirectory, "native-adapter-parity.json"); + writeFileSync( + parityPayloadPath, + JSON.stringify({ + config_json: perfPayload.config_json, + cases: perfPayload.cases.map(({ text, operators_json }) => ({ + text, + operators_json, + })), + }), + ); + const output = runCommand( + "cargo", + [ + "run", + "-p", + "stella-anonymize-adapter-contract", + "--example", + "native_adapter_parity", + "--release", + "--locked", + "--quiet", + ], + { + STELLA_ANONYMIZE_PARITY_PAYLOAD: parityPayloadPath, + }, + ); + return JSON.parse(output).map(canonicalResult); +} + +function assertAdapterResults(adapter, actualResults, expectedResults) { + if (actualResults.length !== expectedResults.length) { + throw new Error( + `${adapter} returned ${actualResults.length} parity results, expected ${expectedResults.length}`, + ); + } + + for (let index = 0; index < expectedResults.length; index += 1) { + const actual = canonicalResult(actualResults[index]); + const expected = expectedResults[index]; + const actualSignature = resultSignature(actual); + const expectedSignature = resultSignature(expected); + if (actualSignature === expectedSignature) { + continue; + } + throw new Error( + [ + `${adapter} parity mismatch at case ${index}`, + `expected=${hashSignature(expectedSignature)}`, + `actual=${hashSignature(actualSignature)}`, + ].join(" "), + ); + } +} + +function canonicalResult(result) { + const redaction = result.redaction; + return { + resolved_entities: readArray( + result, + "resolvedEntities", + "resolved_entities", + ).map(canonicalEntity), 
+ redaction: { + redacted_text: readValue(redaction, "redactedText", "redacted_text"), + redaction_map: canonicalRedactionEntries( + readValue(redaction, "redactionMap", "redaction_map"), + ), + operator_map: canonicalOperatorEntries( + readValue(redaction, "operatorMap", "operator_map"), + ), + entity_count: readValue(redaction, "entityCount", "entity_count"), + }, + }; +} + +function canonicalEntity(entity) { + return { + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: entity.source, + source_detail: + readOptionalValue(entity, "sourceDetail", "source_detail") ?? null, + }; +} + +function canonicalRedactionEntries(entries) { + if (entries instanceof Map) { + return [...entries.entries()].map(([placeholder, original]) => ({ + placeholder, + original, + })); + } + return entries.map(({ placeholder, original }) => ({ + placeholder, + original, + })); +} + +function canonicalOperatorEntries(entries) { + if (entries instanceof Map) { + return [...entries.entries()].map(([placeholder, operator]) => ({ + placeholder, + operator, + })); + } + return entries.map(({ placeholder, operator }) => ({ + placeholder, + operator, + })); +} + +function readArray(value, camelKey, snakeKey) { + const result = readValue(value, camelKey, snakeKey); + if (!Array.isArray(result)) { + throw new TypeError(`Expected array field ${camelKey}/${snakeKey}`); + } + return result; +} + +function readValue(value, camelKey, snakeKey) { + if (Object.hasOwn(value, camelKey)) { + return value[camelKey]; + } + if (Object.hasOwn(value, snakeKey)) { + return value[snakeKey]; + } + throw new TypeError(`Missing field ${camelKey}/${snakeKey}`); +} + +function readOptionalValue(value, camelKey, snakeKey) { + if (Object.hasOwn(value, camelKey)) { + return value[camelKey]; + } + if (Object.hasOwn(value, snakeKey)) { + return value[snakeKey]; + } + return undefined; +} + +function resultSignature(result) { + return JSON.stringify(result); +} 
+ +function hashSignature(signature) { + return createHash("sha256").update(signature).digest("hex").slice(0, 16); +} + +function nativeLibraryPath(name) { + if (process.platform === "darwin") { + return join(ROOT_DIR, "target", "release", `lib${name}.dylib`); + } + if (process.platform === "linux") { + return join(ROOT_DIR, "target", "release", `lib${name}.so`); + } + return join(ROOT_DIR, "target", "release", `${name}.dll`); +} + +function printSummary(adapter, summary, fixtureCount, iterations) { + const calls = fixtureCount * iterations; + const runMs = Number(summary.runMs); + const prepareMs = Number(summary.prepareMs); + console.log( + JSON.stringify({ + event: "native-adapter-perf", + adapter, + fixtureCount, + iterations, + calls, + prepareMs: roundMs(prepareMs), + runMs: roundMs(runMs), + totalMs: roundMs(prepareMs + runMs), + avgCallMs: roundMs(runMs / calls), + ...extraSummaryFields(summary), + }), + ); +} + +function elapsedMs(start) { + return (Bun.nanoseconds() - start) / 1_000_000; +} + +function roundMs(ms) { + return Math.round(ms * 1_000) / 1_000; +} + +function extraSummaryFields(summary) { + const fields = {}; + for (const key of ["packagePrepareMs", "loadMs"]) { + if (summary[key] !== undefined) { + fields[key] = roundMs(Number(summary[key])); + } + } + return fields; +} + +function runCommand(command, args, env = {}) { + const result = spawnSync(command, args, { + cwd: ROOT_DIR, + encoding: "utf8", + env: { ...process.env, ...env }, + }); + + if (result.status === 0) { + return result.stdout; + } + + throw new Error( + [ + `${command} ${args.join(" ")} failed with status ${result.status}`, + result.stdout, + result.stderr, + ] + .filter(Boolean) + .join("\n"), + ); +} diff --git a/packages/anonymize/scripts/native-package-ux-perf.mjs b/packages/anonymize/scripts/native-package-ux-perf.mjs new file mode 100644 index 00000000..cb42c3a1 --- /dev/null +++ b/packages/anonymize/scripts/native-package-ux-perf.mjs @@ -0,0 +1,192 @@ +import { 
spawnSync } from "node:child_process"; +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { dirname, join, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; + +const SCRIPT_PATH = fileURLToPath(import.meta.url); +const PACKAGE_DIR = dirname(dirname(SCRIPT_PATH)); +const ROOT_DIR = resolve(join(PACKAGE_DIR, "..", "..")); +const MIGRATION_SCRIPT = join( + PACKAGE_DIR, + "scripts", + "migration-fixture-perf.mjs", +); + +const SCENARIOS = [ + { name: "compressed", compressed: true }, + { name: "raw", compressed: false }, + ...languageScenarios(), + ...userDataScenarios(), +]; + +const tempRoot = mkdtempSync(join(tmpdir(), "stella-anonymize-package-ux-")); + +try { + const scenarios = SCENARIOS.map((scenario) => runScenario(scenario)); + console.log( + JSON.stringify({ + event: "native-package-ux-perf", + scenarios, + }), + ); +} finally { + rmSync(tempRoot, { force: true, recursive: true }); +} + +function runScenario({ name, compressed, language, userDataScenario }) { + const packagePath = join(tempRoot, `${name}.stlanonpkg`); + const languageEnv = + language === undefined + ? {} + : { + ANONYMIZE_MIGRATION_CONTENT_LANGUAGE: language, + ANONYMIZE_MIGRATION_FIXTURE_LANGUAGES: language, + }; + const userDataEnv = + userDataScenario === undefined || userDataScenario === "none" + ? {} + : { + ANONYMIZE_MIGRATION_USER_DATA_SCENARIO: userDataScenario, + }; + const build = runMigration({ + ...languageEnv, + ...userDataEnv, + ANONYMIZE_MIGRATION_NATIVE_COMPRESSED_PACKAGE: compressed ? "1" : "0", + ANONYMIZE_MIGRATION_NATIVE_PREPARED_PACKAGE: "1", + ANONYMIZE_MIGRATION_WRITE_NATIVE_PACKAGE_PATH: packagePath, + }); + const load = runMigration({ + ...languageEnv, + ...userDataEnv, + ANONYMIZE_MIGRATION_NATIVE_PACKAGE_PATH: packagePath, + }); + const nativeDiagnostics = load.nativeDiagnostics ?? null; + + return { + name, + compressed, + language: language ?? null, + userDataScenario: userDataScenario ?? 
"none", + fixtureCount: load.fixtureCount, + packageBytes: build.timings.nativePackageBytes, + offlinePackageBuildMs: build.timings.nativePackagePrepareMs, + firstPackageReadMs: load.timings.nativePackageReadMs, + firstPrepareMs: load.timings.nativePrepareMs, + firstWarmPrepareMs: load.timings.nativeWarmPrepareMs, + setupBeforeClickMs: + load.timings.nativePackageReadMs + load.timings.nativePrepareMs, + cachedPrepareMs: load.timings.nativeCachedPrepareAvgMs, + cachedWarmPrepareMs: load.timings.nativeCachedWarmPrepareAvgMs, + firstRunMs: load.timings.coldRunMs, + preloadedClickMs: load.timings.coldRunMs, + firstTouchMs: load.timings.nativeFirstTouchMs, + warmClickMs: load.timings.nativeWarmClickMs, + prepareTopStages: nativeDiagnostics?.prepare?.topStages ?? [], + cachedPrepareTopStages: nativeDiagnostics?.cachedPrepare?.topStages ?? [], + runTopStages: nativeDiagnostics?.run?.topStages ?? [], + runTopFixtures: nativeDiagnostics?.run?.topFixtures ?? [], + fixtureTimings: load.fixtureTimings, + topColdFixtures: load.fixtureTimings.byFixture + .toSorted((left, right) => right.coldMs - left.coldMs) + .slice(0, 5), + }; +} + +function languageScenarios() { + const value = process.env.ANONYMIZE_NATIVE_PACKAGE_UX_LANGUAGES ?? "en,cs,de"; + if (value.trim().length === 0) { + return []; + } + return value + .split(",") + .map((entry) => normalizeLanguage(entry)) + .filter((entry, index, entries) => entries.indexOf(entry) === index) + .map((language) => ({ + name: `compressed-${language}`, + compressed: true, + language, + })); +} + +function normalizeLanguage(value) { + const language = value.trim().toLowerCase(); + if (!/^[a-z0-9]+(?:-[a-z0-9]+)*$/u.test(language)) { + throw new Error( + `Invalid ANONYMIZE_NATIVE_PACKAGE_UX_LANGUAGES entry: ${value}`, + ); + } + return language; +} + +function userDataScenarios() { + const value = + process.env.ANONYMIZE_NATIVE_PACKAGE_UX_USER_DATA_SCENARIOS ?? 
+ "sample,heavy"; + if (value.trim().length === 0) { + return []; + } + return value + .split(",") + .map((entry) => normalizeUserDataScenario(entry)) + .filter((entry) => entry !== "none") + .filter((entry, index, entries) => entries.indexOf(entry) === index) + .map((userDataScenario) => ({ + name: `compressed-user-${userDataScenario}`, + compressed: true, + userDataScenario, + })); +} + +function normalizeUserDataScenario(value) { + const scenario = value.trim().toLowerCase(); + if (scenario === "none" || scenario === "sample" || scenario === "heavy") { + return scenario; + } + throw new Error( + `ANONYMIZE_NATIVE_PACKAGE_UX_USER_DATA_SCENARIOS must contain none, sample, or heavy; got ${value}`, + ); +} + +function runMigration(extraEnv) { + const child = spawnSync(process.execPath, [MIGRATION_SCRIPT], { + cwd: ROOT_DIR, + env: { + ...process.env, + ...extraEnv, + ANONYMIZE_MIGRATION_CANDIDATE_RUNTIME: "native-static", + ANONYMIZE_MIGRATION_COMPARE_BASELINE: "0", + ANONYMIZE_MIGRATION_REQUIRE_NATIVE_PIPELINE: "1", + }, + encoding: "utf8", + maxBuffer: 64 * 1024 * 1024, + }); + + if (child.status !== 0) { + throw new Error( + [ + "Native package UX benchmark failed", + child.stdout.trim(), + child.stderr.trim(), + ] + .filter(Boolean) + .join("\n"), + ); + } + + return parseVariant(child.stdout); +} + +function parseVariant(stdout) { + for (const line of stdout.trim().split("\n").toReversed()) { + try { + const parsed = JSON.parse(line); + if (parsed.event === "fixture-migration-variant") { + return parsed; + } + } catch { + continue; + } + } + throw new Error("Migration benchmark did not emit a variant summary"); +} diff --git a/packages/anonymize/src/__test__/constants-parity.test.ts b/packages/anonymize/src/__test__/constants-parity.test.ts index 54a8306c..7ecbcda1 100644 --- a/packages/anonymize/src/__test__/constants-parity.test.ts +++ b/packages/anonymize/src/__test__/constants-parity.test.ts @@ -23,4 +23,9 @@ describe("@stll/anonymize/constants subpath 
parity", () => { ...fromRoot.OPERATOR_TYPES, ]); }); + + test("native shared SDK helpers are exported from the root entrypoint", () => { + expect(typeof fromRoot.redact_text).toBe("function"); + expect(typeof fromRoot.redact_text_json).toBe("function"); + }); }); diff --git a/packages/anonymize/src/__test__/countries.test.ts b/packages/anonymize/src/__test__/countries.test.ts index 187cddd1..fc9b01cb 100644 --- a/packages/anonymize/src/__test__/countries.test.ts +++ b/packages/anonymize/src/__test__/countries.test.ts @@ -193,6 +193,21 @@ describe("country detector", () => { expect(found).not.toContain("America"); }); + test("ambiguous short territory surfaces do not block city-state addresses", async () => { + const cityState = "Any arbitration shall take place in Norfolk, Virginia."; + const cityStateEntities = await detect(cityState); + expect(countries(cityStateEntities)).not.toContain("Norfolk"); + expect( + cityStateEntities.some( + (entity) => + entity.label === "address" && entity.text === "Norfolk, Virginia", + ), + ).toBe(true); + + const fullCountry = "The court is located on Norfolk Island."; + expect(countries(await detect(fullCountry))).toContain("Norfolk Island"); + }); + test("country token contained in a person span loses to the person", async () => { // "Chad", "Georgia", "Jordan" are first names AND // countries. 
When a longer person span contains the diff --git a/packages/anonymize/src/__test__/dictionary-bundle.test.ts b/packages/anonymize/src/__test__/dictionary-bundle.test.ts new file mode 100644 index 00000000..55d18d5b --- /dev/null +++ b/packages/anonymize/src/__test__/dictionary-bundle.test.ts @@ -0,0 +1,24 @@ +import { describe, expect, setDefaultTimeout, test } from "bun:test"; + +setDefaultTimeout(60_000); + +import { loadDictionaryBundle } from "../../../data/dictionaries/index"; + +describe("dictionary bundle scoping", () => { + test("empty country scope keeps default city dictionaries", async () => { + const bundle = await loadDictionaryBundle({ countries: [] }); + + expect(bundle.cities.length).toBeGreaterThan(0); + expect(Object.keys(bundle.citiesByCountry)).toContain("CZ"); + }); + + test("unsupported non-empty name language scope keeps names empty", async () => { + const bundle = await loadDictionaryBundle({ nameLanguages: ["pt-br"] }); + + expect(bundle.firstNames).toEqual({}); + expect(bundle.surnames).toEqual({}); + expect(Object.values(bundle.denyListMeta)).not.toContainEqual( + expect.objectContaining({ category: "Names" }), + ); + }); +}); diff --git a/packages/anonymize/src/__test__/load-dictionaries.ts b/packages/anonymize/src/__test__/load-dictionaries.ts index 228c72b4..8bf74c04 100644 --- a/packages/anonymize/src/__test__/load-dictionaries.ts +++ b/packages/anonymize/src/__test__/load-dictionaries.ts @@ -6,131 +6,42 @@ * Only used in tests — production consumers load and pass * dictionaries themselves. 
*/ -import type { Dictionaries, DictionaryMeta } from "../types"; +import type { Dictionaries } from "../types"; -let cached: Dictionaries | null = null; - -export const loadTestDictionaries = async (): Promise => { - if (cached) return cached; - - const dataModule = await import("@stll/anonymize-data"); +type TestDictionaryScope = { + denyListCountries?: readonly string[]; + nameCorpusLanguages?: readonly string[]; +}; - // Load all dictionaries - const allIds = [...dataModule.ALL_DICTIONARY_IDS]; - const denyList: Record = {}; - const denyListMeta: Record = {}; +const cache = new Map(); - const results = await Promise.all( - allIds.map(async (id) => { - const entries = await dataModule.loadDictionary(id); - return { id, entries }; - }), - ); +const scopeKey = (scope: TestDictionaryScope): string => + JSON.stringify({ + denyListCountries: [...(scope.denyListCountries ?? [])].toSorted(), + nameCorpusLanguages: [...(scope.nameCorpusLanguages ?? [])].toSorted(), + }); - for (const { id, entries } of results) { - const meta = dataModule.DICTIONARY_META[id]; - if (!meta) continue; - denyList[id] = entries; - // SAFETY: anonymize-data categories match DenyListCategory at runtime - denyListMeta[id] = meta as DictionaryMeta; +export const loadTestDictionaries = async ( + scope: TestDictionaryScope = {}, +): Promise => { + const key = scopeKey(scope); + const cached = cache.get(key); + if (cached) return cached; + const dataModule = await import("../../../data/dictionaries/index"); + const bundleOptions: Parameters[0] = + {}; + if (scope.denyListCountries !== undefined) { + bundleOptions.countries = scope.denyListCountries; + bundleOptions.cityCountries = scope.denyListCountries; } - - // Load per-language first names and surnames - const NAME_LANGUAGES = [ - "cs", - "sk", - "de", - "pl", - "hu", - "ro", - "fr", - "es", - "it", - "en", - "sv", - ] as const; - - const firstNames: Record = {}; - const surnames: Record = {}; - - await Promise.all( - NAME_LANGUAGES.map(async 
(lang) => { - try { - const mod = await import( - `@stll/anonymize-data/dictionaries/names/first/${lang}.json` - ); - firstNames[lang] = mod.default; - } catch { - // Not available for this language - } - try { - const mod = await import( - `@stll/anonymize-data/dictionaries/names/surnames/${lang}.json` - ); - surnames[lang] = mod.default; - } catch { - // Not available for this language - } - }), - ); - - // Load city dictionaries for common countries - const CITY_COUNTRIES = [ - "AT", - "AU", - "BE", - "BG", - "BR", - "CA", - "CH", - "CZ", - "DE", - "DK", - "ES", - "FI", - "FR", - "GB", - "GR", - "HR", - "HU", - "IE", - "IT", - "LU", - "NL", - "NO", - "NZ", - "PL", - "PT", - "RO", - "SE", - "SI", - "SK", - "US", - ]; - const cityResults = await Promise.all( - CITY_COUNTRIES.map(async (country) => ({ - country, - entries: await dataModule.loadCityDictionary(country), - })), - ); - const citiesByCountry: Record = {}; - const mergedCities: string[] = []; - for (const { country, entries } of cityResults) { - citiesByCountry[country] = entries; - for (const entry of entries) { - mergedCities.push(entry); - } + if (scope.nameCorpusLanguages !== undefined) { + bundleOptions.nameLanguages = scope.nameCorpusLanguages; } const result: Dictionaries = { - firstNames, - surnames, - denyList, - denyListMeta, - cities: mergedCities, - citiesByCountry, + ...(await dataModule.loadDictionaryBundle(bundleOptions)), }; - cached = result; + cache.set(key, result); return result; }; diff --git a/packages/anonymize/src/__test__/native-adapter-parity.test.ts b/packages/anonymize/src/__test__/native-adapter-parity.test.ts new file mode 100644 index 00000000..73ffe0a9 --- /dev/null +++ b/packages/anonymize/src/__test__/native-adapter-parity.test.ts @@ -0,0 +1,2934 @@ +import { spawnSync } from "node:child_process"; +import { + copyFileSync, + mkdirSync, + mkdtempSync, + readFileSync, + readdirSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { 
join } from "node:path"; +import { createRequire } from "node:module"; +import { describe, expect, setDefaultTimeout, test } from "bun:test"; +import fc from "fast-check"; +import { + assertNativeBindingVersion, + createNativeAnonymizerFromPackage, + diagnostics_json, + getNativeBindingVersion, + load_prepared_package, + native_package_version, + normalize_for_search, + prepareNativeSearchPackage, + prepare_search_package, + PreparedAnonymizer, + PreparedSearch, + redact_text, + redact_text_json, + type NativeAnonymizeBinding, + type NativeOperatorConfig, + type NativePreparedSearchBinding, + type NativeStaticRedactionResult, +} from "../native"; +import type { + Entity, + OperatorConfig, + PipelineConfig, + RedactionResult, +} from "../types"; +import { + SHARED_NATIVE_SDK_CLASS_NAMES, + SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS, + SHARED_NATIVE_SDK_PREPARED_METHODS, + SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS, +} from "../native-sdk-contract"; +import { + createPipelineContext, + createNativePipelineFromPackage, + DEFAULT_ENTITY_LABELS, + getNativePipelineCompatibility, + preparePipelineSearch, + prepareNativePipelinePackage, + redactText, + runPipeline, +} from "../index"; +import { applyPipelineLanguageScope } from "../language-scope"; +import { contractTestConfig } from "./contract-config"; +import { loadTestDictionaries } from "./load-dictionaries"; + +setDefaultTimeout(240_000); + +const SLOW_NATIVE_FIXTURE_PARITY_TIMEOUT_MS = 600_000; + +type NativeAdapter = Omit< + NativeAnonymizeBinding, + | "prepareStaticSearchPackageBytes" + | "prepareStaticSearchCompressedPackageBytes" +> & { + normalizeForSearch: (text: string) => string; + nativePackageVersion: () => string; + NativePreparedSearch: NativeAnonymizeBinding["NativePreparedSearch"] & { + new (configJson: string): NativePreparedSearchBinding; + fromConfigJsonAndArtifactBytes: ( + configJson: Buffer, + artifactBytes: Buffer, + ) => NativePreparedSearchBinding; + }; + prepareStaticSearchArtifactsBytes: 
(configJson: Buffer) => Buffer; + prepareStaticSearchPackageBytes: (configJson: Uint8Array) => Buffer; + prepareStaticSearchCompressedPackageBytes: (configJson: Uint8Array) => Buffer; + redactStaticEntitiesJson: ( + configJson: string, + fullText: string, + operatorsJson?: string, + ) => string; + redactStaticEntitiesDiagnosticsJson: ( + configJson: string, + fullText: string, + operatorsJson?: string, + ) => string; +}; + +type RedactionEntry = { + placeholder: string; + original: string; +}; + +type StaticRedactionResult = { + resolved_entities: Array<{ + start: number; + end: number; + label: string; + text: string; + score: number; + source: string; + source_detail?: string | null; + }>; + redaction: { + redacted_text: string; + redaction_map: RedactionEntry[]; + operator_map: Array<{ + placeholder: string; + operator: "replace" | "redact"; + }>; + entity_count: number; + }; +}; + +type OffsetFreeStaticRedactionResult = { + resolved_entities: Array< + Omit + >; + redaction: StaticRedactionResult["redaction"]; +}; + +type StaticRedactionDiagnosticResult = { + result: StaticRedactionResult; + diagnostics: { + events: Array<{ + stage: string; + kind: string; + count?: number; + engine?: string; + pattern?: number; + source?: string; + source_detail?: string; + label?: string; + start?: number; + end?: number; + text?: string; + score?: number; + span_valid?: boolean; + elapsed_us?: number; + input_bytes?: number; + reason?: string; + }>; + }; +}; + +type GeneratedNativeCase = { + text: string; + operators: Record | null; + sensitiveValues: string[]; +}; + +type SharedSdkParityCase = { + text: string; + operators: NativeOperatorConfig | null; +}; + +type ContractFixtureCase = { + name: string; + text: string; +}; + +type ExpectedNativeFixtureEntity = { + label: string; + source?: string; + text: string; +}; + +type NativeFixtureImprovementCase = { + language: (typeof CONTRACT_FIXTURE_LANGUAGES)[number]; + fixture: string; + includes?: ExpectedNativeFixtureEntity[]; 
+ excludes?: ExpectedNativeFixtureEntity[]; +}; + +type PythonNativeOffsetSlice = { + start: number; + end: number; + slice: string; + text: string; +}; + +const ROOT_DIR = join(import.meta.dir, "..", "..", "..", ".."); +const TARGET_DIR = join(ROOT_DIR, "target", "debug"); +const PYTHON_SOURCE_DIR = join(ROOT_DIR, "crates", "anonymize-py", "python"); +const CONTRACT_FIXTURES_DIR = join( + ROOT_DIR, + "packages", + "anonymize", + "src", + "__test__", + "fixtures", + "contracts", +); +const CONTRACT_FIXTURE_LANGUAGES = ["cs", "de", "en"] as const; +const NATIVE_FIXTURE_IMPROVEMENTS: NativeFixtureImprovementCase[] = [ + { + language: "cs", + fixture: "asset-transfer-court-declensions.txt", + includes: [ + { + label: "address", + source: "regex", + text: "Václavské náměstí 9, 110 00 Praha 1", + }, + ], + }, + { + language: "cs", + fixture: "nakit-legal-services-framework.txt", + excludes: [{ label: "person", text: "Objednatele" }], + }, + { + language: "cs", + fixture: "vinci-donation-agreement.txt", + includes: [ + { + label: "organization", + source: "deny-list", + text: "České vysoké učení technické v Praze", + }, + { + label: "organization", + source: "coreference", + text: "VINCI Construction CS", + }, + ], + }, + { + language: "en", + fixture: "software-license-agreement.txt", + includes: [ + { + label: "address", + source: "regex", + text: "200 West Street, New York, NY 10282", + }, + { + label: "address", + source: "regex", + text: "1209 Orange Street, Wilmington, DE 19801", + }, + { + label: "phone number", + source: "regex", + text: "(212) 555-0142", + }, + ], + }, +]; +const CONFIG_JSON = JSON.stringify({ + regex_patterns: [{ kind: "regex", pattern: "\\b[A-Z]{2}\\d{4}\\b" }], + custom_regex_patterns: [{ kind: "regex", pattern: "\\bMAT-\\d{3}\\b" }], + literal_patterns: [ + { + kind: "literal-with-options", + pattern: "Secret Code", + case_insensitive: true, + whole_words: true, + }, + { + kind: "literal-with-options", + pattern: "Prague", + 
case_insensitive: true, + whole_words: true, + }, + { + kind: "literal-with-options", + pattern: "Acme", + case_insensitive: true, + whole_words: false, + }, + { kind: "fuzzy", pattern: "Fuzztown", distance: 1 }, + { + kind: "literal-with-options", + pattern: "Turkey", + case_insensitive: true, + whole_words: true, + }, + ], + regex_options: { regex_whole_words: false }, + custom_regex_options: { regex_whole_words: false }, + literal_options: { + literal_case_insensitive: true, + literal_whole_words: false, + fuzzy_case_insensitive: true, + fuzzy_whole_words: true, + fuzzy_normalize_diacritics: true, + }, + slices: { + regex: { start: 0, end: 1 }, + custom_regex: { start: 0, end: 1 }, + deny_list: { start: 0, end: 2 }, + gazetteer: { start: 2, end: 4 }, + countries: { start: 4, end: 5 }, + }, + regex_meta: [{ label: "registration number", score: 0.9 }], + custom_regex_meta: [ + { label: "matter id", score: 1, source_detail: "custom-regex" }, + ], + deny_list_data: { + labels: [["matter"], ["address"]], + custom_labels: [["matter"], []], + originals: ["Secret Code", "Prague"], + sources: [["custom-deny-list"], ["city"]], + filters: { + stopwords: [], + allow_list: [], + person_stopwords: [], + address_stopwords: [], + street_types: [], + ambiguous_street_type_terms: [], + first_names: [], + generic_roles: [], + sentence_starters: [], + trailing_address_word_exclusions: [], + document_heading_words: [], + defined_term_cues: [], + }, + }, + gazetteer_data: { + labels: ["organization", "address"], + is_fuzzy: [false, true], + }, + country_data: { labels: ["country"] }, +}); + +const PYTHON_ADAPTER_SCRIPT = ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +spec = importlib.util.spec_from_file_location( + "_native", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) 
+payload = json.loads(payload_path.read_text()) +results = [ + json.loads( + module.redact_static_entities_json( + payload["config_json"], + item["text"], + item.get("operators_json"), + ) + ) + for item in payload["cases"] +] +print(json.dumps(results)) +`; + +const PYTHON_NATIVE_OFFSET_SCRIPT = ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +spec = importlib.util.spec_from_file_location( + "_native", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +prepared = module.PreparedSearch(payload["config_json"]) +result = prepared.redact_static_entities( + payload["text"], + payload.get("operators_json"), +) +entity = next( + ( + item + for item in result.resolved_entities + if item.label == payload["label"] + ), + None, +) +if entity is None: + raise AssertionError(f"entity not found: {payload['label']}") +sliced = payload["text"][entity.start:entity.end] +if sliced != payload["expected"]: + raise AssertionError( + f"slice mismatch: {sliced!r} at {entity.start}:{entity.end}" + ) +print( + json.dumps( + { + "start": entity.start, + "end": entity.end, + "slice": sliced, + "text": entity.text, + } + ) +) +`; + +const PYTHON_VERSION_SCRIPT = ` +import importlib.util +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +spec = importlib.util.spec_from_file_location( + "_native", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +print(module.native_package_version()) +`; + +const PYTHON_PREPARED_ARTIFACT_SCRIPT = ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) 
+artifact_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_ARTIFACTS"]) +spec = importlib.util.spec_from_file_location( + "_native", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +artifact_bytes = artifact_path.read_bytes() +if module.prepare_static_search_artifacts_bytes(payload["config_json"]) != artifact_bytes: + raise AssertionError("prepared artifact bytes differ") +prepared = module.PreparedSearch.from_config_json_and_artifact_bytes( + payload["config_json"], + artifact_bytes, +) +print( + prepared.redact_static_entities_json( + payload["text"], + payload.get("operators_json"), + ) +) +`; + +const PYTHON_PREPARED_PACKAGE_SCRIPT = ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +package_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PACKAGE"]) +spec = importlib.util.spec_from_file_location( + "_native", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +package_bytes = package_path.read_bytes() +prepare_fn_name = os.environ.get( + "STELLA_ANONYMIZE_PACKAGE_PREPARE_FN", + "prepare_static_search_package_bytes", +) +if getattr(module, prepare_fn_name)(payload["config_json"]) != package_bytes: + raise AssertionError("prepared package bytes differ") +prepared = module.PreparedSearch.from_prepared_package_bytes(package_bytes) +print( + prepared.redact_static_entities_json( + payload["text"], + payload.get("operators_json"), + ) +) +`; + +const PYTHON_PREPARED_PACKAGE_CASES_SCRIPT = ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +package_path = 
pathlib.Path(os.environ["STELLA_ANONYMIZE_PACKAGE"]) +spec = importlib.util.spec_from_file_location( + "_native", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +package_bytes = package_path.read_bytes() +prepare_fn_name = os.environ.get( + "STELLA_ANONYMIZE_PACKAGE_PREPARE_FN", + "prepare_static_search_package_bytes", +) +if getattr(module, prepare_fn_name)(payload["config_json"]) != package_bytes: + raise AssertionError("prepared package bytes differ") +prepared = module.PreparedSearch.from_prepared_package_bytes(package_bytes) +results = [ + json.loads( + prepared.redact_static_entities_json( + item["text"], + item.get("operators_json"), + ) + ) + for item in payload["cases"] +] +print(json.dumps(results)) +`; + +const PYTHON_PACKAGE_FACADE_SCRIPT = ` +import json +import os +import pathlib +import sys + +module_root = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]).parent.parent +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +package_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PACKAGE"]) +sys.path.insert(0, str(module_root)) + +import stella_anonymize as anonymize + +payload = json.loads(payload_path.read_text()) +package_bytes = package_path.read_bytes() +if anonymize.prepare_search_package( + payload["config_json"], + compressed=payload["compressed"], +) != package_bytes: + raise AssertionError("facade package bytes differ") +prepared = anonymize.load_prepared_package(package_bytes) +if prepared is not anonymize.load_prepared_package(package_bytes): + raise AssertionError("facade package cache did not reuse prepared search") +from_file = anonymize.load_prepared_package_file(package_path) +print( + json.dumps( + { + "from_bytes": json.loads( + prepared.redact_static_entities_json( + payload["text"], + payload.get("operators_json"), + ) + ), + "from_file": json.loads( + from_file.redact_static_entities_json( + payload["text"], + 
payload.get("operators_json"), + ) + ), + "version": anonymize.native_package_version(), + } + ) +) +`; + +const PYTHON_SHARED_SDK_PARITY_SCRIPT = ` +import json +import os +import pathlib +import sys + +module_root = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]).parent.parent +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +package_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PACKAGE"]) +sys.path.insert(0, str(module_root)) + +import stella_anonymize as anonymize + +payload = json.loads(payload_path.read_text()) +package_bytes = package_path.read_bytes() +top_level = payload["top_level_functions"] +prepared_methods = payload["prepared_methods"] +class_names = payload["class_names"] +missing_top_level = [ + name for name in top_level if not callable(getattr(anonymize, name, None)) +] +if missing_top_level: + raise AssertionError(f"missing Python SDK functions: {missing_top_level}") +missing_public_names = [ + name for name in [*top_level, *class_names] if name not in anonymize.__all__ +] +if missing_public_names: + raise AssertionError(f"missing Python SDK public names: {missing_public_names}") +missing_classes = [ + name for name in class_names if not callable(getattr(anonymize, name, None)) +] +if missing_classes: + raise AssertionError(f"missing Python SDK classes: {missing_classes}") +prepared = anonymize.load_prepared_package(package_bytes) +if prepared is not anonymize.load_prepared_package(package_bytes): + raise AssertionError("facade package cache did not reuse prepared search") +missing_prepared = [ + name for name in prepared_methods if not callable(getattr(prepared, name, None)) +] +if missing_prepared: + raise AssertionError(f"missing Python prepared methods: {missing_prepared}") +from_file = anonymize.load_prepared_package_file(package_path) +if anonymize.prepare_search_package( + payload["config_json"], + compressed=payload["compressed"], +) != package_bytes: + raise AssertionError("facade package bytes differ") + 
+def redact_with(instance, item): + return json.loads( + instance.redact_text_json( + item["text"], + item.get("operators"), + redact_string=item.get("redact_string"), + ) + ) + +def redact_object_with_top_level(item): + result = anonymize.redact_text( + payload["config_json"], + item["text"], + item.get("operators"), + redact_string=item.get("redact_string"), + ) + return { + "resolved_entities": [ + { + "label": entity.label, + "text": entity.text, + "score": entity.score, + "source": entity.source, + "source_detail": entity.source_detail, + } + for entity in result.resolved_entities + ], + "redaction": { + "redacted_text": result.redaction.redacted_text, + "redaction_map": [ + { + "placeholder": entry.placeholder, + "original": entry.original, + } + for entry in result.redaction.redaction_map + ], + "operator_map": [ + { + "placeholder": entry.placeholder, + "operator": entry.operator, + } + for entry in result.redaction.operator_map + ], + "entity_count": result.redaction.entity_count, + }, + } + +print( + json.dumps( + { + "from_bytes": [ + redact_with(prepared, item) for item in payload["cases"] + ], + "from_file": [ + redact_with(from_file, item) for item in payload["cases"] + ], + "top_level": [ + json.loads( + anonymize.redact_text_json( + payload["config_json"], + item["text"], + item.get("operators"), + redact_string=item.get("redact_string"), + ) + ) + for item in payload["cases"] + ], + "top_level_object": [ + redact_object_with_top_level(item) for item in payload["cases"] + ], + "normalized": anonymize.normalize_for_search(payload["normalize_text"]), + "version": anonymize.native_package_version(), + } + ) +) +`; + +let loadedAdapters: { + native: NativeAdapter; + pythonModulePath: string; + tempDir: string; +} | null = null; + +const gapArb = fc + .array( + fc.constantFrom( + " ", + "\t", + "\n", + ".", + ",", + ";", + ":", + "(", + ")", + "a", + "e", + "n", + "r", + "s", + "t", + "č", + "ř", + "á", + "ü", + ), + { maxLength: 12 }, + ) + .map((chars) 
=> chars.join("")); + +const registrationArb = fc + .record({ + prefix: fc.tuple( + fc.constantFrom("A", "B", "C", "D", "E", "F"), + fc.constantFrom("G", "H", "I", "J", "K", "L"), + ), + serial: fc.integer({ min: 0, max: 9999 }), + }) + .map( + ({ prefix, serial }) => + `${prefix.join("")}${String(serial).padStart(4, "0")}`, + ); + +const matterArb = fc + .integer({ min: 0, max: 999 }) + .map((value) => `MAT-${String(value).padStart(3, "0")}`); + +const fuzzyPlaceArb = fc.constantFrom("Fuzztovn", "Fuzztawn", "Fuzztowm"); + +const operatorsArb = fc.option( + fc.constantFrom( + { country: "redact" }, + { address: "redact", country: "redact" }, + { "matter id": "redact" }, + { matter: "redact" }, + ), + { nil: null }, +); + +const generatedCaseArb: fc.Arbitrary = fc + .record({ + left: gapArb, + middle: gapArb, + right: gapArb, + registration: registrationArb, + matter: matterArb, + fuzzyPlace: fuzzyPlaceArb, + operators: operatorsArb, + }) + .map( + ({ left, middle, right, registration, matter, fuzzyPlace, operators }) => { + const text = + `${left}Reference ${registration} for Acme s.r.o. 
near ` + + `${fuzzyPlace}, Turkey, Prague, matter ${matter}, code Secret Code.` + + `${middle}${right}`; + return { + text, + operators, + sensitiveValues: [ + registration, + "Acme s.r.o.", + fuzzyPlace, + "Turkey", + "Prague", + matter, + "Secret Code", + ], + }; + }, + ); + +describe("native adapter parity", () => { + test("native adapter versions match package metadata", () => { + const adapters = getAdapters(); + const packageVersion = packageJsonVersion(); + + expect(getNativeBindingVersion(adapters.native)).toBe(packageVersion); + expect(callPythonVersion(adapters.pythonModulePath)).toBe(packageVersion); + expect(() => + assertNativeBindingVersion({ + binding: adapters.native, + expectedVersion: packageVersion, + }), + ).not.toThrow(); + expect(() => + assertNativeBindingVersion({ + binding: adapters.native, + expectedVersion: "0.0.0", + }), + ).toThrow(); + }); + + test("normalization is identical through TS and Python adapters", () => { + const adapters = getAdapters(); + const text = "Číslo\u00a0PAS - 1234 / Fuzztovn"; + + expect(callPythonNormalize(adapters.pythonModulePath, text)).toBe( + adapters.native.normalizeForSearch(text), + ); + }); + + test("generated static-redaction fixtures match exactly", () => { + const adapters = getAdapters(); + + fc.assert( + fc.property( + fc.array(generatedCaseArb, { minLength: 10, maxLength: 40 }), + (cases) => { + const tsResults = cases.map(({ text, operators }) => + runTsAdapter(adapters.native, text, operators), + ); + const pyResults = runPythonAdapters( + adapters.pythonModulePath, + cases, + adapters.tempDir, + ); + + expect(pyResults).toEqual(tsResults); + for (const [index, item] of cases.entries()) { + const result = tsResults.at(index); + expect(result).toBeDefined(); + expect(result?.redaction.entity_count).toBe(7); + for (const value of item.sensitiveValues) { + expect(result?.redaction.redacted_text).not.toContain(value); + } + } + }, + ), + { numRuns: 5, seed: 20_260_624 }, + ); + }); + + test("adapter 
result offsets slice source text after multibyte prefixes", () => { + const adapters = getAdapters(); + const text = + "č Reference AB1234 for Acme s.r.o. near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + + const tsResult = runTsAdapter(adapters.native, text, null); + const pyResult = runPythonAdapters( + adapters.pythonModulePath, + [ + { + text, + operators: null, + sensitiveValues: [], + }, + ], + adapters.tempDir, + ).at(0); + + expect(pyResult).toEqual(tsResult); + const registration = tsResult.resolved_entities.find( + (entity) => entity.label === "registration number", + ); + expect(registration).toBeDefined(); + if (!registration) { + return; + } + expect(text.slice(registration.start, registration.end)).toBe("AB1234"); + }); + + test("Python-native offsets slice source text after astral prefixes", () => { + const adapters = getAdapters(); + const text = "🙂 Reference AB1234 for Acme s.r.o."; + + const tsResult = runTsAdapter(adapters.native, text, null); + const registration = tsResult.resolved_entities.find( + (entity) => entity.label === "registration number", + ); + expect(registration).toBeDefined(); + if (!registration) { + return; + } + expect(text.slice(registration.start, registration.end)).toBe("AB1234"); + + const pythonSlice = callPythonNativeOffsetSlice( + adapters.pythonModulePath, + text, + "registration number", + "AB1234", + null, + ); + + expect(pythonSlice).toEqual({ + start: 12, + end: 18, + slice: "AB1234", + text: "AB1234", + }); + expect(pythonSlice.start).toBe(registration.start - 1); + expect(pythonSlice.end).toBe(registration.end - 1); + }); + + test("prepared search accepts config JSON bytes", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. 
near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + + const stringPrepared = new adapters.native.NativePreparedSearch( + CONFIG_JSON, + ); + const bytesPrepared = + adapters.native.NativePreparedSearch.fromConfigJsonBytes( + Buffer.from(CONFIG_JSON), + ); + + expect(bytesPrepared.redactStaticEntities(text)).toEqual( + stringPrepared.redactStaticEntities(text), + ); + }); + + test("prepared search accepts artifact bytes through TS and Python adapters", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const configBytes = Buffer.from(CONFIG_JSON); + const artifactBytes = + adapters.native.prepareStaticSearchArtifactsBytes(configBytes); + const direct = new adapters.native.NativePreparedSearch(CONFIG_JSON); + const prepared = + adapters.native.NativePreparedSearch.fromConfigJsonAndArtifactBytes( + configBytes, + artifactBytes, + ); + + expect(prepared.redactStaticEntities(text)).toEqual( + direct.redactStaticEntities(text), + ); + const expectedJson = JSON.parse( + adapters.native.redactStaticEntitiesJson(CONFIG_JSON, text), + ); + expect( + callPythonPreparedWithArtifacts( + adapters.pythonModulePath, + adapters.tempDir, + artifactBytes, + text, + null, + ), + ).toEqual(expectedJson); + }); + + test("prepared search accepts package bytes through TS and Python adapters", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. 
near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const configBytes = Buffer.from(CONFIG_JSON); + const packageBytes = + adapters.native.prepareStaticSearchPackageBytes(configBytes); + const direct = new adapters.native.NativePreparedSearch(CONFIG_JSON); + const prepared = + adapters.native.NativePreparedSearch.fromPreparedPackageBytes( + packageBytes, + ); + const diagnosticsJson = prepared.prepareDiagnosticsJson?.(); + if (diagnosticsJson === undefined) { + throw new Error("missing prepare diagnostics"); + } + const diagnostics = JSON.parse(diagnosticsJson); + + expect(prepared.redactStaticEntities(text)).toEqual( + direct.redactStaticEntities(text), + ); + expect( + diagnostics.events.some( + (event: { stage?: unknown }) => event.stage === "prepare.cache.hit", + ), + ).toBe(true); + const runDiagnosticsJson = + prepared.redactStaticEntitiesDiagnosticsJson?.(text); + if (runDiagnosticsJson === undefined) { + throw new Error("missing prepared run diagnostics"); + } + const runDiagnostics = JSON.parse( + runDiagnosticsJson, + ) as StaticRedactionDiagnosticResult; + expect( + runDiagnostics.diagnostics.events.some( + (event) => event.stage === "prepare.cache.hit", + ), + ).toBe(true); + const expectedJson = JSON.parse( + adapters.native.redactStaticEntitiesJson(CONFIG_JSON, text), + ); + expect( + callPythonPreparedWithPackage( + adapters.pythonModulePath, + adapters.tempDir, + packageBytes, + text, + null, + ), + ).toEqual(expectedJson); + }); + + test("prepared package cache verifies same-length corrupted bytes", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. 
near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const configBytes = Buffer.from(CONFIG_JSON); + const packageBytes = + adapters.native.prepareStaticSearchPackageBytes(configBytes); + + const prepared = + adapters.native.NativePreparedSearch.fromPreparedPackageBytes( + packageBytes, + ); + expect(prepared.redactStaticEntities(text)).toBeDefined(); + + const corrupted = Buffer.from(packageBytes); + const lastIndex = corrupted.length - 1; + const lastByte = corrupted.at(lastIndex); + if (lastByte === undefined) { + throw new Error("prepared package unexpectedly empty"); + } + corrupted.writeUInt8(lastByte ^ 0x01, lastIndex); + + expect(() => + adapters.native.NativePreparedSearch.fromPreparedPackageBytes(corrupted), + ).toThrow(); + }); + + test("prepared search accepts compressed package bytes through TS and Python adapters", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const configBytes = Buffer.from(CONFIG_JSON); + const packageBytes = + adapters.native.prepareStaticSearchCompressedPackageBytes(configBytes); + const direct = new adapters.native.NativePreparedSearch(CONFIG_JSON); + const prepared = + adapters.native.NativePreparedSearch.fromPreparedPackageBytes( + packageBytes, + ); + const diagnosticsJson = prepared.prepareDiagnosticsJson?.(); + if (diagnosticsJson === undefined) { + throw new Error("missing prepare diagnostics"); + } + const diagnostics = JSON.parse(diagnosticsJson); + + expect(prepared.redactStaticEntities(text)).toEqual( + direct.redactStaticEntities(text), + ); + expect( + diagnostics.events.some( + (event: { stage?: unknown }) => event.stage === "prepare.cache.hit", + ), + ).toBe(true); + const expectedJson = JSON.parse( + adapters.native.redactStaticEntitiesJson(CONFIG_JSON, text), + ); + expect( + callPythonPreparedWithPackage( + adapters.pythonModulePath, + adapters.tempDir, + packageBytes, 
+ text, + null, + "prepare_static_search_compressed_package_bytes", + ), + ).toEqual(expectedJson); + }); + + test("Python package facade loads compressed package bytes", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const configBytes = Buffer.from(CONFIG_JSON); + const packageBytes = + adapters.native.prepareStaticSearchCompressedPackageBytes(configBytes); + const expectedJson = JSON.parse( + adapters.native.redactStaticEntitiesJson(CONFIG_JSON, text), + ); + const result = callPythonPackageFacade({ + pythonModulePath: adapters.pythonModulePath, + tempDir: adapters.tempDir, + packageBytes, + text, + operators: null, + compressed: true, + }); + + expect(result.from_bytes).toEqual(expectedJson); + expect(result.from_file).toEqual(expectedJson); + expect(result.version).toBe(packageJsonVersion()); + }); + + test("shared TS and Python SDK facades match Rust core JSON", () => { + const adapters = getAdapters(); + const config = JSON.parse(CONFIG_JSON); + const packageBytes = prepare_search_package({ + binding: adapters.native, + config: CONFIG_JSON, + compressed: true, + }); + const prepared = load_prepared_package({ + binding: adapters.native, + packageBytes, + }); + const cases: SharedSdkParityCase[] = [ + { + text: + "č Reference AB1234 for Acme s.r.o. near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code.", + operators: null, + }, + { + text: + "🙂 Reference CD9876 for Acme s.r.o. 
near Fuzztovn, Turkey, " + + "Prague, matter MAT-456, code Secret Code.", + operators: { + operators: { country: "redact", "matter id": "redact" }, + redactString: "***", + }, + }, + ]; + + const tsSdkFunctions: Record< + (typeof SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS)[number], + unknown + > = { + diagnostics_json, + load_prepared_package, + native_package_version, + normalize_for_search, + prepare_search_package, + redact_text, + redact_text_json, + }; + for (const name of SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS) { + expect(typeof tsSdkFunctions[name]).toBe("function"); + } + const tsSdkClasses: Record< + (typeof SHARED_NATIVE_SDK_CLASS_NAMES)[number], + unknown + > = { + PreparedAnonymizer, + PreparedSearch, + }; + for (const name of SHARED_NATIVE_SDK_CLASS_NAMES) { + expect(typeof tsSdkClasses[name]).toBe("function"); + } + const preparedApi = prepared as unknown as Record; + for (const name of SHARED_NATIVE_SDK_PREPARED_METHODS) { + expect(typeof preparedApi[name]).toBe("function"); + } + + expect(native_package_version(adapters.native)).toBe(packageJsonVersion()); + expect( + normalize_for_search({ + binding: adapters.native, + text: "Číslo\u00a0PAS - 1234", + }), + ).toBe(adapters.native.normalizeForSearch("Číslo\u00a0PAS - 1234")); + expect([ + ...prepare_search_package({ binding: adapters.native, config }), + ]).toEqual([...packageBytes]); + + const rustCoreJson = callRustCoreSharedSdkParity(adapters.tempDir, cases); + const tsSdkJson = cases.map(({ text, operators }) => + JSON.parse(prepared.redact_text_json(text, operators ?? undefined)), + ); + + expect(tsSdkJson).toEqual(rustCoreJson); + expect( + cases.map(({ text, operators }) => + toBindingStaticResult( + redact_text({ + binding: adapters.native, + config: CONFIG_JSON, + fullText: text, + ...(operators !== null ? 
{ operators } : {}), + }), + ), + ), + ).toEqual(rustCoreJson); + expect( + cases.map(({ text, operators }) => + JSON.parse( + redact_text_json({ + binding: adapters.native, + config: CONFIG_JSON, + fullText: text, + ...(operators !== null ? { operators } : {}), + }), + ), + ), + ).toEqual(rustCoreJson); + const diagnosticsJson = prepared.diagnostics_json(cases[0].text); + if (diagnosticsJson === null) { + throw new Error("missing shared SDK diagnostics"); + } + expect(diagnosticsJson).toContain('"diagnostics"'); + const topLevelDiagnosticsJson = diagnostics_json({ + binding: adapters.native, + config: CONFIG_JSON, + fullText: cases[0].text, + }); + if (topLevelDiagnosticsJson === null) { + throw new Error("missing top-level shared SDK diagnostics"); + } + expect(topLevelDiagnosticsJson).toContain('"diagnostics"'); + + const python = callPythonSharedSdkParity({ + pythonModulePath: adapters.pythonModulePath, + tempDir: adapters.tempDir, + packageBytes: Buffer.from(packageBytes), + cases, + normalizeText: "Číslo\u00a0PAS - 1234", + }); + + expect(python.from_bytes).toEqual(rustCoreJson); + expect(python.from_file).toEqual(rustCoreJson); + expect(python.top_level).toEqual(rustCoreJson); + expect(python.top_level_object).toEqual( + rustCoreJson.map(withoutEntityOffsets), + ); + expect(python.normalized).toBe( + adapters.native.normalizeForSearch("Číslo\u00a0PAS - 1234"), + ); + expect(python.version).toBe(packageJsonVersion()); + }); + + test("native facade redacts from compressed package bytes", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. 
near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const operators: NativeOperatorConfig = { + operators: { country: "redact" }, + redactString: "***", + }; + const packageBytes = prepareNativeSearchPackage({ + binding: adapters.native, + config: JSON.parse(CONFIG_JSON), + compressed: true, + }); + const anonymizer = createNativeAnonymizerFromPackage({ + binding: adapters.native, + packageBytes, + }); + const expected: StaticRedactionResult = JSON.parse( + adapters.native.redactStaticEntitiesJson( + CONFIG_JSON, + text, + JSON.stringify(operators), + ), + ); + + const result = anonymizer.redactStaticEntities(text, operators); + + expect(result.resolvedEntities).toEqual( + expected.resolved_entities.map(toNativeFacadeEntity), + ); + expect(result.redaction.redactedText).toBe( + expected.redaction.redacted_text, + ); + expect(result.redaction.entityCount).toBe(expected.redaction.entity_count); + expect([...result.redaction.redactionMap.entries()]).toEqual( + expected.redaction.redaction_map.map(({ placeholder, original }) => [ + placeholder, + original, + ]), + ); + expect([...result.redaction.operatorMap.entries()]).toEqual( + expected.redaction.operator_map.map(({ placeholder, operator }) => [ + placeholder, + operator, + ]), + ); + expect(result.redaction.redactedText).toContain("***"); + }); + + test("native pipeline package matches TS static pipeline redaction", async () => { + const adapters = getAdapters(); + const fullText = + "Project Nebula and Blue Harbour signed MAT-123 on 2024-01-02. 
" + + "Acme s.r.o.\n/s/ Jane Doe"; + const config: PipelineConfig = { + threshold: 0.3, + enableTriggerPhrases: true, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: false, + enableDenyList: true, + customDenyList: [ + { + value: "Project Nebula", + label: "organization", + variants: ["Nebula Programme"], + }, + ], + customRegexes: [ + { pattern: "\\bMAT-\\d{3}\\b", label: "matter id", score: 1 }, + ], + enableGazetteer: true, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["organization", "date", "person", "matter id"], + workspaceId: "native-pipeline-static-test", + }; + const gazetteerEntries = [ + { + id: "blue-harbour", + canonical: "Blue Harbor Capital", + label: "organization", + variants: ["Blue Harbour"], + workspaceId: "native-pipeline-static-test", + createdAt: 0, + source: "manual" as const, + }, + ]; + const operators: NativeOperatorConfig & OperatorConfig = { + operators: { "matter id": "redact" }, + redactString: "***", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + gazetteerEntries, + context: createPipelineContext(), + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries, + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + + test("native pipeline package matches TS address context 
redaction", async () => { + const adapters = getAdapters(); + const fullText = + "ACME s.r.o.\nEvropska 710\n160 00 Praha\n" + "body ".repeat(200); + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + customRegexes: [ + { pattern: "ACME s\\.r\\.o\\.", label: "organization", score: 1 }, + ], + labels: ["organization", "address"], + workspaceId: "native-pipeline-address-context-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context: createPipelineContext(), + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsContext = createPipelineContext(); + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ label: "address", text: "Evropska 710" }), + ]), + ); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + + test("native pipeline package matches TS confidence boost redaction", async () => { + const adapters = getAdapters(); + const fullText = "ANCHOR-123 signed with NEAR-456."; + const config: PipelineConfig = { 
+ threshold: 0.5, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: true, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + customRegexes: [ + { + pattern: "\\bANCHOR-\\d+\\b", + label: "registration number", + score: 0.95, + }, + { pattern: "\\bNEAR-\\d+\\b", label: "matter id", score: 0.45 }, + ], + labels: ["registration number", "matter id"], + workspaceId: "native-pipeline-confidence-boost-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities.some(({ text }) => text === "NEAR-456")).toBe(true); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + + test("native pipeline package matches trigger-only legal suffix reclassification", async () => { + const adapters = getAdapters(); + const fullText = "jednatelem Novák Partners s.r.o. 
na základě plné moci."; + const config: PipelineConfig = { + threshold: 0.3, + enableTriggerPhrases: true, + enableRegex: false, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["organization"], + workspaceId: "native-pipeline-trigger-suffix-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + label: "organization", + text: expect.stringContaining("s.r.o."), + source: "trigger", + }), + ]), + ); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + + test("native pipeline package matches TS hotword reclassification", async () => { + const adapters = getAdapters(); + const fullText = "narozen dne 12.03.1990 v Praze"; + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + 
enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: true, + enableZoneClassification: false, + labels: ["date of birth"], + workspaceId: "native-pipeline-hotword-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + label: "date of birth", + text: "12.03.1990", + }), + ]), + ); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + +/** Parity: with regex, legal forms and coreference enabled, the TS pipeline must emit a coreference-sourced organization Acme whose corefSourceText is Acme LLC, and the native package output must equal the TS entities and redaction. */ test("native pipeline package matches TS organization propagation", async () => { + const adapters = getAdapters(); + const fullText = "Acme LLC signed. 
Acme paid."; + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: true, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["organization"], + workspaceId: "native-pipeline-coreference-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + label: "organization", + text: "Acme", + source: "coreference", + corefSourceText: "Acme LLC", + }), + ]), + ); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + +/** Parity for the Acme Kft. input: the TS pipeline must not emit a coreference-sourced Acme entity, and the native package output must equal the TS entities and redaction. */ test("native pipeline package keeps org propagation suffixes in TS parity", async () => { + const adapters = getAdapters(); + const fullText = "Acme Kft. signed. 
Acme paid."; + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: true, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["organization"], + workspaceId: "native-pipeline-coreference-suffix-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context: createPipelineContext(), + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect( + tsEntities.some( + (entity) => entity.source === "coreference" && entity.text === "Acme", + ), + ).toBe(false); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + +/** Parity: with only trigger phrases enabled, the TS pipeline must include a monetary amount entity spanning the figure, the currency and the parenthesised amount in words, and the native package output must equal the TS entities and redaction. */ test("native pipeline package matches TS trigger monetary widening", async () => { + const adapters = getAdapters(); + const fullText = + "Smluvní pokuta je sjednána ve výši 50.000,- Kč (slovy: padesát tisíc korun českých)."; + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: true, + enableRegex: false, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: 
false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["monetary amount"], + workspaceId: "native-pipeline-trigger-money-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context: createPipelineContext(), + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + label: "monetary amount", + text: "50.000,- Kč (slovy: padesát tisíc korun českých)", + }), + ]), + ); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + +/** Parity: with zone classification enabled, the custom-regex match Alice (configured score 0.45) must be the only TS entity and carry score 0.55, and the native package output must equal the TS entities and redaction. */ test("native pipeline package matches TS zone score adjustments", async () => { + const adapters = getAdapters(); + const fullText = ["Parties", "Alice", "Article 1", "Body"].join("\n"); + const config: PipelineConfig = { + threshold: 0.5, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: true, + customRegexes: [{ pattern: "Alice", label: "person", score: 0.45 }], 
+ labels: ["person"], + workspaceId: "native-pipeline-zone-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual([ + expect.objectContaining({ label: "person", text: "Alice", score: 0.55 }), + ]); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + +/** Parity: with name corpus and deny list enabled at threshold 0.85, the only TS entity must be person Sato Kenji with score 0.9, and the native package output must equal the TS entities and redaction. */ test("native pipeline package matches TS supplemental name corpus", async () => { + const adapters = getAdapters(); + const fullText = "The agreement is signed by Sato Kenji."; + const config: PipelineConfig = { + threshold: 0.85, + enableTriggerPhrases: false, + enableRegex: false, + enableLegalForms: false, + enableNameCorpus: true, + enableDenyList: true, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["person"], + workspaceId: "native-pipeline-name-corpus-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + 
config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + + expect(tsEntities).toEqual([ + expect.objectContaining({ + label: "person", + text: "Sato Kenji", + score: 0.9, + }), + ]); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + +/** Parity: the TS pipeline must find person Sato Kenji plus an address that contains 100 Main Street but not the name, and the native package output must equal the TS entities and redaction. */ test("native pipeline keeps supplemental names outside address seeds", async () => { + const adapters = getAdapters(); + const fullText = + "Sato Kenji, address: 100 Main Street, Boston, MA 02101-1234."; + const config: PipelineConfig = { + threshold: 0.85, + enableTriggerPhrases: false, + enableRegex: false, + enableLegalForms: false, + enableNameCorpus: true, + enableDenyList: true, + enableGazetteer: false, + enableCountries: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["person", "address"], + workspaceId: "native-pipeline-name-address-boundary-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + + const context = createPipelineContext(); + const packageBytes = await prepareNativePipelinePackage({ + binding: adapters.native, + config, + context, + compressed: true, + }); + const nativePipeline = createNativePipelineFromPackage({ + binding: adapters.native, + packageBytes, + }); + const tsContext = createPipelineContext(); + const operators: OperatorConfig & 
NativeOperatorConfig = { + operators: {}, + redactString: "[REDACTED]", + }; + const tsEntities = await runPipeline({ + fullText, + config, + gazetteerEntries: [], + context: tsContext, + }); + const tsRedaction = redactText(fullText, tsEntities, operators, tsContext); + const address = tsEntities.find((entity) => entity.label === "address"); + + expect(tsEntities).toEqual( + expect.arrayContaining([ + expect.objectContaining({ label: "person", text: "Sato Kenji" }), + ]), + ); + expect(address?.text).toContain("100 Main Street"); + expect(address?.text).not.toContain("Sato Kenji"); + expect( + toBindingStaticResult(nativePipeline.redactText(fullText, operators)), + ).toEqual({ + resolved_entities: tsEntities.map(toBindingEntity), + redaction: toBindingRedactionResult(tsRedaction), + }); + }); + +/** A config with enableNer set (alongside the other passes) must be reported as unsupported, with enableNer as the only unsupported feature. */ test("native pipeline compatibility rejects TS-only contextual passes", () => { + const config: PipelineConfig = { + threshold: 0.3, + enableTriggerPhrases: true, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: true, + enableDenyList: true, + enableGazetteer: false, + enableNer: true, + enableConfidenceBoost: true, + enableCoreference: true, + enableHotwordRules: true, + enableZoneClassification: true, + labels: [...DEFAULT_ENTITY_LABELS], + workspaceId: "native-pipeline-compat-test", + }; + + expect(getNativePipelineCompatibility(config)).toEqual({ + status: "unsupported", + unsupportedFeatures: ["enableNer"], + }); + }); + +/** A regex-only config limited to the address label must be reported as supported. */ test("native pipeline compatibility accepts address context passes", () => { + const config: PipelineConfig = { + threshold: 0.85, + enableTriggerPhrases: false, + enableRegex: true, + enableLegalForms: false, + enableNameCorpus: false, + enableDenyList: false, + enableGazetteer: false, + enableNer: false, + enableConfidenceBoost: false, + enableCoreference: false, + enableHotwordRules: false, + enableZoneClassification: false, + labels: ["address"], + workspaceId: "native-pipeline-address-context-test", + }; + + 
expect(getNativePipelineCompatibility(config)).toEqual({ + status: "supported", + }); + }); + +/** For each contract fixture language: builds a compressed native search package from the language-scoped config and requires the Python prepared-package result to equal the native facade result for every fixture. NOTE(review): Parameters[0] below has no type argument, presumably lost in extraction; confirm against the real file. */ test( + "native facade and Python match on contract fixture packages", + async () => { + const adapters = getAdapters(); + for (const language of CONTRACT_FIXTURE_LANGUAGES) { + const fixtures = loadContractFixtureCases(language); + const scopedConfig = applyPipelineLanguageScope({ + ...contractTestConfig(`native-facade-fixture-parity-${language}`), + language, + }); + const dictionaryScope: Parameters[0] = {}; + if (scopedConfig.denyListCountries !== undefined) { + dictionaryScope.denyListCountries = scopedConfig.denyListCountries; + } + if (scopedConfig.nameCorpusLanguages !== undefined) { + dictionaryScope.nameCorpusLanguages = + scopedConfig.nameCorpusLanguages; + } + const dictionaries = await loadTestDictionaries(dictionaryScope); + const search = await preparePipelineSearch({ + config: { + ...scopedConfig, + dictionaries, + }, + context: createPipelineContext(), + }); + const configJson = JSON.stringify(search.nativeStaticConfig); + const packageBytes = prepareNativeSearchPackage({ + binding: adapters.native, + config: search.nativeStaticConfig, + compressed: true, + }); + const anonymizer = createNativeAnonymizerFromPackage({ + binding: adapters.native, + packageBytes, + }); + + const tsResults = fixtures.map(({ text }) => + toBindingStaticResult(anonymizer.redactStaticEntities(text)), + ); + const pyResults = callPythonPreparedPackageCases( + adapters.pythonModulePath, + adapters.tempDir, + Buffer.from(packageBytes), + fixtures.map(({ text }) => ({ text, operators: null })), + "prepare_static_search_compressed_package_bytes", + configJson, + ); + + for (const [index, fixture] of fixtures.entries()) { + expect({ + fixture: `${language}/${fixture.name}`, + result: pyResults.at(index), + }).toEqual({ + fixture: `${language}/${fixture.name}`, + result: tsResults.at(index), + }); + } + } + }, + SLOW_NATIVE_FIXTURE_PARITY_TIMEOUT_MS, + ); + +/** For each language in NATIVE_FIXTURE_IMPROVEMENTS: runs the named fixtures through a compressed native package and checks the listed includes and excludes entities. */ test("native fixture 
improvements are explicit", async () => { + const adapters = getAdapters(); + const languages = [ + ...new Set(NATIVE_FIXTURE_IMPROVEMENTS.map(({ language }) => language)), + ]; + + for (const language of languages) { + const fixtures = new Map( + loadContractFixtureCases(language).map(({ name, text }) => [ + name, + text, + ]), + ); + const scopedConfig = applyPipelineLanguageScope({ + ...contractTestConfig(`native-fixture-improvements-${language}`), + language, + }); + const dictionaryScope: Parameters[0] = {}; + if (scopedConfig.denyListCountries !== undefined) { + dictionaryScope.denyListCountries = scopedConfig.denyListCountries; + } + if (scopedConfig.nameCorpusLanguages !== undefined) { + dictionaryScope.nameCorpusLanguages = scopedConfig.nameCorpusLanguages; + } + const dictionaries = await loadTestDictionaries(dictionaryScope); + const search = await preparePipelineSearch({ + config: { + ...scopedConfig, + dictionaries, + }, + context: createPipelineContext(), + }); + const packageBytes = prepareNativeSearchPackage({ + binding: adapters.native, + config: search.nativeStaticConfig, + compressed: true, + }); + const anonymizer = createNativeAnonymizerFromPackage({ + binding: adapters.native, + packageBytes, + }); + + for (const improvement of NATIVE_FIXTURE_IMPROVEMENTS.filter( + (item) => item.language === language, + )) { + const text = fixtures.get(improvement.fixture); + expect(text).toBeDefined(); + if (text === undefined) { + continue; + } + + const result = toBindingStaticResult( + anonymizer.redactStaticEntities(text), + ); + for (const entity of improvement.includes ?? []) { + expectNativeFixtureEntity(result, entity); + } + for (const entity of improvement.excludes ?? []) { + expectNativeFixtureEntityAbsent(result, entity); + } + } + } + }); + +/** Operators JSON carrying a camel-case redactString of three asterisks must yield redacted text that contains that string. */ test("JSON operator config accepts camel-case redactString", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. 
near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + + const result = JSON.parse( + adapters.native.redactStaticEntitiesJson( + CONFIG_JSON, + text, + JSON.stringify({ + operators: { country: "redact" }, + redactString: "***", + }), + ), + ) as StaticRedactionResult; + + expect(result.redaction.redacted_text).toContain("***"); + }); + +/** Diagnostics from the native and Python adapters must be equal after stripDiagnosticTimings; also requires a timed stage-summary event, a search.literal stage-summary with a positive count, a resolution.sanitize entity event with span_valid true, and no event with a text field. */ test("diagnostics JSON is identical through TS and Python adapters", () => { + const adapters = getAdapters(); + const text = + "Reference AB1234 for Acme s.r.o. near Fuzztovn, Turkey, " + + "Prague, matter MAT-123, code Secret Code."; + const operators = { country: "redact" }; + + const tsResult = runTsDiagnosticsAdapter(adapters.native, text, operators); + const pyResult = callPythonDiagnostics( + adapters.pythonModulePath, + text, + operators, + ); + + expect(stripDiagnosticTimings(pyResult)).toEqual( + stripDiagnosticTimings(tsResult), + ); + expect( + tsResult.diagnostics.events.some( + (event) => + event.kind === "stage-summary" && + typeof event.elapsed_us === "number", + ), + ).toBe(true); + expect( + tsResult.diagnostics.events.some( + (event) => + event.stage === "search.literal" && + event.kind === "stage-summary" && + typeof event.count === "number" && + event.count > 0, + ), + ).toBe(true); + expect( + tsResult.diagnostics.events.some( + (event) => + event.stage === "resolution.sanitize" && + event.kind === "entity" && + event.span_valid === true, + ), + ).toBe(true); + expect( + tsResult.diagnostics.events.every((event) => event.text === undefined), + ).toBe(true); + }); +}); + +/** Returns the cached adapters; on first call builds the napi and py crates with cargo, copies the built libraries and the Python package __init__.py into a fresh temp directory, loads the native adapter and caches the result in loadedAdapters. */const getAdapters = () => { + if (loadedAdapters !== null) { + return loadedAdapters; + } + + runCommand("cargo", [ + "build", + "-p", + "stella-anonymize-napi", + "-p", + "stella-anonymize-py", + "--locked", + ]); + + const tempDir = mkdtempSync(join(tmpdir(), "stella-anonymize-native-")); + const napiPath = join(tempDir, "stella_anonymize_napi.node"); + const pythonPackageDir = join(tempDir, "stella_anonymize"); + 
mkdirSync(pythonPackageDir); + const pythonModulePath = join(pythonPackageDir, "_native.so"); + copyFileSync(nativeLibraryPath("stella_anonymize_napi"), napiPath); + copyFileSync(nativeLibraryPath("stella_anonymize_core_py"), pythonModulePath); + copyFileSync( + join(PYTHON_SOURCE_DIR, "stella_anonymize", "__init__.py"), + join(pythonPackageDir, "__init__.py"), + ); + + const native = loadNativeAdapter(napiPath); + loadedAdapters = { native, pythonModulePath, tempDir }; + return loadedAdapters; +}; + +/** Maps a library name to its path in TARGET_DIR: lib<name>.dylib on darwin, lib<name>.so on linux, and <name>.dll on any other platform. */const nativeLibraryPath = (name: string): string => { + if (process.platform === "darwin") { + return join(TARGET_DIR, `lib${name}.dylib`); + } + if (process.platform === "linux") { + return join(TARGET_DIR, `lib${name}.so`); + } + return join(TARGET_DIR, `${name}.dll`); +}; + +/** Loads the native module at nativePath through createRequire and returns its exports as a NativeAdapter; throws TypeError when any of the expected exports is not a function. */const loadNativeAdapter = (nativePath: string): NativeAdapter => { + const nativeRequire = createRequire(import.meta.url); + const loaded: unknown = nativeRequire(nativePath); + const normalizeForSearch = Reflect.get(Object(loaded), "normalizeForSearch"); + const NativePreparedSearch = Reflect.get( + Object(loaded), + "NativePreparedSearch", + ); + const nativePackageVersion = Reflect.get( + Object(loaded), + "nativePackageVersion", + ); + const redactStaticEntitiesJson = Reflect.get( + Object(loaded), + "redactStaticEntitiesJson", + ); + const redactStaticEntitiesDiagnosticsJson = Reflect.get( + Object(loaded), + "redactStaticEntitiesDiagnosticsJson", + ); + const prepareStaticSearchArtifactsBytes = Reflect.get( + Object(loaded), + "prepareStaticSearchArtifactsBytes", + ); + const prepareStaticSearchPackageBytes = Reflect.get( + Object(loaded), + "prepareStaticSearchPackageBytes", + ); + const prepareStaticSearchCompressedPackageBytes = Reflect.get( + Object(loaded), + "prepareStaticSearchCompressedPackageBytes", + ); + if ( + typeof NativePreparedSearch !== "function" || + typeof normalizeForSearch !== "function" || + typeof nativePackageVersion !== "function" || + typeof 
prepareStaticSearchArtifactsBytes !== "function" || + typeof prepareStaticSearchPackageBytes !== "function" || + typeof prepareStaticSearchCompressedPackageBytes !== "function" || + typeof redactStaticEntitiesJson !== "function" || + typeof redactStaticEntitiesDiagnosticsJson !== "function" + ) { + throw new TypeError("Native anonymize adapter exports are incomplete"); + } + return { + NativePreparedSearch: + NativePreparedSearch as NativeAdapter["NativePreparedSearch"], + normalizeForSearch, + nativePackageVersion, + prepareStaticSearchArtifactsBytes, + prepareStaticSearchPackageBytes, + prepareStaticSearchCompressedPackageBytes, + redactStaticEntitiesJson, + redactStaticEntitiesDiagnosticsJson, + }; +}; + +/** Calls redactStaticEntitiesJson with CONFIG_JSON, the text and the serialized operators, and returns the parsed JSON. NOTE(review): Record appears without type arguments in these signatures, presumably lost in extraction; confirm against the real file. */const runTsAdapter = ( + adapter: NativeAdapter, + text: string, + operators: Record | null, +): StaticRedactionResult => { + const operatorsJson = operatorConfigJson(operators); + return JSON.parse( + adapter.redactStaticEntitiesJson(CONFIG_JSON, text, operatorsJson), + ); +}; + +/** Calls redactStaticEntitiesDiagnosticsJson with CONFIG_JSON, the text and the serialized operators, and returns the parsed JSON. */const runTsDiagnosticsAdapter = ( + adapter: NativeAdapter, + text: string, + operators: Record | null, +): StaticRedactionDiagnosticResult => { + const operatorsJson = operatorConfigJson(operators); + return JSON.parse( + adapter.redactStaticEntitiesDiagnosticsJson( + CONFIG_JSON, + text, + operatorsJson, + ), + ); +}; + +/** Writes CONFIG_JSON and all cases to payload.json in tempDir, runs PYTHON_ADAPTER_SCRIPT under python3 with the payload and module paths in the environment, and returns the parsed output. */const runPythonAdapters = ( + pythonModulePath: string, + cases: GeneratedNativeCase[], + tempDir: string, +): StaticRedactionResult[] => { + const payloadPath = join(tempDir, "payload.json"); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + cases: cases.map(({ text, operators }) => ({ + text, + operators_json: operatorConfigJson(operators), + })), + }), + ); + + const output = runCommand("python3", ["-c", PYTHON_ADAPTER_SCRIPT], { + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }); + return JSON.parse(output); +}; + +/** Runs PYTHON_NATIVE_OFFSET_SCRIPT under python3 against a payload written to a fresh temp directory and returns the parsed output; the temp directory is removed in a finally block. */const callPythonNativeOffsetSlice = ( + pythonModulePath: string, + text: 
string, + label: string, + expected: string, + operators: Record | null, +): PythonNativeOffsetSlice => { + const payloadDir = mkdtempSync( + join(tmpdir(), "stella-anonymize-py-offsets-"), + ); + const payloadPath = join(payloadDir, "payload.json"); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + text, + label, + expected, + operators_json: operatorConfigJson(operators), + }), + ); + try { + const output = runCommand("python3", ["-c", PYTHON_NATIVE_OFFSET_SCRIPT], { + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }); + return JSON.parse(output); + } finally { + rmSync(payloadDir, { recursive: true, force: true }); + } +}; + +/** Runs an inline python3 script that loads the native module from its path and prints normalize_for_search for the payload text; returns that output with trailing whitespace trimmed. The temp payload directory is removed in a finally block. */const callPythonNormalize = ( + pythonModulePath: string, + text: string, +): string => { + const payloadDir = mkdtempSync(join(tmpdir(), "stella-anonymize-normalize-")); + const payloadPath = join(payloadDir, "payload.json"); + writeFileSync(payloadPath, JSON.stringify({ text })); + try { + return runCommand( + "python3", + [ + "-c", + ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +spec = importlib.util.spec_from_file_location( + "_native", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +print(module.normalize_for_search(payload["text"])) +`, + ], + { + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }, + ).trimEnd(); + } finally { + rmSync(payloadDir, { recursive: true, force: true }); + } +}; + +/** Runs PYTHON_VERSION_SCRIPT under python3 and returns its output with trailing whitespace trimmed. */const callPythonVersion = (pythonModulePath: string): string => + runCommand("python3", ["-c", PYTHON_VERSION_SCRIPT], { + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }).trimEnd(); + +/** Writes the artifact bytes and a JSON payload into tempDir, runs PYTHON_PREPARED_ARTIFACT_SCRIPT under python3, and returns the parsed output. */const callPythonPreparedWithArtifacts = ( + pythonModulePath: string, + tempDir: string, + 
artifactBytes: Buffer, + text: string, + operators: Record | null, +): StaticRedactionResult => { + const payloadPath = join(tempDir, "prepared-artifacts-payload.json"); + const artifactPath = join(tempDir, "prepared-artifacts.bin"); + writeFileSync(artifactPath, artifactBytes); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + text, + operators_json: operatorConfigJson(operators), + }), + ); + const output = runCommand( + "python3", + ["-c", PYTHON_PREPARED_ARTIFACT_SCRIPT], + { + STELLA_ANONYMIZE_ARTIFACTS: artifactPath, + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }, + ); + return JSON.parse(output); +}; + +/** Writes the package bytes and a JSON payload into tempDir and runs PYTHON_PREPARED_PACKAGE_SCRIPT under python3, passing prepareFn through the environment and configJson through the payload; returns the parsed output. */const callPythonPreparedWithPackage = ( + pythonModulePath: string, + tempDir: string, + packageBytes: Buffer, + text: string, + operators: Record | null, + prepareFn = "prepare_static_search_package_bytes", + configJson = CONFIG_JSON, +): StaticRedactionResult => { + const payloadPath = join(tempDir, "prepared-package-payload.json"); + const packagePath = join(tempDir, "prepared-package.bin"); + writeFileSync(packagePath, packageBytes); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: configJson, + text, + operators_json: operatorConfigJson(operators), + }), + ); + const output = runCommand("python3", ["-c", PYTHON_PREPARED_PACKAGE_SCRIPT], { + STELLA_ANONYMIZE_PACKAGE: packagePath, + STELLA_ANONYMIZE_PACKAGE_PREPARE_FN: prepareFn, + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }); + return JSON.parse(output); +}; + +/** Multi-case variant: writes the package bytes and all cases into tempDir, runs PYTHON_PREPARED_PACKAGE_CASES_SCRIPT under python3, and returns the parsed output. */const callPythonPreparedPackageCases = ( + pythonModulePath: string, + tempDir: string, + packageBytes: Buffer, + cases: Array<{ + text: string; + operators: Record | null; + }>, + prepareFn = "prepare_static_search_package_bytes", + configJson = CONFIG_JSON, +): StaticRedactionResult[] => { + const payloadPath = join(tempDir, "prepared-package-cases-payload.json"); + const packagePath = join(tempDir, 
"prepared-package-cases.bin"); + writeFileSync(packagePath, packageBytes); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: configJson, + cases: cases.map(({ text, operators }) => ({ + text, + operators_json: operatorConfigJson(operators), + })), + }), + ); + const output = runCommand( + "python3", + ["-c", PYTHON_PREPARED_PACKAGE_CASES_SCRIPT], + { + STELLA_ANONYMIZE_PACKAGE: packagePath, + STELLA_ANONYMIZE_PACKAGE_PREPARE_FN: prepareFn, + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }, + ); + return JSON.parse(output); +}; + +/** Arguments accepted by callPythonPackageFacade. */type PythonPackageFacadeOptions = { + pythonModulePath: string; + tempDir: string; + packageBytes: Buffer; + text: string; + operators: Record | null; + compressed: boolean; +}; + +/** Writes the package bytes and a payload (including the compressed flag) into tempDir, runs PYTHON_PACKAGE_FACADE_SCRIPT under python3, and returns the parsed output. */const callPythonPackageFacade = ({ + pythonModulePath, + tempDir, + packageBytes, + text, + operators, + compressed, +}: PythonPackageFacadeOptions): { + from_bytes: StaticRedactionResult; + from_file: StaticRedactionResult; + version: string; +} => { + const payloadPath = join(tempDir, "package-facade-payload.json"); + const packagePath = join(tempDir, "package-facade.bin"); + writeFileSync(packagePath, packageBytes); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + text, + operators_json: operatorConfigJson(operators), + compressed, + }), + ); + const output = runCommand("python3", ["-c", PYTHON_PACKAGE_FACADE_SCRIPT], { + STELLA_ANONYMIZE_PACKAGE: packagePath, + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }); + return JSON.parse(output); +}; + +/** Writes the cases to a payload file in tempDir and runs the native_adapter_parity cargo example of the adapter-contract crate with the payload path in STELLA_ANONYMIZE_PARITY_PAYLOAD; returns the parsed output. */const callRustCoreSharedSdkParity = ( + tempDir: string, + cases: SharedSdkParityCase[], +): StaticRedactionResult[] => { + const payloadPath = join(tempDir, "rust-core-shared-sdk-payload.json"); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + cases: cases.map(({ text, operators }) => ({ + text, + operators_json: nativeOperatorConfigJson(operators), + 
})), + }), + ); + const output = runCommand( + "cargo", + [ + "run", + "-p", + "stella-anonymize-adapter-contract", + "--example", + "native_adapter_parity", + "--locked", + "--quiet", + ], + { + STELLA_ANONYMIZE_PARITY_PAYLOAD: payloadPath, + }, + ); + return JSON.parse(output); +}; + +/** Arguments accepted by callPythonSharedSdkParity. */type PythonSharedSdkParityOptions = { + pythonModulePath: string; + tempDir: string; + packageBytes: Buffer; + cases: SharedSdkParityCase[]; + normalizeText: string; +}; + +/** Writes the package bytes and a payload (the cases plus the shared SDK class, prepared-method and top-level function name lists) into tempDir, runs PYTHON_SHARED_SDK_PARITY_SCRIPT under python3, and returns the parsed output. */const callPythonSharedSdkParity = ({ + pythonModulePath, + tempDir, + packageBytes, + cases, + normalizeText, +}: PythonSharedSdkParityOptions): { + from_bytes: StaticRedactionResult[]; + from_file: StaticRedactionResult[]; + top_level: StaticRedactionResult[]; + top_level_object: OffsetFreeStaticRedactionResult[]; + normalized: string; + version: string; +} => { + const payloadPath = join(tempDir, "shared-sdk-payload.json"); + const packagePath = join(tempDir, "shared-sdk-package.bin"); + writeFileSync(packagePath, packageBytes); + writeFileSync( + payloadPath, + JSON.stringify({ + cases: cases.map(({ text, operators }) => ({ + text, + operators: operators?.operators ?? 
null, + redact_string: operators?.redactString, + })), + class_names: SHARED_NATIVE_SDK_CLASS_NAMES, + compressed: true, + config_json: CONFIG_JSON, + normalize_text: normalizeText, + prepared_methods: SHARED_NATIVE_SDK_PREPARED_METHODS, + top_level_functions: SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS, + }), + ); + const output = runCommand( + "python3", + ["-c", PYTHON_SHARED_SDK_PARITY_SCRIPT], + { + STELLA_ANONYMIZE_PACKAGE: packagePath, + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }, + ); + return JSON.parse(output); +}; + +/** Runs an inline python3 script that loads the native module from its path and prints redact_static_entities_diagnostics_json for the payload; returns the parsed output. The temp payload directory is removed in a finally block. */const callPythonDiagnostics = ( + pythonModulePath: string, + text: string, + operators: Record | null, +): StaticRedactionDiagnosticResult => { + const payloadDir = mkdtempSync( + join(tmpdir(), "stella-anonymize-diagnostics-"), + ); + const payloadPath = join(payloadDir, "payload.json"); + writeFileSync( + payloadPath, + JSON.stringify({ + config_json: CONFIG_JSON, + text, + operators_json: operatorConfigJson(operators), + }), + ); + try { + const output = runCommand( + "python3", + [ + "-c", + ` +import importlib.util +import json +import os +import pathlib + +module_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PY_MODULE"]) +payload_path = pathlib.Path(os.environ["STELLA_ANONYMIZE_PAYLOAD"]) +spec = importlib.util.spec_from_file_location( + "_native", + module_path, +) +module = importlib.util.module_from_spec(spec) +spec.loader.exec_module(module) +payload = json.loads(payload_path.read_text()) +print( + module.redact_static_entities_diagnostics_json( + payload["config_json"], + payload["text"], + payload.get("operators_json"), + ) +) +`, + ], + { + STELLA_ANONYMIZE_PAYLOAD: payloadPath, + STELLA_ANONYMIZE_PY_MODULE: pythonModulePath, + }, + ); + return JSON.parse(output); + } finally { + rmSync(payloadDir, { recursive: true, force: true }); + } +}; + +/** Returns a copy of the result in which every diagnostic event has its elapsed_us field removed; result.result is passed through unchanged. */const stripDiagnosticTimings = ( + result: StaticRedactionDiagnosticResult, +): StaticRedactionDiagnosticResult => ({ + result: result.result, 
+ diagnostics: { + events: result.diagnostics.events.map( + ({ elapsed_us: _elapsedUs, ...event }) => event, + ), + }, +}); + +const toNativeFacadeEntity = ({ + source_detail: sourceDetail, + ...entity +}: StaticRedactionResult["resolved_entities"][number]) => ({ + ...entity, + ...(sourceDetail ? { sourceDetail } : {}), +}); + +const toBindingEntity = ( + entity: Entity, +): StaticRedactionResult["resolved_entities"][number] => ({ + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: entity.source, + source_detail: entity.sourceDetail ?? null, +}); + +const toBindingRedactionResult = ( + result: RedactionResult, +): StaticRedactionResult["redaction"] => ({ + redacted_text: result.redactedText, + redaction_map: [...result.redactionMap.entries()].map( + ([placeholder, original]) => ({ placeholder, original }), + ), + operator_map: [...result.operatorMap.entries()].map( + ([placeholder, operator]) => ({ placeholder, operator }), + ), + entity_count: result.entityCount, +}); + +const toBindingStaticResult = ( + result: NativeStaticRedactionResult, +): StaticRedactionResult => ({ + resolved_entities: result.resolvedEntities.map(toBindingPipelineEntity), + redaction: { + redacted_text: result.redaction.redactedText, + redaction_map: [...result.redaction.redactionMap.entries()].map( + ([placeholder, original]) => ({ placeholder, original }), + ), + operator_map: [...result.redaction.operatorMap.entries()].map( + ([placeholder, operator]) => ({ placeholder, operator }), + ), + entity_count: result.redaction.entityCount, + }, +}); + +const withoutEntityOffsets = ({ + resolved_entities, + redaction, +}: StaticRedactionResult): OffsetFreeStaticRedactionResult => ({ + resolved_entities: resolved_entities.map( + ({ start: _start, end: _end, ...entity }) => entity, + ), + redaction, +}); + +const toBindingPipelineEntity = ({ + sourceDetail, + ...entity +}: NativeStaticRedactionResult["resolvedEntities"][number]) => 
({ + ...entity, + source_detail: sourceDetail ?? null, +}); + +const loadContractFixtureCases = ( + language: (typeof CONTRACT_FIXTURE_LANGUAGES)[number], +): ContractFixtureCase[] => + readdirSync(join(CONTRACT_FIXTURES_DIR, language)) + .filter((name) => name.endsWith(".txt")) + .toSorted() + .map((name) => ({ + name, + text: readFileSync(join(CONTRACT_FIXTURES_DIR, language, name), "utf8"), + })); + +const findNativeFixtureEntity = ( + result: StaticRedactionResult, + expected: ExpectedNativeFixtureEntity, +) => + result.resolved_entities.find( + (entity) => + entity.label === expected.label && + entity.text === expected.text && + (expected.source === undefined || entity.source === expected.source), + ); + +const expectNativeFixtureEntity = ( + result: StaticRedactionResult, + expected: ExpectedNativeFixtureEntity, +) => { + expect(findNativeFixtureEntity(result, expected)).toMatchObject(expected); +}; + +const expectNativeFixtureEntityAbsent = ( + result: StaticRedactionResult, + expected: ExpectedNativeFixtureEntity, +) => { + expect(findNativeFixtureEntity(result, expected)).toBeUndefined(); +}; + +const packageJsonVersion = (): string => { + const packageJson = JSON.parse( + readFileSync(join(ROOT_DIR, "packages", "anonymize", "package.json"), { + encoding: "utf8", + }), + ) as { version?: unknown }; + if (typeof packageJson.version !== "string") { + throw new TypeError("Package version is missing"); + } + return packageJson.version; +}; + +const operatorConfigJson = ( + operators: Record | null, +): string | undefined => { + if (operators === null) { + return undefined; + } + return JSON.stringify({ operators }); +}; + +const nativeOperatorConfigJson = ( + operators: NativeOperatorConfig | null, +): string | undefined => { + if (operators === null) { + return undefined; + } + return JSON.stringify(operators); +}; + +const runCommand = ( + command: string, + args: string[], + env: Record = {}, +): string => { + const result = spawnSync(command, args, { + 
cwd: ROOT_DIR, + encoding: "utf8", + env: { ...process.env, ...env }, + }); + + if (result.status === 0) { + return result.stdout; + } + + throw new Error( + [ + `${command} ${args.join(" ")} failed with status ${result.status}`, + result.stdout, + result.stderr, + ] + .filter(Boolean) + .join("\n"), + ); +}; diff --git a/packages/anonymize/src/__test__/native-node.test.ts b/packages/anonymize/src/__test__/native-node.test.ts new file mode 100644 index 00000000..c22381cd --- /dev/null +++ b/packages/anonymize/src/__test__/native-node.test.ts @@ -0,0 +1,452 @@ +import { describe, expect, test } from "bun:test"; +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { fileURLToPath } from "node:url"; + +import type { NativeAnonymizeBinding } from "../native"; +import { + createNativePipelineFromDefaultPackage, + createNativePipelineFromPackageFile, + diagnostics_json, + getDefaultNativePipeline, + load_prepared_package, + load_prepared_package_file, + loadNativeAnonymizeBinding, + native_package_version, + normalize_for_search, + preloadDefaultNativePipeline, + preloadDefaultNativePipelineAsync, + prepare_search_package, + readDefaultNativePipelinePackageFile, + readNativePipelinePackageFile, + readNativePipelinePackageFileAsync, + redact_text, + redact_text_json, +} from "../native-node"; +import { SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS } from "../native-sdk-contract"; + +describe("native node loader", () => { + test("loads the bundled native loader", () => { + const calls: string[] = []; + const binding = fakeNativeBinding("1.5.0"); + const loaded = loadNativeAnonymizeBinding({ + expectedVersion: "1.5.0", + platform: "darwin", + arch: "arm64", + env: {}, + requireModule: (specifier) => { + calls.push(specifier); + if (specifier === "../index.cjs") { + return binding; + } + throw new Error("not found"); + }, + }); + + expect(loaded).toBe(binding); + 
expect(calls).toEqual(["../index.cjs"]); + }); + + test("loads an explicit native library path first", () => { + const calls: string[] = []; + const binding = fakeNativeBinding("1.5.0"); + const loaded = loadNativeAnonymizeBinding({ + expectedVersion: "1.5.0", + env: { STELLA_ANONYMIZE_NATIVE_LIBRARY_PATH: "/tmp/anonymize.node" }, + requireModule: (specifier) => { + calls.push(specifier); + if (specifier === "/tmp/anonymize.node") { + return { default: binding }; + } + throw new Error("not found"); + }, + }); + + expect(loaded).toBe(binding); + expect(calls).toEqual(["/tmp/anonymize.node"]); + }); + + test("accepts a napi class constructor on the native binding", () => { + const calls: string[] = []; + const binding = fakeNativeBinding("1.5.0", { + preparedSearchAsConstructor: true, + }); + const loaded = loadNativeAnonymizeBinding({ + expectedVersion: "1.5.0", + platform: "darwin", + arch: "arm64", + env: {}, + requireModule: (specifier) => { + calls.push(specifier); + if (specifier === "../index.cjs") { + return binding; + } + throw new Error("not found"); + }, + }); + + expect(loaded).toBe(binding); + }); + + test("rejects mismatched native binding versions", () => { + expect(() => + loadNativeAnonymizeBinding({ + expectedVersion: "1.5.0", + platform: "darwin", + arch: "arm64", + env: {}, + requireModule: (specifier) => { + if (specifier === "../index.cjs") { + return fakeNativeBinding("1.4.0"); + } + throw new Error("not found"); + }, + }), + ).toThrow("does not match 1.5.0"); + }); + + test("loads native pipeline package bytes from a file", () => { + const dir = mkdtempSync(join(tmpdir(), "anonymize-native-package-")); + const packagePath = join(dir, "pipeline.stlanonpkg"); + try { + writeFileSync(packagePath, Uint8Array.of(1, 2, 3, 4)); + + expect([...readNativePipelinePackageFile(packagePath)]).toEqual([ + 1, 2, 3, 4, + ]); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + test("loads native pipeline package bytes from a file 
asynchronously", async () => { + const dir = mkdtempSync(join(tmpdir(), "anonymize-native-package-")); + const packagePath = join(dir, "pipeline.stlanonpkg"); + try { + writeFileSync(packagePath, Uint8Array.of(4, 3, 2, 1)); + + expect([ + ...(await readNativePipelinePackageFileAsync(packagePath)), + ]).toEqual([4, 3, 2, 1]); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + test("creates a native pipeline from a package file", () => { + const dir = mkdtempSync(join(tmpdir(), "anonymize-native-pipeline-")); + const packagePath = join(dir, "pipeline.stlanonpkg"); + const capturedBytes: number[][] = []; + try { + writeFileSync(packagePath, Uint8Array.of(7, 8, 9)); + const binding = fakeNativeBinding("1.5.0", { + onPreparedPackageBytes: (bytes) => { + capturedBytes.push([...bytes]); + }, + }); + + const pipeline = createNativePipelineFromPackageFile({ + binding, + expectedVersion: "1.5.0", + packagePath, + }); + + expect(capturedBytes).toEqual([[7, 8, 9]]); + expect(pipeline.redactText("x").redaction.redactedText).toBe(""); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + test("creates a native pipeline from the default package path override", () => { + const dir = mkdtempSync(join(tmpdir(), "anonymize-default-pipeline-")); + const packagePath = join(dir, "native-pipeline.stlanonpkg"); + const capturedBytes: number[][] = []; + try { + writeFileSync(packagePath, Uint8Array.of(10, 11, 12)); + const binding = fakeNativeBinding("1.5.0", { + onPreparedPackageBytes: (bytes) => { + capturedBytes.push([...bytes]); + }, + }); + + const pipeline = createNativePipelineFromDefaultPackage({ + binding, + packagePath, + expectedVersion: "1.5.0", + }); + + expect(capturedBytes).toEqual([[10, 11, 12]]); + expect(pipeline.redactText("x").redaction.redactedText).toBe(""); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + test("caches the default native pipeline per binding and package path", () => { + const 
dir = mkdtempSync(join(tmpdir(), "anonymize-default-cache-")); + const packagePath = join(dir, "native-pipeline.stlanonpkg"); + const capturedBytes: number[][] = []; + try { + writeFileSync(packagePath, Uint8Array.of(13, 14, 15)); + let warmCount = 0; + const binding = fakeNativeBinding("1.5.0", { + onPreparedPackageBytes: (bytes) => { + capturedBytes.push([...bytes]); + }, + onWarmLazyRegex: () => { + warmCount += 1; + }, + }); + + const first = getDefaultNativePipeline({ + binding, + packagePath, + expectedVersion: "1.5.0", + }); + const second = getDefaultNativePipeline({ + binding, + packagePath, + expectedVersion: "1.5.0", + }); + const preloaded = preloadDefaultNativePipeline({ + binding, + packagePath, + expectedVersion: "1.5.0", + }); + + expect(second).toBe(first); + expect(preloaded).toBe(first); + expect(capturedBytes).toEqual([[13, 14, 15]]); + expect(warmCount).toBe(1); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + test("preloads the default native pipeline asynchronously", async () => { + const dir = mkdtempSync(join(tmpdir(), "anonymize-default-async-cache-")); + const packagePath = join(dir, "native-pipeline.stlanonpkg"); + const capturedBytes: number[][] = []; + try { + writeFileSync(packagePath, Uint8Array.of(16, 17, 18)); + let warmCount = 0; + const binding = fakeNativeBinding("1.5.0", { + onPreparedPackageBytes: (bytes) => { + capturedBytes.push([...bytes]); + }, + onWarmLazyRegex: () => { + warmCount += 1; + }, + }); + + const [first, second] = await Promise.all([ + preloadDefaultNativePipelineAsync({ + binding, + packagePath, + expectedVersion: "1.5.0", + }), + preloadDefaultNativePipelineAsync({ + binding, + packagePath, + expectedVersion: "1.5.0", + }), + ]); + const syncCached = getDefaultNativePipeline({ + binding, + packagePath, + expectedVersion: "1.5.0", + }); + + expect(second).toBe(first); + expect(syncCached).toBe(first); + expect(capturedBytes).toEqual([[16, 17, 18]]); + expect(warmCount).toBe(1); + } 
finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + test("loads language-scoped default native pipeline packages", () => { + const language = "zz-test"; + const packagePath = fileURLToPath( + new URL(`../../native-pipeline.${language}.stlanonpkg`, import.meta.url), + ); + const capturedBytes: number[][] = []; + try { + writeFileSync(packagePath, Uint8Array.of(31, 32, 33)); + const binding = fakeNativeBinding("1.5.0", { + onPreparedPackageBytes: (bytes) => { + capturedBytes.push([...bytes]); + }, + }); + + const first = getDefaultNativePipeline({ + binding, + language: "ZZ-Test", + expectedVersion: "1.5.0", + }); + const second = getDefaultNativePipeline({ + binding, + language, + expectedVersion: "1.5.0", + }); + + expect(second).toBe(first); + expect(capturedBytes).toEqual([[31, 32, 33]]); + } finally { + rmSync(packagePath, { force: true }); + } + }); + + test("rejects unsafe default native package language selectors", () => { + expect(() => + readDefaultNativePipelinePackageFile({ language: "../en" }), + ).toThrow("Default native pipeline language must match"); + expect(() => + getDefaultNativePipeline({ + binding: fakeNativeBinding("1.5.0"), + language: "en", + packagePath: "/tmp/native-pipeline.stlanonpkg", + }), + ).toThrow("Use either language or packagePath"); + }); + + test("shared SDK helpers delegate through the native binding", () => { + const sharedSdkFunctions: Record< + (typeof SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS)[number], + unknown + > = { + diagnostics_json, + load_prepared_package, + load_prepared_package_file, + native_package_version, + normalize_for_search, + prepare_search_package, + redact_text, + redact_text_json, + }; + for (const name of SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS) { + expect(typeof sharedSdkFunctions[name]).toBe("function"); + } + + const capturedBytes: number[][] = []; + const binding = fakeNativeBinding("1.5.0", { + compressedPackageBytes: Uint8Array.of(21, 22, 23), + onPreparedPackageBytes: (bytes) => 
{ + capturedBytes.push([...bytes]); + }, + }); + + expect(native_package_version({ binding })).toBe("1.5.0"); + expect(normalize_for_search("Číslo", { binding })).toBe("Číslo"); + + const packageBytes = prepare_search_package("{}", { binding }); + expect([...packageBytes]).toEqual([21, 22, 23]); + + const prepared = load_prepared_package(packageBytes, { binding }); + expect(capturedBytes).toEqual([[21, 22, 23]]); + expect(prepared.redact_text("x").redaction.redactedText).toBe(""); + expect(redact_text("{}", "x", undefined, { binding }).redaction).toEqual({ + entityCount: 0, + operatorMap: new Map(), + redactedText: "", + redactionMap: new Map(), + }); + const expectedJson = { + redaction: { + entity_count: 0, + operator_map: [], + redacted_text: "", + redaction_map: [], + }, + resolved_entities: [], + }; + expect(JSON.parse(prepared.redact_text_json("x"))).toEqual(expectedJson); + expect( + JSON.parse(redact_text_json("{}", "x", undefined, { binding })), + ).toEqual(expectedJson); + expect( + JSON.parse(diagnostics_json("{}", "x", undefined, { binding }) ?? 
"{}"), + ).toEqual({ + diagnostics: { events: [] }, + result: expectedJson, + }); + + const dir = mkdtempSync(join(tmpdir(), "anonymize-shared-sdk-")); + const packagePath = join(dir, "pipeline.stlanonpkg"); + try { + writeFileSync(packagePath, packageBytes); + const fromFile = load_prepared_package_file(packagePath, { binding }); + expect(fromFile.redact_text("x").redaction.redactedText).toBe(""); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +const emptyStaticRedactionBindingResult = () => ({ + resolvedEntities: [], + redaction: { + redactedText: "", + redactionMap: [], + operatorMap: [], + entityCount: 0, + }, +}); + +const emptyStaticRedactionDiagnosticJson = (): string => + JSON.stringify({ + diagnostics: { events: [] }, + result: { + redaction: { + entity_count: 0, + operator_map: [], + redacted_text: "", + redaction_map: [], + }, + resolved_entities: [], + }, + }); + +type FakeNativeBindingOptions = { + preparedSearchAsConstructor?: boolean; + compressedPackageBytes?: Uint8Array; + onPreparedPackageBytes?: (bytes: Uint8Array) => void; + onWarmLazyRegex?: () => void; +}; + +const fakeNativeBinding = ( + version: string, + options: FakeNativeBindingOptions = {}, +): NativeAnonymizeBinding => { + const preparedSearch = { + fromConfigJsonBytes: () => fakePreparedSearch(options.onWarmLazyRegex), + fromPreparedPackageBytes: (bytes: Uint8Array) => { + options.onPreparedPackageBytes?.(bytes); + return fakePreparedSearch(options.onWarmLazyRegex); + }, + }; + const NativePreparedSearch = options.preparedSearchAsConstructor + ? Object.assign(function NativePreparedSearch() {}, preparedSearch) + : preparedSearch; + + return { + normalizeForSearch: (text: string) => text, + nativePackageVersion: () => version, + NativePreparedSearch, + prepareStaticSearchPackageBytes: () => new Uint8Array(), + prepareStaticSearchCompressedPackageBytes: () => + options.compressedPackageBytes ?? 
new Uint8Array(), + }; +}; + +const fakePreparedSearch = (onWarmLazyRegex?: () => void) => ({ + prepareDiagnosticsJson: () => JSON.stringify({ events: [] }), + warmLazyRegex: () => { + onWarmLazyRegex?.(); + }, + redactStaticEntities: emptyStaticRedactionBindingResult, + redactStaticEntitiesDiagnosticsJson: emptyStaticRedactionDiagnosticJson, +}); diff --git a/packages/anonymize/src/__test__/pipeline-config.test.ts b/packages/anonymize/src/__test__/pipeline-config.test.ts index 7cb37d56..1abfd7b5 100644 --- a/packages/anonymize/src/__test__/pipeline-config.test.ts +++ b/packages/anonymize/src/__test__/pipeline-config.test.ts @@ -3,12 +3,21 @@ import { describe, expect, test } from "bun:test"; import { createPipelineContext, DEFAULT_ENTITY_LABELS, + createNativePipelineFromConfig, preparePipelineSearch, + prepareNativePipelinePackage, redactText, runPipeline, } from "../index"; import { buildUnifiedSearch } from "../build-unified-search"; -import { REGEX_META } from "../detectors/regex"; +import { + REGEX_META, + REGEX_PATTERNS, + getNativeSigningClausePatterns, + getSigningClausePatterns, +} from "../detectors/regex"; +import { applyPipelineLanguageScope } from "../language-scope"; +import type { NativeAnonymizeBinding } from "../native"; import type { Dictionaries, PipelineConfig } from "../types"; import { loadTestDictionaries } from "./load-dictionaries"; @@ -50,7 +59,80 @@ const detect = async (fullText: string, config: Partial) => context: getCtx(), }); +const createCountingNativeBinding = (version: string) => { + let compressedPrepare = 0; + let rawPrepare = 0; + let fromPackage = 0; + const binding = { + normalizeForSearch: (text: string) => text, + nativePackageVersion: () => version, + NativePreparedSearch: { + fromConfigJsonBytes: () => { + throw new Error("native package cache test should use package bytes"); + }, + fromPreparedPackageBytes: () => { + fromPackage += 1; + return { + prepareDiagnosticsJson: () => JSON.stringify({ events: [] }), + 
redactStaticEntities: (fullText: string) => ({ + resolvedEntities: [], + redaction: { + redactedText: fullText, + redactionMap: [], + operatorMap: [], + entityCount: 0, + }, + }), + }; + }, + }, + prepareStaticSearchPackageBytes: (configJson: Uint8Array) => { + rawPrepare += 1; + return new Uint8Array([rawPrepare, configJson.byteLength % 256]); + }, + prepareStaticSearchCompressedPackageBytes: (configJson: Uint8Array) => { + compressedPrepare += 1; + return new Uint8Array([compressedPrepare, configJson.byteLength % 256]); + }, + } satisfies NativeAnonymizeBinding; + + return { + binding, + counts: () => ({ + compressedPrepare, + fromPackage, + rawPrepare, + }), + }; +}; + describe("pipeline config semantics", () => { + test("content language derives dictionary scopes", () => { + expect( + applyPipelineLanguageScope({ + ...BASE_CONFIG, + language: "en-US", + }), + ).toMatchObject({ + nameCorpusLanguages: ["en"], + denyListCountries: ["US", "GB", "CA", "AU", "IE"], + }); + }); + + test("explicit dictionary scopes override content language", () => { + expect( + applyPipelineLanguageScope({ + ...BASE_CONFIG, + language: "en", + denyListCountries: ["CZ"], + nameCorpusLanguages: ["cs"], + }), + ).toMatchObject({ + nameCorpusLanguages: ["cs"], + denyListCountries: ["CZ"], + }); + }); + test("empty labels do not suppress deterministic detectors", async () => { const entities = await detect("Datum narození: 2024-01-02", { enableRegex: true, @@ -85,6 +167,429 @@ describe("pipeline config semantics", () => { expect(regexCount).toBe(expected); }); + test("native config carries final label and threshold filters", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: true, + enableConfidenceBoost: true, + labels: ["person"], + threshold: 0.93, + }, + [], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.allowed_labels).toEqual(["person"]); + expect(search.nativeStaticConfig.threshold).toBe(0.93); + 
expect(search.nativeStaticConfig.confidence_boost).toBe(true); + }); + + test("native config carries false-positive filters without deny-list matching", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableDenyList: false, + enableRegex: true, + labels: ["organization"], + }, + [], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.deny_list_data).toBeUndefined(); + expect( + search.nativeStaticConfig.false_positive_filters?.document_heading_words, + ).toContain("schedule"); + }); + + test("native config carries hotword rule metadata", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: true, + enableHotwordRules: true, + labels: ["date of birth"], + }, + [], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.allowed_labels).toEqual(["date of birth"]); + expect(search.nativeStaticConfig.slices.hotwords).toEqual({ + start: search.nativeStaticConfig.slices.hotwords?.start ?? 0, + end: search.nativeStaticConfig.slices.hotwords?.start ?? 
0, + }); + expect( + search.nativeStaticConfig.hotword_data?.rules.some((rule) => + rule.hotwords.includes("born"), + ), + ).toBe(true); + expect( + search.nativeStaticConfig.literal_patterns.some( + (pattern) => pattern.pattern === "born", + ), + ).toBe(false); + expect( + search.nativeStaticConfig.hotword_data?.pattern_rule_indices, + ).toEqual([]); + }); + + test("native signing-place patterns match TypeScript signing patterns", async () => { + const [tsPatterns, nativePatterns] = await Promise.all([ + getSigningClausePatterns(), + getNativeSigningClausePatterns(), + ]); + + expect(nativePatterns).toEqual(tsPatterns); + expect(nativePatterns.some((pattern) => pattern.includes("Signed"))).toBe( + true, + ); + expect(nativePatterns.some((pattern) => pattern.includes("À"))).toBe(true); + }); + + test("native pipeline package context cache is scoped by dictionary identity", async () => { + const { binding, counts } = createCountingNativeBinding( + "native-cache-context-dictionaries", + ); + const context = createPipelineContext(); + const cacheDictionaries = { + firstNames: { + en: ["Ada"], + }, + } satisfies Dictionaries; + const config = { + ...BASE_CONFIG, + dictionaries: cacheDictionaries, + enableCountries: false, + labels: ["person"], + }; + + await prepareNativePipelinePackage({ binding, config, context }); + await prepareNativePipelinePackage({ + binding, + config: { + ...config, + dictionaries: { ...cacheDictionaries }, + }, + context, + }); + + expect(counts().compressedPrepare).toBe(2); + }); + + test("native config carries coreference definition data", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableCoreference: true, + enableRegex: true, + labels: ["organization"], + }, + [], + createPipelineContext(), + ); + + expect( + search.nativeStaticConfig.coreference_data?.definition_patterns.length, + ).toBeGreaterThan(0); + expect( + search.nativeStaticConfig.coreference_data?.role_stop_terms, + ).toContain("seller"); + 
expect( + search.nativeStaticConfig.coreference_data?.legal_form_aliases, + ).toContain("LLC"); + expect( + search.nativeStaticConfig.coreference_data?.legal_form_aliases, + ).toContain("Kft."); + expect( + search.nativeStaticConfig.coreference_data?.organization_suffixes, + ).toContain("LLC"); + expect( + search.nativeStaticConfig.coreference_data?.organization_suffixes, + ).not.toContain("Kft."); + expect( + search.nativeStaticConfig.coreference_data?.organization_determiners, + ).toContain("the\\s+(?:company|corporation|firm)"); + }); + + test("native config carries zone classifier data", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableZoneClassification: true, + labels: ["person"], + }, + [], + createPipelineContext(), + ); + + expect( + search.nativeStaticConfig.zone_data?.section_heading_patterns.length, + ).toBeGreaterThan(0); + expect( + search.nativeStaticConfig.zone_data?.section_heading_patterns.some( + ({ pattern }) => pattern.includes("Article"), + ), + ).toBe(true); + expect( + search.nativeStaticConfig.zone_data?.signing_clauses.length, + ).toBeGreaterThan(0); + }); + + test("native trigger config carries legal suffix data without legal-form search", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableTriggerPhrases: true, + enableLegalForms: false, + labels: ["organization"], + }, + [], + createPipelineContext(), + ); + + const legalFormsSlice = search.nativeStaticConfig.slices.legal_forms; + expect(legalFormsSlice).toBeDefined(); + expect(legalFormsSlice?.end).toBe(legalFormsSlice?.start); + expect( + search.nativeStaticConfig.legal_form_data?.suffixes.length, + ).toBeGreaterThan(0); + expect( + search.nativeStaticConfig.trigger_data?.rules.length, + ).toBeGreaterThan(0); + }); + + test("native config carries stdnum validator metadata", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: true, + labels: ["national identification 
number"], + }, + [], + createPipelineContext(), + ); + + const patternIndex = search.nativeStaticConfig.regex_patterns.findIndex( + (pattern) => + pattern.kind === "regex" && pattern.pattern.includes("\\d{17}"), + ); + expect(patternIndex).toBeGreaterThanOrEqual(0); + const meta = search.nativeStaticConfig.regex_meta.at(patternIndex); + expect(meta).toMatchObject({ + label: "national identification number", + requires_validation: true, + validator_id: "cn.ric", + }); + expect( + search.nativeStaticConfig.regex_meta.filter( + (entry) => entry.requires_validation === true && !entry.validator_id, + ), + ).toEqual([]); + }); + + test("native config carries static regex prefilter metadata", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: true, + labels: ["email address"], + }, + [], + createPipelineContext(), + ); + + expect(REGEX_PATTERNS.every((pattern) => typeof pattern === "string")).toBe( + true, + ); + const emailPattern = search.nativeStaticConfig.regex_patterns.find( + (pattern) => + pattern.kind === "regex" && + pattern.pattern === "\\b[\\w.+\\-]+@[\\w\\-]+(?:\\.[\\w\\-]+)+\\b", + ); + + expect(emailPattern).toMatchObject({ + lazy: true, + prefilter_any: ["@"], + prefilter_case_insensitive: false, + }); + }); + + test("native trigger config carries currency terms and monetary extension data", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: false, + enableTriggerPhrases: true, + labels: [], + }, + [], + createPipelineContext(), + ); + + expect( + search.nativeStaticConfig.trigger_data?.sentence_terminal_currency_terms + .length, + ).toBeGreaterThan(0); + expect(search.nativeStaticConfig.monetary_data).toBeDefined(); + }); + + test("native date data gates year words on trigger phrases", async () => { + const regexOnly = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: true, + enableTriggerPhrases: false, + labels: ["date"], + }, + [], + 
createPipelineContext(), + ); + const withTriggers = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableRegex: true, + enableTriggerPhrases: true, + labels: ["date"], + }, + [], + createPipelineContext(), + ); + + expect( + Object.values( + regexOnly.nativeStaticConfig.date_data?.year_words_by_language ?? {}, + ).flat(), + ).toEqual([]); + expect( + Object.values( + withTriggers.nativeStaticConfig.date_data?.year_words_by_language ?? {}, + ).flat().length, + ).toBeGreaterThan(0); + }); + + test("content language scopes deny-list search build", async () => { + const testDictionaries = await getDictionaries(); + const config = { + ...BASE_CONFIG, + dictionaries: testDictionaries, + enableDenyList: true, + enableNameCorpus: true, + labels: ["address", "person"], + }; + + const unscoped = await buildUnifiedSearch( + config, + [], + createPipelineContext(), + ); + const scoped = await buildUnifiedSearch( + { ...config, language: "en" }, + [], + createPipelineContext(), + ); + + expect( + scoped.slices.denyList.end - scoped.slices.denyList.start, + ).toBeLessThan( + unscoped.slices.denyList.end - unscoped.slices.denyList.start, + ); + }); + + test("native config keeps alphanumeric custom deny-list overlays compact", async () => { + const testDictionaries = await getDictionaries(); + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + dictionaries: testDictionaries, + enableDenyList: true, + customDenyList: [ + { + value: "Widget X", + label: "organization", + }, + ], + labels: ["organization"], + }, + [], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.literal_patterns_from_deny_list_data).toBe( + true, + ); + expect(search.nativeStaticConfig.literal_patterns).toHaveLength(0); + expect(search.nativeStaticConfig.deny_list_data?.originals).toContain( + "Widget X", + ); + expect( + search.nativeStaticConfig.deny_list_data?.originals.length ?? 
0, + ).toBeGreaterThan(1); + }); + + test("native config inlines punctuation-edged custom deny-list overlays", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableDenyList: true, + customDenyList: [ + { + value: ".env", + label: "file", + }, + ], + labels: ["file"], + }, + [], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.literal_patterns_from_deny_list_data).toBe( + false, + ); + expect(search.nativeStaticConfig.literal_patterns).toEqual([ + expect.objectContaining({ + kind: "literal-with-options", + pattern: ".env", + whole_words: false, + }), + ]); + }); + + test("native config serializes gazetteer metadata with Rust field names", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableGazetteer: true, + labels: ["organization"], + }, + [ + { + id: "gazetteer-acme", + canonical: "Acme", + label: "organization", + variants: [], + workspaceId: "test", + createdAt: 0, + source: "manual", + }, + ], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.gazetteer_data).toEqual({ + labels: ["organization", "organization"], + is_fuzzy: [false, true], + }); + expect(search.nativeStaticConfig.literal_options.fuzzy_whole_words).toBe( + false, + ); + expect( + Object.hasOwn(search.nativeStaticConfig.gazetteer_data ?? 
{}, "isFuzzy"), + ).toBe(false); + }); + test("preparePipelineSearch reuses the context search cache", async () => { const context = createPipelineContext(); const config = { @@ -149,6 +654,254 @@ describe("pipeline config semantics", () => { expect(second).not.toBe(first); }); + test("preparePipelineSearch cache keys native redaction options", async () => { + const context = createPipelineContext(); + const baseConfig = { + ...BASE_CONFIG, + enableRegex: true, + labels: ["date of birth"], + }; + + const first = await preparePipelineSearch({ + config: { + ...baseConfig, + threshold: 0.5, + enableConfidenceBoost: false, + enableHotwordRules: false, + }, + context, + }); + const second = await preparePipelineSearch({ + config: { + ...baseConfig, + threshold: 0.93, + enableConfidenceBoost: true, + enableHotwordRules: true, + }, + context, + }); + + expect(second).not.toBe(first); + expect(second.nativeStaticConfig.threshold).toBe(0.93); + expect(second.nativeStaticConfig.confidence_boost).toBe(true); + expect( + second.nativeStaticConfig.hotword_data?.rules.length, + ).toBeGreaterThan(0); + }); + + test("native pipeline package cache reuses exact configs", async () => { + const { binding, counts } = createCountingNativeBinding( + "native-cache-context", + ); + const context = createPipelineContext(); + const config = { + ...BASE_CONFIG, + enableCountries: false, + labels: ["person"], + }; + + const first = await prepareNativePipelinePackage({ + binding, + config, + context, + }); + first[0] = 99; + const second = await prepareNativePipelinePackage({ + binding, + config, + context, + }); + await createNativePipelineFromConfig({ binding, config, context }); + + expect(counts().compressedPrepare).toBe(1); + expect(second[0]).toBe(1); + }); + + test("native pipeline package cache is scoped by dictionary identity", async () => { + const { binding, counts } = createCountingNativeBinding( + "native-cache-dictionaries", + ); + const cacheDictionaries = { + firstNames: { + en: 
["Ada"], + }, + } satisfies Dictionaries; + const config = { + ...BASE_CONFIG, + dictionaries: cacheDictionaries, + enableCountries: false, + labels: ["person"], + }; + + await prepareNativePipelinePackage({ + binding, + config, + context: createPipelineContext(), + }); + await prepareNativePipelinePackage({ + binding, + config, + context: createPipelineContext(), + }); + await prepareNativePipelinePackage({ + binding, + config: { + ...config, + dictionaries: { ...cacheDictionaries }, + }, + context: createPipelineContext(), + }); + + expect(counts().compressedPrepare).toBe(2); + }); + + test("native pipeline package cache keys caller data", async () => { + const { binding, counts } = createCountingNativeBinding( + "native-cache-caller-data", + ); + const context = createPipelineContext(); + const config = { + ...BASE_CONFIG, + customRegexes: [ + { + label: "matter id", + pattern: "MAT-[0-9]+", + }, + ], + enableCountries: false, + enableRegex: true, + labels: ["matter id"], + }; + + await prepareNativePipelinePackage({ binding, config, context }); + await prepareNativePipelinePackage({ + binding, + config: { + ...config, + customRegexes: [ + { + label: "matter id", + pattern: "REF-[0-9]+", + }, + ], + }, + context, + }); + + expect(counts().compressedPrepare).toBe(2); + }); + + test("native pipeline package cache keys contextual native passes", async () => { + const { binding, counts } = createCountingNativeBinding( + "native-cache-contextual-passes", + ); + const context = createPipelineContext(); + const config = { + ...BASE_CONFIG, + enableCountries: false, + enableCoreference: false, + enableZoneClassification: false, + labels: ["organization"], + }; + + await prepareNativePipelinePackage({ binding, config, context }); + await prepareNativePipelinePackage({ + binding, + config: { + ...config, + enableCoreference: true, + }, + context, + }); + await prepareNativePipelinePackage({ + binding, + config: { + ...config, + enableZoneClassification: true, + }, + 
context, + }); + + expect(counts().compressedPrepare).toBe(3); + }); + + test("native pipeline package cache retries after failed build", async () => { + let attempts = 0; + const binding = { + normalizeForSearch: (text: string) => text, + nativePackageVersion: () => "native-cache-retry", + NativePreparedSearch: { + fromConfigJsonBytes: () => { + throw new Error( + "native package cache retry should use package bytes", + ); + }, + fromPreparedPackageBytes: () => ({ + prepareDiagnosticsJson: () => JSON.stringify({ events: [] }), + redactStaticEntities: (fullText: string) => ({ + resolvedEntities: [], + redaction: { + redactedText: fullText, + redactionMap: [], + operatorMap: [], + entityCount: 0, + }, + }), + }), + }, + prepareStaticSearchPackageBytes: () => new Uint8Array([9]), + prepareStaticSearchCompressedPackageBytes: () => { + attempts += 1; + if (attempts === 1) { + throw new Error("build failed"); + } + return new Uint8Array([attempts]); + }, + } satisfies NativeAnonymizeBinding; + const context = createPipelineContext(); + const config = { + ...BASE_CONFIG, + enableCountries: false, + labels: ["person"], + }; + + try { + await prepareNativePipelinePackage({ binding, config, context }); + throw new Error("expected first native package build to fail"); + } catch (error) { + expect(error).toBeInstanceOf(Error); + const message = error instanceof Error ? 
error.message : ""; + expect(message).toBe("build failed"); + } + + const retry = await prepareNativePipelinePackage({ + binding, + config, + context, + }); + + expect([...retry]).toEqual([2]); + expect(attempts).toBe(2); + }); + + test("native trigger configs carry monetary extension data", async () => { + const search = await buildUnifiedSearch( + { + ...BASE_CONFIG, + enableTriggerPhrases: true, + labels: ["monetary amount"], + }, + [], + createPipelineContext(), + ); + + expect(search.nativeStaticConfig.monetary_data).toBeDefined(); + expect( + search.nativeStaticConfig.monetary_data?.amount_words + .written_amount_patterns.length, + ).toBeGreaterThan(0); + }); + test("enableLegalForms flag gates legal-form detection", async () => { const withFlag = await detect("Acme s.r.o.", { enableLegalForms: true, diff --git a/packages/anonymize/src/build-unified-search.ts b/packages/anonymize/src/build-unified-search.ts index 82d5a31c..aabf1aff 100644 --- a/packages/anonymize/src/build-unified-search.ts +++ b/packages/anonymize/src/build-unified-search.ts @@ -18,36 +18,84 @@ */ import type { PatternEntry, TextSearch } from "@stll/text-search"; +import legalFormRuleWords from "./data/legal-form-rule-words.json"; +import nameCorpusCjk from "./data/name-corpus-cjk.json"; +import nameCorpusParticles from "./data/name-corpus-particles.json"; +import organizationIndicators from "./data/organization-indicators.json"; import { getTextSearch } from "./search-engine"; import { isLegalFormsEnabled, + type CustomRegexPattern, type GazetteerEntry, type PipelineConfig, } from "./types"; +import { applyPipelineLanguageScope } from "./language-scope"; import type { RegexMeta } from "./detectors/regex"; import type { TriggerRule } from "./types"; -import type { DenyListData } from "./detectors/deny-list"; +import type { DenyListData, DenyListFilterData } from "./detectors/deny-list"; import type { PipelineContext } from "./context"; import { defaultContext } from "./context"; +import { 
POST_NOMINALS } from "./config/titles"; +import { LEGAL_SUFFIXES } from "./config/legal-forms"; +import { loadLanguageConfigs } from "./util/lang-loader"; import { - REGEX_PATTERNS, + REGEX_PATTERN_ENTRIES, REGEX_META, + NATIVE_REGEX_VALIDATOR_IDS, getCurrencyPatternEntries, CURRENCY_PATTERN_META, + getDateMonthData, getDatePatterns, + getYearWordData, + getMonetaryData, DATE_PATTERN_META, getSigningClausePatterns, + getNativeSigningClausePatterns, SIGNING_CLAUSE_META, + type DateMonthData, + type YearWordData, + type MonetaryData, } from "./detectors/regex"; -import { buildTriggerPatterns } from "./detectors/triggers"; -import { buildDenyList } from "./detectors/deny-list"; -import { buildStreetTypePatterns } from "./detectors/address-seeds"; +import { + buildTriggerPatterns, + getAddressStopKeywordsSync, +} from "./detectors/triggers"; +import { + buildDenyList, + buildDenyListFilterData, + ensureDenyListData, +} from "./detectors/deny-list"; +import { + buildStreetTypePatterns, + getAddressSeedData, + type AddressSeedData, +} from "./detectors/address-seeds"; import { buildGazetteerPatterns } from "./detectors/gazetteer"; import { buildCountryPatterns, type CountryData } from "./detectors/countries"; -import { expandLabelsForHotwordRules } from "./filters/hotword-rules"; +import { + expandLabelsForHotwordRuleSet, + loadHotwordRuleSet, + type HotwordRule, +} from "./filters/hotword-rules"; +import { + getAddressContextData, + type AddressContextData, +} from "./filters/confidence-boost"; +import { + getClauseNounHeadsSync, + getConnectorProseHeadsSync, + getKnownLegalSuffixes, + getLeadingClauseTrimsSync, + getLegalRoleHeadsSync, + getNormalizedInNameLegalFormWordsSync, + getNormalizedLegalBoundarySuffixesSync, + getSentenceVerbIndicatorsSync, + getStructuralSingleCapPrefixesSync, + warmLegalRoleHeads, +} from "./detectors/legal-forms"; const DEFAULT_CUSTOM_REGEX_SCORE = 0.9; const ALNUM_RE = /[\p{L}\p{N}]/u; @@ -57,6 +105,247 @@ type PatternSlice = { end: 
number; }; +type NativeSearchPatternKind = + | "literal" + | "literal-with-options" + | "regex" + | "fuzzy"; + +export type NativeSearchPattern = { + kind: NativeSearchPatternKind; + pattern: string; + distance?: number; + case_insensitive?: boolean; + whole_words?: boolean; + lazy?: boolean; + prefilter_any?: string[]; + prefilter_case_insensitive?: boolean; + prefilter_regex?: string; +}; + +export type NativeSearchOptions = { + literal_case_insensitive?: boolean; + literal_whole_words?: boolean; + regex_whole_words?: boolean; + regex_overlap_all?: boolean; + fuzzy_case_insensitive?: boolean; + fuzzy_whole_words?: boolean; + fuzzy_normalize_diacritics?: boolean; +}; + +export type NativeRegexMatchMeta = { + label: string; + score: number; + source_detail?: string; + requires_validation?: boolean; + validator_id?: string; + validator_input?: string; + min_byte_length?: number; +}; + +export type NativeDenyListFilterData = { + stopwords: string[]; + allow_list: string[]; + person_stopwords: string[]; + person_trailing_nouns: string[]; + address_stopwords: string[]; + address_jurisdiction_prefixes: string[]; + street_types: string[]; + address_component_terms: string[]; + ambiguous_street_type_terms: string[]; + first_names: string[]; + generic_roles: string[]; + number_abbrev_prefixes: string[]; + sentence_starters: string[]; + trailing_address_word_exclusions: string[]; + document_heading_words: string[]; + document_heading_ordinal_markers: string[]; + defined_term_cues: string[]; + signing_place_guards: NativeSigningPlaceGuardData[]; +}; + +export type NativeSigningPlaceGuardData = { + prefix_phrases: string[]; + suffix_phrases: string[]; +}; + +export type NativeDenyListMatchData = { + labels?: string[][]; + label_table?: string[]; + label_indices?: number[][]; + custom_labels?: string[][]; + custom_label_indices?: number[][]; + originals: string[]; + sources?: string[][]; + source_table?: string[]; + source_indices?: number[][]; + filters?: 
NativeDenyListFilterData; +}; + +export type NativeTriggerStrategy = + | { type: "to-next-comma"; stop_words?: string[]; max_length?: number } + | { type: "to-end-of-line" } + | { type: "n-words"; count: number } + | { type: "company-id-value" } + | { type: "address"; max_chars?: number } + | { type: "match-pattern"; pattern: string; flags?: string }; + +export type NativeTriggerValidation = + | { type: "starts-uppercase" } + | { type: "min-length"; min: number } + | { type: "max-length"; max: number } + | { type: "no-digits" } + | { type: "has-digits" } + | { type: "matches-pattern"; pattern: string; flags?: string } + | { type: "valid-id"; validator: string }; + +export type NativeTriggerRule = { + trigger: string; + label: string; + strategy: NativeTriggerStrategy; + validations: NativeTriggerValidation[]; + include_trigger: boolean; +}; + +export type NativeTriggerData = { + rules: NativeTriggerRule[]; + address_stop_keywords: string[]; + party_position_terms: string[]; + post_nominals: string[]; + sentence_terminal_currency_terms: string[]; +}; + +export type NativeLegalFormData = { + suffixes: string[]; + normalized_boundary_suffixes: string[]; + normalized_in_name_words: string[]; + normalized_suffix_words: string[]; + role_heads: string[]; + sentence_verb_indicators: string[]; + clause_noun_heads: string[]; + connector_prose_heads: string[]; + structural_single_cap_prefixes: string[]; + leading_clause_phrases: string[]; + leading_clause_direct_prefixes: string[]; + connector_words: string[]; + and_connector_words: string[]; + in_name_prepositions: string[]; + company_suffix_words: string[]; + comma_gated_direct_prefixes: string[]; +}; + +export type NativeDateData = { + month_names_by_language: DateMonthData; + year_words_by_language: YearWordData; +}; + +export type NativeMonetaryData = MonetaryData; +export type NativeAddressSeedData = AddressSeedData; +export type NativeAddressContextData = AddressContextData; +export type NativeCoreferencePatternData = 
{ + pattern: string; + flags: string; +}; +export type NativeCoreferenceData = { + definition_patterns: NativeCoreferencePatternData[]; + role_stop_terms: string[]; + legal_form_aliases: string[]; + organization_suffixes: string[]; + organization_determiners: string[]; +}; +export type NativeNameCorpusData = { + first_names: string[]; + surnames: string[]; + title_tokens: string[]; + title_abbreviations: string[]; + excluded_words: string[]; + common_words: string[]; + non_western_names: string[]; + excluded_all_caps: string[]; + ja_suffixes: string[]; + arabic_connectors: string[]; + relation_connectors: string[]; + hyphenated_prefixes: string[]; + cjk_non_person_terms: string[]; + cjk_surname_starters: string[]; + organization_terms: string[]; +}; +export type NativeZonePatternData = { + pattern: string; + flags: string; +}; +export type NativeZoneSigningClauseData = { + prefix: string; + suffix: string; + prepositions: string[]; +}; +export type NativeZoneData = { + section_heading_patterns: NativeZonePatternData[]; + signing_clauses: NativeZoneSigningClauseData[]; +}; +type GenericRolesData = { + roles: string[]; +}; +type CoreferenceDeterminersData = Record; +export type NativeGazetteerData = { + labels: string[]; + is_fuzzy: boolean[]; +}; + +export type NativeHotwordRule = { + hotwords: string[]; + target_labels: string[]; + score_adjustment: number; + reclassify_to?: string; + proximity_before: number; + proximity_after: number; +}; + +export type NativeHotwordRuleData = { + rules: NativeHotwordRule[]; + pattern_rule_indices: number[]; +}; + +export type NativePreparedSearchConfig = { + regex_patterns: NativeSearchPattern[]; + custom_regex_patterns: NativeSearchPattern[]; + literal_patterns: NativeSearchPattern[]; + regex_options: NativeSearchOptions; + custom_regex_options: NativeSearchOptions; + literal_options: NativeSearchOptions; + literal_patterns_from_deny_list_data?: boolean; + allowed_labels: string[]; + threshold: number; + confidence_boost: 
boolean; + slices: { + regex: PatternSlice; + custom_regex: PatternSlice; + legal_forms?: PatternSlice; + triggers?: PatternSlice; + deny_list: PatternSlice; + street_types?: PatternSlice; + gazetteer: PatternSlice; + countries: PatternSlice; + hotwords?: PatternSlice; + }; + regex_meta: NativeRegexMatchMeta[]; + custom_regex_meta: NativeRegexMatchMeta[]; + deny_list_data?: NativeDenyListMatchData; + false_positive_filters?: NativeDenyListFilterData; + gazetteer_data?: NativeGazetteerData; + country_data?: CountryData; + hotword_data?: NativeHotwordRuleData; + trigger_data?: NativeTriggerData; + legal_form_data?: NativeLegalFormData; + address_seed_data?: NativeAddressSeedData; + zone_data?: NativeZoneData; + address_context_data?: NativeAddressContextData; + coreference_data?: NativeCoreferenceData; + name_corpus_data?: NativeNameCorpusData; + date_data?: NativeDateData; + monetary_data?: NativeMonetaryData; +}; + const createAllowedLabelSet = ( labels: readonly string[], ): ReadonlySet | null => (labels.length > 0 ? new Set(labels) : null); @@ -66,6 +355,9 @@ const labelIsAllowed = ( allowedLabels: ReadonlySet | null, ): boolean => allowedLabels === null || allowedLabels.has(label); +const sliceContains = (slice: PatternSlice, index: number): boolean => + index >= slice.start && index < slice.end; + export type GazetteerData = { /** Maps local pattern index to entry label. 
*/ labels: string[]; @@ -99,19 +391,237 @@ export type UnifiedSearchInstance = { denyListData: DenyListData | null; gazetteerData: GazetteerData | null; countryData: CountryData | null; + nativeStaticConfig: NativePreparedSearchConfig; }; -export const buildUnifiedSearch = async ( +type GazetteerPatternResult = { + patterns: PatternEntry[]; + data: GazetteerData; +}; + +type CountryPatternResult = { + patterns: PatternEntry[]; + data: CountryData; +}; + +type CoreferenceConfigRow = { + pattern: string; + flags: string; +}; + +type NameCorpusCjkLanguageData = { + nonPersonTerms: string[]; + surnameStarters: string[]; +}; + +type NameCorpusCjkData = Record< + string, + NameCorpusCjkLanguageData | string | undefined +>; + +type NameCorpusParticleLanguageData = { + connectors?: string[]; + relationConnectors?: string[]; + suffixes?: string[]; + hyphenatedPrefixes?: string[]; +}; + +type NameCorpusParticleData = Record< + string, + NameCorpusParticleLanguageData | string | undefined +>; + +type OrganizationIndicatorData = Record; + +type SectionHeadingsConfig = { + patterns: Array<{ re: string; flags: string }>; +}; + +type SigningClauseConfig = { + patterns: Array<{ + prefix?: string; + suffix?: string; + prepositions?: string[]; + }>; +}; + +type UnifiedSearchSources = { + allRegex: PatternEntry[]; + regexMeta: RegexMeta[]; + customRegexes: CustomRegexPattern[]; + customRegexMeta: RegexMeta[]; + legalForms: readonly string[]; + triggers: { + patterns: string[]; + rules: TriggerRule[]; + }; + denyListData: DenyListData | null; + falsePositiveFilters: DenyListFilterData; + streetTypes: string[]; + gazResult: GazetteerPatternResult | null; + countryResult: CountryPatternResult | null; + nativeLegalFormPatterns: string[]; + nativeLegalFormData: NativeLegalFormData | null; + nativeDateData: NativeDateData | null; + nativeMonetaryData: NativeMonetaryData | null; + nativeSentenceTerminalCurrencyTerms: string[]; + nativeAddressSeedData: NativeAddressSeedData | null; + 
nativeZoneData: NativeZoneData | null; + nativeAddressContextData: NativeAddressContextData | null; + nativeCoreferenceData: NativeCoreferenceData | null; + nativeNameCorpusData: NativeNameCorpusData | null; + nativeSigningPatterns: readonly string[]; + partyPositionTerms: string[]; + hotwordRules: readonly HotwordRule[]; + nativeCurrencyPatternRange: PatternSlice; + nativeDatePatternRange: PatternSlice; + nativeSigningPatternRange: PatternSlice; + nativeAllowedLabels: readonly string[]; + threshold: number; + confidenceBoost: boolean; + slices: UnifiedSearchInstance["slices"]; + literalAllPatterns: PatternEntry[] | string[]; + canUseGlobalWholeWordLiterals: boolean; + customDenyListNeedsWholeWords: (pattern: string) => boolean; +}; + +export type NativeStaticSearchBundle = { + nativeStaticConfig: NativePreparedSearchConfig; + slices: UnifiedSearchInstance["slices"]; + regexMeta: readonly RegexMeta[]; + customRegexMeta: readonly RegexMeta[]; + denyListData: DenyListData | null; + falsePositiveFilters: DenyListFilterData; +}; + +// eslint-disable-next-line no-unsafe-type-assertion -- JSON config module shape. +const NAME_CORPUS_CJK = nameCorpusCjk as NameCorpusCjkData; +// eslint-disable-next-line no-unsafe-type-assertion -- JSON config module shape. +const NAME_CORPUS_PARTICLES = nameCorpusParticles as NameCorpusParticleData; +// eslint-disable-next-line no-unsafe-type-assertion -- JSON config module shape. 
+const ORGANIZATION_INDICATORS = + organizationIndicators as OrganizationIndicatorData; + +const CJK_LANGUAGE_ALIASES: Record = { + zh: ["zh", "zh-latn", "zh-hans", "zh-hant"], + ja: ["ja", "ja-latn"], + ko: ["ko", "ko-latn"], +}; + +const buildNativeNameCorpusData = ( + config: PipelineConfig, + ctx: PipelineContext, +): NativeNameCorpusData | null => { + if (!config.enableNameCorpus || !config.enableDenyList || !ctx.nameCorpus) { + return null; + } + + const languages = config.nameCorpusLanguages?.map((language) => + language.toLowerCase(), + ); + const cjkNonPersonTerms: string[] = []; + const cjkSurnameStarters: string[] = []; + for (const [language, value] of Object.entries(NAME_CORPUS_CJK)) { + if (!isNameCorpusCjkLanguageData(value)) continue; + if (!languageIsSelected(language, languages, CJK_LANGUAGE_ALIASES)) { + continue; + } + cjkNonPersonTerms.push(...value.nonPersonTerms); + cjkSurnameStarters.push(...value.surnameStarters); + } + + const jaSuffixes: string[] = []; + const arabicConnectors: string[] = []; + const relationConnectors: string[] = []; + const hyphenatedPrefixes: string[] = []; + for (const [language, value] of Object.entries(NAME_CORPUS_PARTICLES)) { + if (!isNameCorpusParticleLanguageData(value)) continue; + if (!languageIsSelected(language, languages)) continue; + jaSuffixes.push(...(value.suffixes ?? [])); + arabicConnectors.push(...(value.connectors ?? [])); + relationConnectors.push(...(value.relationConnectors ?? [])); + hyphenatedPrefixes.push(...(value.hyphenatedPrefixes ?? 
[])); + } + + const organizationTerms: string[] = []; + for (const value of Object.values(ORGANIZATION_INDICATORS)) { + if (Array.isArray(value)) { + organizationTerms.push(...value); + } + } + + return { + first_names: [...ctx.nameCorpus.firstNamesList], + surnames: [...ctx.nameCorpus.surnamesList], + title_tokens: [...ctx.nameCorpus.titlesList], + title_abbreviations: [...ctx.nameCorpus.titleAbbreviations], + excluded_words: [...ctx.nameCorpus.excludedList], + common_words: [...ctx.nameCorpus.commonWords], + non_western_names: [...ctx.nameCorpus.nonWesternNamesList], + excluded_all_caps: [...ctx.nameCorpus.excludedAllCapsList], + ja_suffixes: uniqueStrings(jaSuffixes), + arabic_connectors: uniqueStrings(arabicConnectors), + relation_connectors: uniqueStrings(relationConnectors), + hyphenated_prefixes: uniqueStrings(hyphenatedPrefixes), + cjk_non_person_terms: uniqueStrings(cjkNonPersonTerms), + cjk_surname_starters: uniqueStrings(cjkSurnameStarters), + organization_terms: uniqueStrings(organizationTerms), + }; +}; + +const isNameCorpusCjkLanguageData = ( + value: NameCorpusCjkData[string], +): value is NameCorpusCjkLanguageData => + typeof value === "object" && + value !== null && + Array.isArray(value.nonPersonTerms) && + Array.isArray(value.surnameStarters); + +const isNameCorpusParticleLanguageData = ( + value: NameCorpusParticleData[string], +): value is NameCorpusParticleLanguageData => + typeof value === "object" && value !== null; + +const languageIsSelected = ( + language: string, + selectedLanguages: readonly string[] | undefined, + aliases: Record = {}, +): boolean => { + if (selectedLanguages === undefined) { + return true; + } + const normalized = language.toLowerCase(); + const accepted = aliases[normalized] ?? 
[normalized]; + return accepted.some((entry) => selectedLanguages.includes(entry)); +}; + +const uniqueStrings = (values: readonly string[]): string[] => { + const seen = new Set(); + const result: string[] = []; + for (const value of values) { + if (seen.has(value)) continue; + seen.add(value); + result.push(value); + } + return result; +}; + +const buildUnifiedSearchSources = async ( config: PipelineConfig, gazetteerEntries: GazetteerEntry[] = [], ctx: PipelineContext = defaultContext, -): Promise => { +): Promise => { + config = applyPipelineLanguageScope(config); const legalFormsEnabled = isLegalFormsEnabled(config); + const hotwordRules = + config.enableHotwordRules === true ? await loadHotwordRuleSet() : []; const searchLabels = config.enableHotwordRules === true - ? expandLabelsForHotwordRules(config.labels) + ? expandLabelsForHotwordRuleSet(config.labels, hotwordRules) : config.labels; const allowedLabels = createAllowedLabelSet(searchLabels); + const regexMonetaryEnabled = + config.enableRegex && labelIsAllowed("monetary amount", allowedLabels); const customRegexes = config.enableRegex ? (config.customRegexes ?? []).filter((entry) => labelIsAllowed(entry.label, allowedLabels), @@ -123,13 +633,26 @@ export const buildUnifiedSearch = async ( // still gates whether the v2 detector runs in the pipeline, but // its pattern slice is always empty. const [ + _legalFormWarmup, triggers, denyListData, + falsePositiveFilters, streetTypes, currencyPatterns, datePatterns, signingPatterns, + nativeSigningPatterns, + dateMonthData, + yearWordData, + monetaryData, + addressSeedData, + zoneData, + addressContextData, + coreferenceData, ] = await Promise.all([ + legalFormsEnabled || config.enableTriggerPhrases || config.enableCoreference + ? warmLegalRoleHeads() + : Promise.resolve(), config.enableTriggerPhrases ? buildTriggerPatterns() : Promise.resolve({ @@ -137,8 +660,16 @@ export const buildUnifiedSearch = async ( rules: [] as TriggerRule[], }), config.enableDenyList ? 
buildDenyList(config, ctx) : Promise.resolve(null), + (async () => { + await ensureDenyListData( + ctx, + config.dictionaries, + config.nameCorpusLanguages, + ); + return buildDenyListFilterData(ctx); + })(), buildStreetTypePatterns(), - config.enableRegex && labelIsAllowed("monetary amount", allowedLabels) + regexMonetaryEnabled ? getCurrencyPatternEntries() : Promise.resolve([] as PatternEntry[]), config.enableRegex && labelIsAllowed("date", allowedLabels) @@ -147,6 +678,30 @@ export const buildUnifiedSearch = async ( config.enableRegex && labelIsAllowed("address", allowedLabels) ? getSigningClausePatterns() : Promise.resolve([] as string[]), + config.enableRegex && labelIsAllowed("address", allowedLabels) + ? getNativeSigningClausePatterns() + : Promise.resolve([] as string[]), + config.enableRegex && labelIsAllowed("date", allowedLabels) + ? getDateMonthData() + : Promise.resolve(null), + config.enableRegex && labelIsAllowed("date", allowedLabels) + ? getYearWordData() + : Promise.resolve(null), + config.enableTriggerPhrases || regexMonetaryEnabled + ? getMonetaryData() + : Promise.resolve(null), + labelIsAllowed("address", allowedLabels) + ? getAddressSeedData() + : Promise.resolve(null), + config.enableZoneClassification + ? buildNativeZoneData() + : Promise.resolve(null), + labelIsAllowed("address", allowedLabels) + ? Promise.resolve(getAddressContextData()) + : Promise.resolve(null), + config.enableCoreference + ? buildNativeCoreferenceData() + : Promise.resolve(null), ]); // Read but never populated: the legal-form slice in the unified // search is permanently empty after the v2 rewrite. Tracking it @@ -155,6 +710,51 @@ export const buildUnifiedSearch = async ( // that hasn't migrated to v2-aware indexing yet. const legalForms: readonly string[] = []; void legalFormsEnabled; + const nativeLegalFormPatterns = legalFormsEnabled + ? 
[...getKnownLegalSuffixes()] + : []; + const nativeLegalFormSuffixes = + legalFormsEnabled || config.enableTriggerPhrases || config.enableCoreference + ? [...getKnownLegalSuffixes()] + : []; + const nativeOrganizationSuffixes = config.enableCoreference + ? [...LEGAL_SUFFIXES] + : []; + const nativeLegalFormData = + nativeLegalFormSuffixes.length > 0 + ? { + suffixes: nativeLegalFormSuffixes, + normalized_boundary_suffixes: [ + ...getNormalizedLegalBoundarySuffixesSync(), + ], + normalized_in_name_words: [ + ...getNormalizedInNameLegalFormWordsSync(), + ], + normalized_suffix_words: nativeLegalFormSuffixes + .map((suffix) => suffix.replaceAll(/[.,\s]/g, "").toLowerCase()) + .filter((suffix) => suffix.length > 0), + role_heads: [...getLegalRoleHeadsSync()], + sentence_verb_indicators: [...getSentenceVerbIndicatorsSync()], + clause_noun_heads: [...getClauseNounHeadsSync()], + connector_prose_heads: [...getConnectorProseHeadsSync()], + structural_single_cap_prefixes: [ + ...getStructuralSingleCapPrefixesSync(), + ], + leading_clause_phrases: [...getLeadingClauseTrimsSync().phrases], + leading_clause_direct_prefixes: [ + ...getLeadingClauseTrimsSync().directPrefixes, + ], + connector_words: legalFormRuleWords.connectorWords, + and_connector_words: legalFormRuleWords.andConnectorWords, + in_name_prepositions: legalFormRuleWords.inNamePrepositions, + company_suffix_words: legalFormRuleWords.companySuffixWords, + comma_gated_direct_prefixes: + legalFormRuleWords.commaGatedDirectPrefixes, + } + : null; + const partyPositionTerms = config.enableTriggerPhrases + ? 
[...getLegalRoleHeadsSync()] + : []; // ── Instance 1: regex + triggers + legal-forms ── // Trigger patterns are lowercased strings with @@ -169,7 +769,7 @@ export const buildUnifiedSearch = async ( const allRegex: PatternEntry[] = []; const regexMeta: RegexMeta[] = []; if (config.enableRegex) { - for (const [index, pattern] of REGEX_PATTERNS.entries()) { + for (const [index, pattern] of REGEX_PATTERN_ENTRIES.entries()) { const meta = REGEX_META[index]; if (!meta || !labelIsAllowed(meta.label, allowedLabels)) { continue; @@ -178,14 +778,26 @@ export const buildUnifiedSearch = async ( regexMeta.push(meta); } } + const nativeCurrencyPatternRange = { + start: allRegex.length, + end: allRegex.length + currencyPatterns.length, + }; for (const pattern of currencyPatterns) { allRegex.push(pattern); regexMeta.push(CURRENCY_PATTERN_META); } + const nativeDatePatternRange = { + start: allRegex.length, + end: allRegex.length + datePatterns.length, + }; for (const pattern of datePatterns) { allRegex.push(pattern); regexMeta.push(DATE_PATTERN_META); } + const nativeSigningPatternRange = { + start: allRegex.length, + end: allRegex.length + signingPatterns.length, + }; for (const pattern of signingPatterns) { allRegex.push(pattern); regexMeta.push(SIGNING_CLAUSE_META); @@ -195,6 +807,19 @@ export const buildUnifiedSearch = async ( score: entry.score ?? DEFAULT_CUSTOM_REGEX_SCORE, sourceDetail: "custom-regex" as const, })); + const nativeDateData = + dateMonthData === null + ? null + : { + month_names_by_language: dateMonthData, + year_words_by_language: + config.enableTriggerPhrases === true ? (yearWordData ?? {}) : {}, + }; + const nativeMonetaryData = + config.enableTriggerPhrases || regexMonetaryEnabled ? 
monetaryData : null; + const nativeSentenceTerminalCurrencyTerms = + sentenceTerminalCurrencyTerms(monetaryData); + const nativeNameCorpusData = buildNativeNameCorpusData(config, ctx); let offset = 0; @@ -220,28 +845,6 @@ export const buildUnifiedSearch = async ( end: offset + triggers.patterns.length, }; - // Trigger patterns need caseInsensitive on AC - // (only ~120 objects, not 200K). Regex/legal-form - // patterns are bare strings (auto-classified). - const triggerEntries = triggers.patterns.map((p) => ({ - pattern: p, - literal: true as const, - caseInsensitive: true, - })); - - const regexAllPatterns = [...allRegex, ...legalForms, ...triggerEntries]; - - // TextSearch uses static complexity routing for - // regex patterns: common regexes share bounded - // chunks, while high-risk patterns are isolated. - const tsRegex = new (getTextSearch())(regexAllPatterns); - const tsCustomRegex = new (getTextSearch())( - customRegexes.map((entry) => entry.pattern), - { - overlapStrategy: "all", - }, - ); - // ── Instance 2: deny-list + street-types + gaz ── // Deny-list and street-type patterns are plain // strings (allLiteral). Gazetteer adds exact @@ -316,12 +919,14 @@ export const buildUnifiedSearch = async ( } return entry.pattern; }; - const hasCustomDenyListPatterns = - denyListData?.sources.some((sources) => - sources.includes("custom-deny-list"), + const hasCustomLiteralBoundaryOverride = + denyListData?.originals.some( + (pattern, index) => + (denyListData.sources[index] ?? []).includes("custom-deny-list") && + !customDenyListNeedsWholeWords(pattern), ) ?? false; const canUseGlobalWholeWordLiterals = - !hasCustomDenyListPatterns && gazResult === null; + !hasCustomLiteralBoundaryOverride && gazResult === null; const literalAllPatterns: PatternEntry[] | string[] = canUseGlobalWholeWordLiterals ? [ @@ -343,10 +948,155 @@ export const buildUnifiedSearch = async ( ...(countryResult?.patterns ?? 
[]), ]; + return { + allRegex, + regexMeta, + customRegexes, + customRegexMeta, + legalForms, + triggers, + denyListData, + falsePositiveFilters, + streetTypes, + gazResult, + countryResult, + nativeLegalFormPatterns, + nativeLegalFormData, + nativeDateData, + nativeMonetaryData, + nativeSentenceTerminalCurrencyTerms, + nativeAddressSeedData: addressSeedData, + nativeZoneData: zoneData, + nativeAddressContextData: addressContextData, + nativeCoreferenceData: + coreferenceData === null + ? null + : { + ...coreferenceData, + legal_form_aliases: nativeLegalFormSuffixes, + organization_suffixes: nativeOrganizationSuffixes, + }, + nativeNameCorpusData, + nativeSigningPatterns, + partyPositionTerms, + hotwordRules, + nativeCurrencyPatternRange, + nativeDatePatternRange, + nativeSigningPatternRange, + nativeAllowedLabels: config.labels, + threshold: config.threshold, + confidenceBoost: config.enableConfidenceBoost, + slices: { + regex: regexSlice, + customRegex: customRegexSlice, + legalForms: legalFormsSlice, + triggers: triggersSlice, + denyList: denyListSlice, + streetTypes: streetTypesSlice, + gazetteer: gazetteerSlice, + countries: countriesSlice, + }, + literalAllPatterns, + canUseGlobalWholeWordLiterals, + customDenyListNeedsWholeWords, + }; +}; + +export const buildNativeStaticSearchBundle = async ( + config: PipelineConfig, + gazetteerEntries: GazetteerEntry[] = [], + ctx: PipelineContext = defaultContext, +): Promise => { + const sources = await buildUnifiedSearchSources( + config, + gazetteerEntries, + ctx, + ); + return { + nativeStaticConfig: buildNativeStaticConfig({ + regexPatterns: sources.allRegex, + regexMeta: sources.regexMeta, + customRegexes: sources.customRegexes, + customRegexMeta: sources.customRegexMeta, + denyListData: sources.denyListData, + falsePositiveFilters: sources.falsePositiveFilters, + triggerPatterns: sources.triggers.patterns, + triggerRules: sources.triggers.rules, + legalFormPatterns: sources.nativeLegalFormPatterns, + 
legalFormData: sources.nativeLegalFormData, + dateData: sources.nativeDateData, + monetaryData: sources.nativeMonetaryData, + sentenceTerminalCurrencyTerms: + sources.nativeSentenceTerminalCurrencyTerms, + addressSeedData: sources.nativeAddressSeedData, + zoneData: sources.nativeZoneData, + addressContextData: sources.nativeAddressContextData, + coreferenceData: sources.nativeCoreferenceData, + nameCorpusData: sources.nativeNameCorpusData, + nativeSigningPatterns: sources.nativeSigningPatterns, + partyPositionTerms: sources.partyPositionTerms, + hotwordRules: sources.hotwordRules, + streetTypes: sources.streetTypes, + omitRegexRanges: [ + sources.nativeCurrencyPatternRange, + sources.nativeDatePatternRange, + sources.nativeSigningPatternRange, + ], + gazetteerPatterns: sources.gazResult?.patterns ?? [], + gazetteerData: sources.gazResult?.data ?? null, + countryPatterns: sources.countryResult?.patterns ?? [], + countryData: sources.countryResult?.data ?? null, + canUseGlobalWholeWordLiterals: sources.canUseGlobalWholeWordLiterals, + customDenyListNeedsWholeWords: sources.customDenyListNeedsWholeWords, + allowedLabels: sources.nativeAllowedLabels, + threshold: sources.threshold, + confidenceBoost: sources.confidenceBoost, + }), + slices: sources.slices, + regexMeta: sources.regexMeta, + customRegexMeta: sources.customRegexMeta, + denyListData: sources.denyListData, + falsePositiveFilters: sources.falsePositiveFilters, + }; +}; + +export const buildUnifiedSearch = async ( + config: PipelineConfig, + gazetteerEntries: GazetteerEntry[] = [], + ctx: PipelineContext = defaultContext, +): Promise => { + const sources = await buildUnifiedSearchSources( + config, + gazetteerEntries, + ctx, + ); + const triggerEntries = sources.triggers.patterns.map((p) => ({ + pattern: p, + literal: true as const, + caseInsensitive: true, + })); + + const regexAllPatterns = [ + ...sources.allRegex, + ...sources.legalForms, + ...triggerEntries, + ]; + + // TextSearch uses static complexity 
routing for + // regex patterns: common regexes share bounded + // chunks, while high-risk patterns are isolated. + const tsRegex = new (getTextSearch())(regexAllPatterns); + const tsCustomRegex = new (getTextSearch())( + sources.customRegexes.map((entry) => entry.pattern), + { + overlapStrategy: "all", + }, + ); + const tsLiterals = - literalAllPatterns.length > 0 - ? new (getTextSearch())(literalAllPatterns, { - ...(canUseGlobalWholeWordLiterals + sources.literalAllPatterns.length > 0 + ? new (getTextSearch())(sources.literalAllPatterns, { + ...(sources.canUseGlobalWholeWordLiterals ? { allLiteral: true, wholeWords: true } : {}), caseInsensitive: true, @@ -354,25 +1104,763 @@ export const buildUnifiedSearch = async ( }) : new (getTextSearch())([]); + const nativeStaticConfig = buildNativeStaticConfig({ + regexPatterns: sources.allRegex, + regexMeta: sources.regexMeta, + customRegexes: sources.customRegexes, + customRegexMeta: sources.customRegexMeta, + denyListData: sources.denyListData, + falsePositiveFilters: sources.falsePositiveFilters, + triggerPatterns: sources.triggers.patterns, + triggerRules: sources.triggers.rules, + legalFormPatterns: sources.nativeLegalFormPatterns, + legalFormData: sources.nativeLegalFormData, + dateData: sources.nativeDateData, + monetaryData: sources.nativeMonetaryData, + sentenceTerminalCurrencyTerms: sources.nativeSentenceTerminalCurrencyTerms, + addressSeedData: sources.nativeAddressSeedData, + zoneData: sources.nativeZoneData, + addressContextData: sources.nativeAddressContextData, + coreferenceData: sources.nativeCoreferenceData, + nameCorpusData: sources.nativeNameCorpusData, + nativeSigningPatterns: sources.nativeSigningPatterns, + partyPositionTerms: sources.partyPositionTerms, + hotwordRules: sources.hotwordRules, + streetTypes: sources.streetTypes, + omitRegexRanges: [ + sources.nativeCurrencyPatternRange, + sources.nativeDatePatternRange, + sources.nativeSigningPatternRange, + ], + gazetteerPatterns: 
sources.gazResult?.patterns ?? [], + gazetteerData: sources.gazResult?.data ?? null, + countryPatterns: sources.countryResult?.patterns ?? [], + countryData: sources.countryResult?.data ?? null, + canUseGlobalWholeWordLiterals: sources.canUseGlobalWholeWordLiterals, + customDenyListNeedsWholeWords: sources.customDenyListNeedsWholeWords, + allowedLabels: sources.nativeAllowedLabels, + threshold: sources.threshold, + confidenceBoost: sources.confidenceBoost, + }); + return { tsRegex, tsCustomRegex, tsLiterals, + slices: sources.slices, + regexMeta: sources.regexMeta, + customRegexMeta: sources.customRegexMeta, + triggerRules: sources.triggers.rules, + denyListData: sources.denyListData, + gazetteerData: sources.gazResult?.data ?? null, + countryData: sources.countryResult?.data ?? null, + nativeStaticConfig, + }; +}; + +type BuildNativeStaticConfigArgs = { + regexPatterns: readonly PatternEntry[]; + regexMeta: readonly RegexMeta[]; + customRegexes: readonly { pattern: string }[]; + customRegexMeta: readonly RegexMeta[]; + denyListData: DenyListData | null; + falsePositiveFilters: DenyListFilterData; + triggerPatterns: readonly string[]; + triggerRules: readonly TriggerRule[]; + legalFormPatterns: readonly string[]; + legalFormData: NativeLegalFormData | null; + dateData: NativeDateData | null; + monetaryData: NativeMonetaryData | null; + sentenceTerminalCurrencyTerms: readonly string[]; + addressSeedData: NativeAddressSeedData | null; + zoneData: NativeZoneData | null; + addressContextData: NativeAddressContextData | null; + coreferenceData: NativeCoreferenceData | null; + nameCorpusData: NativeNameCorpusData | null; + nativeSigningPatterns: readonly string[]; + partyPositionTerms: readonly string[]; + hotwordRules: readonly HotwordRule[]; + omitRegexRanges?: readonly PatternSlice[]; + streetTypes: readonly string[]; + gazetteerPatterns: readonly PatternEntry[]; + gazetteerData: GazetteerData | null; + countryPatterns: readonly PatternEntry[]; + countryData: 
CountryData | null; + canUseGlobalWholeWordLiterals: boolean; + customDenyListNeedsWholeWords: (pattern: string) => boolean; + allowedLabels: readonly string[]; + threshold: number; + confidenceBoost: boolean; +}; + +const buildNativeStaticConfig = ({ + regexPatterns, + regexMeta, + customRegexes, + customRegexMeta, + denyListData, + falsePositiveFilters, + triggerPatterns, + triggerRules, + legalFormPatterns, + legalFormData, + dateData, + monetaryData, + sentenceTerminalCurrencyTerms, + addressSeedData, + zoneData, + addressContextData, + coreferenceData, + nameCorpusData, + nativeSigningPatterns, + partyPositionTerms, + hotwordRules, + omitRegexRanges, + streetTypes, + gazetteerPatterns, + gazetteerData, + countryPatterns, + countryData, + canUseGlobalWholeWordLiterals, + customDenyListNeedsWholeWords, + allowedLabels, + threshold, + confidenceBoost, +}: BuildNativeStaticConfigArgs): NativePreparedSearchConfig => { + const nativeRegexPatterns: NativeSearchPattern[] = []; + const nativeRegexMeta: NativeRegexMatchMeta[] = []; + for (const [index, pattern] of regexPatterns.entries()) { + if (omitRegexRanges?.some((range) => sliceContains(range, index))) { + continue; + } + const meta = regexMeta[index]; + if (!meta) { + continue; + } + nativeRegexPatterns.push(toNativeRegexPattern(pattern)); + nativeRegexMeta.push(toNativeRegexMeta(meta)); + } + for (const pattern of nativeSigningPatterns) { + nativeRegexPatterns.push(toNativeRegexPattern(pattern)); + nativeRegexMeta.push(toNativeRegexMeta(SIGNING_CLAUSE_META)); + } + + const nativeCustomRegexPatterns = customRegexes.map((entry) => ({ + kind: "regex" as const, + pattern: entry.pattern, + })); + const nativeCustomRegexMeta = customRegexMeta.map(toNativeRegexMeta); + const legalFormNativePatterns = legalFormPatterns.map( + toNativeLegalFormPattern, + ); + const triggerNativePatterns = triggerPatterns.map(toNativeTriggerPattern); + const streetTypeNativePatterns = addressSeedData + ? 
streetTypes.map((pattern) => + canUseGlobalWholeWordLiterals + ? toNativeGlobalLiteralPattern(pattern) + : toNativeDenyListPattern(pattern, true), + ) + : []; + const denyListPatternsFromData = + canUseGlobalWholeWordLiterals && denyListData !== null; + + const denyPatterns = + denyListData?.originals + .map((pattern, index) => { + if (denyListPatternsFromData) { + return null; + } + return toNativeDenyListPattern( + pattern, + stringArrayValue(denyListData.sources[index]).includes( + "custom-deny-list", + ) + ? customDenyListNeedsWholeWords(pattern) + : true, + ); + }) + .filter((pattern): pattern is NativeSearchPattern => pattern !== null) ?? + []; + const gazetteerNativePatterns = gazetteerPatterns.map(toNativeLiteralPattern); + const countryNativePatterns = countryPatterns.map((pattern) => + canUseGlobalWholeWordLiterals + ? toNativeGlobalLiteralPattern(patternEntryText(pattern)) + : toNativeLiteralPattern(pattern), + ); + let literalOffset = 0; + const denyListPatternCount = denyListPatternsFromData + ? (denyListData?.originals.length ?? 0) + : denyPatterns.length; + const denyListSlice = { + start: literalOffset, + end: literalOffset + denyListPatternCount, + }; + literalOffset = denyListSlice.end; + const streetTypesSlice = { + start: literalOffset, + end: literalOffset + streetTypeNativePatterns.length, + }; + literalOffset = streetTypesSlice.end; + const gazetteerSlice = { + start: literalOffset, + end: literalOffset + gazetteerNativePatterns.length, + }; + literalOffset = gazetteerSlice.end; + const countriesSlice = { + start: literalOffset, + end: literalOffset + countryNativePatterns.length, + }; + literalOffset = countriesSlice.end; + const hotwordsSlice = { + start: literalOffset, + end: literalOffset, + }; + const hasGazetteerFuzzyPatterns = + gazetteerData?.isFuzzy.some((isFuzzy) => isFuzzy) ?? 
false; + + const nativeConfig: NativePreparedSearchConfig = { + regex_patterns: nativeRegexPatterns, + custom_regex_patterns: nativeCustomRegexPatterns, + literal_patterns: [ + ...denyPatterns, + ...streetTypeNativePatterns, + ...gazetteerNativePatterns, + ...countryNativePatterns, + ], + regex_options: { + literal_case_insensitive: true, + literal_whole_words: false, + regex_whole_words: false, + }, + custom_regex_options: { + regex_whole_words: false, + regex_overlap_all: true, + }, + literal_options: { + literal_case_insensitive: true, + literal_whole_words: canUseGlobalWholeWordLiterals, + fuzzy_case_insensitive: true, + fuzzy_whole_words: !hasGazetteerFuzzyPatterns, + fuzzy_normalize_diacritics: true, + }, + literal_patterns_from_deny_list_data: denyListPatternsFromData, + allowed_labels: [...allowedLabels], + threshold, + confidence_boost: confidenceBoost, slices: { - regex: regexSlice, - customRegex: customRegexSlice, - legalForms: legalFormsSlice, - triggers: triggersSlice, - denyList: denyListSlice, - streetTypes: streetTypesSlice, + regex: { start: 0, end: nativeRegexPatterns.length }, + custom_regex: { start: 0, end: nativeCustomRegexPatterns.length }, + legal_forms: { + start: nativeRegexPatterns.length, + end: nativeRegexPatterns.length + legalFormNativePatterns.length, + }, + triggers: { + start: nativeRegexPatterns.length + legalFormNativePatterns.length, + end: + nativeRegexPatterns.length + + legalFormNativePatterns.length + + triggerNativePatterns.length, + }, + deny_list: denyListSlice, + street_types: streetTypesSlice, gazetteer: gazetteerSlice, countries: countriesSlice, + hotwords: hotwordsSlice, }, - regexMeta, - customRegexMeta, - triggerRules: triggers.rules, - denyListData, - gazetteerData: gazResult?.data ?? null, - countryData: countryResult?.data ?? 
null, + regex_meta: nativeRegexMeta, + custom_regex_meta: nativeCustomRegexMeta, + }; + nativeConfig.regex_patterns.push( + ...legalFormNativePatterns, + ...triggerNativePatterns, + ); + if (denyListData) { + nativeConfig.deny_list_data = toNativeDenyListData(denyListData); + } + nativeConfig.false_positive_filters = + toNativeDenyListFilters(falsePositiveFilters); + if (gazetteerData) { + nativeConfig.gazetteer_data = toNativeGazetteerData(gazetteerData); + } + if (countryData) { + nativeConfig.country_data = countryData; + } + if (hotwordRules.length > 0) { + nativeConfig.hotword_data = { + rules: hotwordRules.map(toNativeHotwordRule), + pattern_rule_indices: [], + }; + } + if (triggerRules.length > 0) { + nativeConfig.trigger_data = { + rules: triggerRules.map(toNativeTriggerRule), + address_stop_keywords: [...getAddressStopKeywordsSync()], + party_position_terms: [...partyPositionTerms], + post_nominals: [...POST_NOMINALS], + sentence_terminal_currency_terms: [...sentenceTerminalCurrencyTerms], + }; + } + if (legalFormData) { + nativeConfig.legal_form_data = legalFormData; + } + if (addressSeedData) { + nativeConfig.address_seed_data = addressSeedData; + } + if (zoneData) { + nativeConfig.zone_data = zoneData; + } + if (addressContextData) { + nativeConfig.address_context_data = addressContextData; + } + if (coreferenceData) { + nativeConfig.coreference_data = coreferenceData; + } + if (nameCorpusData) { + nativeConfig.name_corpus_data = nameCorpusData; + } + if (dateData) { + nativeConfig.date_data = dateData; + } + if (monetaryData) { + nativeConfig.monetary_data = monetaryData; + } + return nativeConfig; +}; + +const toNativeLegalFormPattern = (pattern: string): NativeSearchPattern => ({ + kind: "literal", + pattern, +}); + +const toNativeGazetteerData = (data: GazetteerData): NativeGazetteerData => ({ + labels: [...data.labels], + is_fuzzy: [...data.isFuzzy], +}); + +const toNativeTriggerPattern = (pattern: string): NativeSearchPattern => ({ + kind: 
"literal-with-options", + pattern, + case_insensitive: true, +}); + +const toNativeHotwordRule = (rule: HotwordRule): NativeHotwordRule => { + const result: NativeHotwordRule = { + hotwords: [...rule.hotwords], + target_labels: [...rule.targetLabels], + score_adjustment: rule.scoreAdjustment, + proximity_before: rule.proximityBefore, + proximity_after: rule.proximityAfter, + }; + if (rule.reclassifyTo !== undefined) { + result.reclassify_to = rule.reclassifyTo; + } + return result; +}; + +const toNativeTriggerRule = (rule: TriggerRule): NativeTriggerRule => ({ + trigger: rule.trigger, + label: rule.label, + strategy: toNativeTriggerStrategy(rule.strategy), + validations: rule.validations.map(toNativeTriggerValidation), + include_trigger: rule.includeTrigger, +}); + +const toNativeTriggerStrategy = ( + strategy: TriggerRule["strategy"], +): NativeTriggerStrategy => { + switch (strategy.type) { + case "to-next-comma": { + const result: NativeTriggerStrategy = { type: "to-next-comma" }; + if (strategy.stopWords !== undefined) { + result.stop_words = [...strategy.stopWords]; + } + if (strategy.maxLength !== undefined) { + result.max_length = strategy.maxLength; + } + return result; + } + case "to-end-of-line": + return { type: "to-end-of-line" }; + case "n-words": + return { type: "n-words", count: strategy.count }; + case "company-id-value": + return { type: "company-id-value" }; + case "address": { + const result: NativeTriggerStrategy = { type: "address" }; + if (strategy.maxChars !== undefined) { + result.max_chars = strategy.maxChars; + } + return result; + } + case "match-pattern": { + const result: NativeTriggerStrategy = { + type: "match-pattern", + pattern: strategy.pattern, + }; + if (strategy.flags !== undefined) { + result.flags = strategy.flags; + } + return result; + } + default: { + const _exhaustive: never = strategy; + throw new Error(`Unknown trigger strategy: ${String(_exhaustive)}`); + } + } +}; + +const toNativeTriggerValidation = ( + validation: 
TriggerRule["validations"][number], +): NativeTriggerValidation => { + switch (validation.type) { + case "starts-uppercase": + return { type: "starts-uppercase" }; + case "min-length": + return { type: "min-length", min: validation.min }; + case "max-length": + return { type: "max-length", max: validation.max }; + case "no-digits": + return { type: "no-digits" }; + case "has-digits": + return { type: "has-digits" }; + case "matches-pattern": { + const result: NativeTriggerValidation = { + type: "matches-pattern", + pattern: validation.re.source, + }; + if (validation.re.flags.length > 0) { + result.flags = validation.re.flags; + } + return result; + } + case "valid-id": + return { + type: "valid-id", + validator: validation.validator, + }; + default: { + const _exhaustive: never = validation; + throw new Error(`Unknown trigger validation: ${String(_exhaustive)}`); + } + } +}; + +const toNativeDenyListPattern = ( + pattern: string, + wholeWords: boolean, +): NativeSearchPattern => ({ + kind: "literal-with-options", + pattern, + case_insensitive: true, + whole_words: wholeWords, +}); + +const toNativeGlobalLiteralPattern = ( + pattern: string, +): NativeSearchPattern => ({ + kind: "literal", + pattern, +}); + +const toNativeRegexPattern = (entry: PatternEntry): NativeSearchPattern => { + const pattern: NativeSearchPattern = { + kind: "regex", + pattern: patternEntryText(entry), + }; + if ( + typeof entry === "string" || + entry instanceof RegExp || + entry.pattern instanceof RegExp + ) { + return pattern; + } + + const regexEntry = entry as { + lazy?: boolean; + prefilterAny?: readonly string[]; + prefilterCaseInsensitive?: boolean; + prefilterRegex?: RegExp; + }; + if (regexEntry.lazy !== undefined) { + pattern.lazy = regexEntry.lazy; + } + if (regexEntry.prefilterAny !== undefined) { + pattern.prefilter_any = [...regexEntry.prefilterAny]; + } + if (regexEntry.prefilterCaseInsensitive !== undefined) { + pattern.prefilter_case_insensitive = 
regexEntry.prefilterCaseInsensitive; + } + if (regexEntry.prefilterRegex !== undefined) { + pattern.prefilter_regex = toNativeRegexSource(regexEntry.prefilterRegex); + } + return pattern; +}; + +const toNativeRegexSource = (regex: RegExp): string => + regex.ignoreCase ? `(?i:${regex.source})` : regex.source; + +const toNativeLiteralPattern = (entry: PatternEntry): NativeSearchPattern => { + if (typeof entry === "string") { + return { kind: "literal", pattern: entry }; + } + if (entry instanceof RegExp) { + throw new Error("Native static config does not accept RegExp objects"); + } + if (entry.pattern instanceof RegExp) { + throw new Error("Native static config does not accept RegExp entries"); + } + if ("distance" in entry) { + const pattern: NativeSearchPattern = { + kind: "fuzzy", + pattern: entry.pattern, + }; + if (entry.distance !== "auto") { + pattern.distance = entry.distance; + } + return pattern; + } + if (entry.literal === true) { + const pattern: NativeSearchPattern = { + kind: "literal-with-options", + pattern: entry.pattern, + }; + if (entry.caseInsensitive !== undefined) { + pattern.case_insensitive = entry.caseInsensitive; + } + if (entry.wholeWords !== undefined) { + pattern.whole_words = entry.wholeWords; + } + return pattern; + } + return { kind: "regex", pattern: entry.pattern }; +}; + +const patternEntryText = (entry: PatternEntry): string => { + if (typeof entry === "string") { + return entry; + } + if (entry instanceof RegExp) { + return entry.source; + } + if (entry.pattern instanceof RegExp) { + return entry.pattern.source; + } + return entry.pattern; +}; + +const toNativeRegexMeta = (meta: RegexMeta): NativeRegexMatchMeta => { + const result: NativeRegexMatchMeta = { + label: meta.label, + score: meta.score, }; + if (meta.sourceDetail) { + result.source_detail = meta.sourceDetail; + } + if (meta.validator) { + const isSupportedValidator = nativeSupportsRegexMeta(meta); + if (!isSupportedValidator || !meta.validatorId) { + throw new Error( + 
`Native static config does not support regex validator ${meta.validatorId ?? "unknown"}`, + ); + } + result.requires_validation = true; + result.validator_id = meta.validatorId; + if (meta.validatorInputKind) { + result.validator_input = meta.validatorInputKind; + } + } + if (meta.minByteLength !== undefined) { + result.min_byte_length = meta.minByteLength; + } + return result; +}; + +const nativeSupportsRegexMeta = (meta: RegexMeta): boolean => { + if (!meta.validator) { + return true; + } + return ( + meta.validatorId !== undefined && + NATIVE_REGEX_VALIDATOR_IDS.has(meta.validatorId) && + (meta.validatorInputKind === undefined || + meta.validatorInputKind === "digits-only" || + meta.validatorInputKind === "crypto-wallet-candidate") + ); +}; + +const toNativeDenyListData = (data: DenyListData): NativeDenyListMatchData => { + const labelEncoder = createStringGroupEncoder(); + const sourceEncoder = createStringGroupEncoder(); + const result: NativeDenyListMatchData = { + label_table: labelEncoder.table, + label_indices: data.labels.map(labelEncoder.encode), + originals: data.originals, + source_table: sourceEncoder.table, + source_indices: data.sources.map(sourceEncoder.encode), + filters: toNativeDenyListFilters(data.filters), + }; + if (data.customLabels.length > 0) { + const customLabelIndices = data.originals.map((_, index) => + labelEncoder.encode(data.customLabels[index]), + ); + if (customLabelIndices.some((indices) => indices.length > 0)) { + result.custom_label_indices = customLabelIndices; + } + } + return result; +}; + +const sentenceTerminalCurrencyTerms = ( + monetaryData: NativeMonetaryData | null, +): string[] => { + if (monetaryData === null) { + return []; + } + return [ + ...new Set( + [ + ...monetaryData.currencies.codes, + ...monetaryData.currencies.symbols, + ...monetaryData.currencies.local_names, + ].filter((term) => term.length > 0), + ), + ].toSorted(); +}; + +const buildNativeCoreferenceData = async (): Promise => { + const [roleModule, 
determinerModule] = await Promise.all([ + import("./data/generic-roles.json"), + import("./data/coreference-org-determiners.json"), + ]); + const roleData = (roleModule.default ?? roleModule) as GenericRolesData; + const determinerData = (determinerModule.default ?? + determinerModule) as CoreferenceDeterminersData; + const configs = await loadLanguageConfigs( + "coreference", + (mod) => { + const moduleValue = mod as { + default?: readonly CoreferenceConfigRow[]; + }; + return moduleValue.default ?? (mod as readonly CoreferenceConfigRow[]); + }, + ); + const definitionPatterns: NativeCoreferencePatternData[] = []; + for (const rows of configs) { + for (const row of rows) { + definitionPatterns.push({ + pattern: row.pattern, + flags: row.flags, + }); + } + } + + return { + definition_patterns: definitionPatterns, + role_stop_terms: roleData.roles, + legal_form_aliases: [], + organization_suffixes: [], + organization_determiners: Object.entries(determinerData) + .flatMap(([language, values]) => { + if (language === "_comment" || !Array.isArray(values)) { + return []; + } + return values; + }) + .toSorted((left, right) => left.localeCompare(right)), + }; +}; + +const buildNativeZoneData = async (): Promise => { + const [headingModule, signingModule] = await Promise.all([ + import("./data/section-headings.json"), + import("./data/signing-clauses.json"), + ]); + const headingData = (headingModule.default ?? + headingModule) as SectionHeadingsConfig; + const signingData = (signingModule.default ?? + signingModule) as SigningClauseConfig; + + return { + section_heading_patterns: headingData.patterns.map((pattern) => ({ + pattern: pattern.re, + flags: pattern.flags, + })), + signing_clauses: signingData.patterns.map((pattern) => ({ + prefix: pattern.prefix ?? "", + suffix: pattern.suffix ?? "", + prepositions: pattern.prepositions ?? 
[], + })), + }; +}; + +const createStringGroupEncoder = (): { + table: string[]; + encode: (values: string | readonly string[] | undefined) => number[]; +} => { + const table: string[] = []; + const indexes = new Map(); + const encodeValue = (value: string): number => { + const existing = indexes.get(value); + if (existing !== undefined) { + return existing; + } + const index = table.length; + table.push(value); + indexes.set(value, index); + return index; + }; + return { + table, + encode: (values) => { + if (values === undefined) { + return []; + } + if (typeof values === "string") { + return [encodeValue(values)]; + } + const encoded: number[] = []; + for (const value of values) { + encoded.push(encodeValue(value)); + } + return encoded; + }, + }; +}; + +const toNativeDenyListFilters = ( + filters: DenyListData["filters"], +): NativeDenyListFilterData => ({ + stopwords: filters.stopwords, + allow_list: filters.allowList, + person_stopwords: filters.personStopwords, + person_trailing_nouns: filters.personTrailingNouns, + address_stopwords: filters.addressStopwords, + address_jurisdiction_prefixes: filters.addressJurisdictionPrefixes, + street_types: filters.streetTypes, + address_component_terms: filters.addressComponentTerms, + ambiguous_street_type_terms: filters.ambiguousStreetTypeTerms, + first_names: filters.firstNames, + generic_roles: filters.genericRoles, + number_abbrev_prefixes: filters.numberAbbrevPrefixes, + sentence_starters: filters.sentenceStarters, + trailing_address_word_exclusions: filters.trailingAddressWordExclusions, + document_heading_words: filters.documentHeadingWords, + document_heading_ordinal_markers: filters.documentHeadingOrdinalMarkers, + defined_term_cues: filters.definedTermCues, + signing_place_guards: filters.signingPlaceGuards.map((entry) => ({ + prefix_phrases: entry.prefixPhrases, + suffix_phrases: entry.suffixPhrases, + })), +}); + +const stringArrayValue = ( + value: string | readonly string[] | undefined, +): string[] => { 
+ if (value === undefined) { + return []; + } + if (typeof value === "string") { + return [value]; + } + return [...value]; }; diff --git a/packages/anonymize/src/context.ts b/packages/anonymize/src/context.ts index 470e0c93..7c1ed162 100644 --- a/packages/anonymize/src/context.ts +++ b/packages/anonymize/src/context.ts @@ -69,6 +69,9 @@ export type PipelineContext = { search: UnifiedSearchInstance | null; searchKey: string; searchPromise: Promise | null; + nativePipelinePackage: Uint8Array | null; + nativePipelinePackageKey: string; + nativePipelinePackagePromise: Promise | null; // ── Name corpus ─────────────────────────────── nameCorpus: NameCorpusData | null; @@ -82,6 +85,8 @@ export type PipelineContext = { allowListPromise: Promise> | null; personStopwords: ReadonlySet | null; personStopwordsPromise: Promise> | null; + definedTermHeads: ReadonlySet | null; + definedTermHeadsPromise: Promise> | null; addressStopwords: ReadonlySet | null; addressStopwordsPromise: Promise> | null; /** First-name exclusions for stopword filtering. 
*/ @@ -110,6 +115,9 @@ export const createPipelineContext = (): PipelineContext => ({ search: null, searchKey: "", searchPromise: null, + nativePipelinePackage: null, + nativePipelinePackageKey: "", + nativePipelinePackagePromise: null, nameCorpus: null, nameCorpusKey: "", @@ -121,6 +129,8 @@ export const createPipelineContext = (): PipelineContext => ({ allowListPromise: null, personStopwords: null, personStopwordsPromise: null, + definedTermHeads: null, + definedTermHeadsPromise: null, addressStopwords: null, addressStopwordsPromise: null, firstNameExclusions: null, diff --git a/packages/anonymize/src/data/address-boundaries.json b/packages/anonymize/src/data/address-boundaries.json index 12c5cc35..cbb8adc4 100644 --- a/packages/anonymize/src/data/address-boundaries.json +++ b/packages/anonymize/src/data/address-boundaries.json @@ -5,6 +5,7 @@ "jednajícím", "jejímž jménem", "kontaktní osoba", + "pokud", "zapsán", "zapsaná", "zapsané", @@ -35,6 +36,7 @@ "shall govern", "shall be governed", "to be enforced", + "with a copy", "with the intention", "without reference", "without regard" diff --git a/packages/anonymize/src/data/address-context.json b/packages/anonymize/src/data/address-context.json new file mode 100644 index 00000000..393a7981 --- /dev/null +++ b/packages/anonymize/src/data/address-context.json @@ -0,0 +1,72 @@ +{ + "_comment": "Address context guard words by language. 
These words can appear as '[Word] [number]' near address-like text in legal documents, but usually denote structure, payments, dates, or references rather than a bare street and house number.", + "bareHouseStopwords": { + "cs": [ + "Příloha", + "Smlouva", + "Článek", + "Dodatek", + "Celkem", + "Strana", + "Faktura", + "Částka", + "Položka", + "Kapitola", + "Zákon", + "Vyhláška", + "Nařízení", + "Usnesení", + "Rozsudek", + "Bod", + "Odstavec", + "Záloha", + "Zbývá", + "Dne", + "Platba", + "Datum", + "Splatnost", + "Variabilní", + "Konstantní", + "Specifický" + ], + "en": [ + "Section", + "Sections", + "Article", + "Articles", + "Schedule", + "Schedules", + "Exhibit", + "Exhibits", + "Annex", + "Annexes", + "Appendix", + "Appendices", + "Clause", + "Clauses", + "Chapter", + "Chapters", + "Paragraph", + "Paragraphs", + "Subsection", + "Subsections", + "Form", + "Page", + "Pages", + "Item", + "Items", + "Note", + "Notes", + "Rule", + "Rules", + "Attachment", + "Attachments", + "Volume", + "Volumes", + "Book", + "Books", + "Part", + "Parts" + ] + } +} diff --git a/packages/anonymize/src/data/address-jurisdiction-prefixes.json b/packages/anonymize/src/data/address-jurisdiction-prefixes.json new file mode 100644 index 00000000..f26e4fc1 --- /dev/null +++ b/packages/anonymize/src/data/address-jurisdiction-prefixes.json @@ -0,0 +1,4 @@ +{ + "_comment": "Address-like jurisdiction prefixes that are valid location/address spans without digits or street-type words. 
Lowercased and organized per language.", + "en": ["commonwealth of", "district of", "state of", "territory of"] +} diff --git a/packages/anonymize/src/data/address-stop-keywords.json b/packages/anonymize/src/data/address-stop-keywords.json index 98d9de5a..f7a0180c 100644 --- a/packages/anonymize/src/data/address-stop-keywords.json +++ b/packages/anonymize/src/data/address-stop-keywords.json @@ -17,6 +17,16 @@ "ičo", "ič" ], + "de": [ + "bank", + "bic", + "iban", + "steuer-id", + "steueridentifikationsnummer", + "steuernummer", + "ust-idnr", + "ust-idnr." + ], "en": ["e-mail", "email", "tel", "swift", "iban", "bic"], "pl": [ "nip", diff --git a/packages/anonymize/src/data/address-unit-abbreviations.json b/packages/anonymize/src/data/address-unit-abbreviations.json new file mode 100644 index 00000000..dfdf7cd7 --- /dev/null +++ b/packages/anonymize/src/data/address-unit-abbreviations.json @@ -0,0 +1,4 @@ +{ + "_comment": "Dotted address unit abbreviations that should not terminate address seed expansion. Organised per language because abbreviations are locale-specific.", + "en": ["apt.", "bldg.", "fl.", "ste.", "unit."] +} diff --git a/packages/anonymize/src/data/ambiguous-country-surfaces.json b/packages/anonymize/src/data/ambiguous-country-surfaces.json new file mode 100644 index 00000000..04962dd5 --- /dev/null +++ b/packages/anonymize/src/data/ambiguous-country-surfaces.json @@ -0,0 +1,4 @@ +{ + "_comment": "Country surface forms that collide with much more common non-country usage. 
Full country names and aliases remain registered separately when present.", + "words": ["indie", "island", "man", "norfolk"] +} diff --git a/packages/anonymize/src/data/clause-noun-heads.json b/packages/anonymize/src/data/clause-noun-heads.json index 937dc32a..b11bad4f 100644 --- a/packages/anonymize/src/data/clause-noun-heads.json +++ b/packages/anonymize/src/data/clause-noun-heads.json @@ -33,7 +33,11 @@ "přílohu", "dodatek", "dodatku", - "oznámení" + "článek", + "oznámení", + "podmínky", + "předmět", + "ustanovení" ], "de": [ "vertrag", diff --git a/packages/anonymize/src/data/coreference-org-determiners.json b/packages/anonymize/src/data/coreference-org-determiners.json new file mode 100644 index 00000000..c5579c22 --- /dev/null +++ b/packages/anonymize/src/data/coreference-org-determiners.json @@ -0,0 +1,8 @@ +{ + "_comment": "Organization reference determiners used before propagated bare organization names. Values are regex fragments grouped by language.", + "cs": ["společnost(?:i|í|em|u)?", "spolecnost(?:i|em|u)?"], + "de": ["die\\s+(?:gesellschaft|firma)"], + "en": ["the\\s+(?:company|corporation|firm)"], + "es": ["la\\s+(?:empresa|sociedad)", "el\\s+(?:empresa|sociedad)"], + "fr": ["la\\s+société"] +} diff --git a/packages/anonymize/src/data/defined-term-heads.json b/packages/anonymize/src/data/defined-term-heads.json new file mode 100644 index 00000000..aa1fffc2 --- /dev/null +++ b/packages/anonymize/src/data/defined-term-heads.json @@ -0,0 +1,4 @@ +{ + "_comment": "Common head nouns for capitalized defined/legal concepts. These are not person names by themselves; detector-specific filters assemble this vocabulary where needed. 
Lowercased and organized per language.", + "en": ["association", "period", "reform"] +} diff --git a/packages/anonymize/src/data/deny-list-filters.json b/packages/anonymize/src/data/deny-list-filters.json new file mode 100644 index 00000000..51b89152 --- /dev/null +++ b/packages/anonymize/src/data/deny-list-filters.json @@ -0,0 +1,48 @@ +{ + "en": { + "definedTermCues": [ + "mean", + "means", + "shall mean", + "shall means", + "shall have the meaning", + "shall have the meanings", + "refer to", + "refers to", + "has the meaning", + "has the meanings", + "is defined" + ], + "sentenceStarters": [ + "the", + "this", + "these", + "those", + "an", + "any", + "all", + "each", + "every", + "no", + "now", + "whereas", + "whereby", + "wherein", + "whereof", + "notwithstanding", + "subject", + "in", + "on", + "at", + "by", + "for", + "if", + "upon", + "unless", + "until", + "provided", + "pursuant", + "such" + ] + } +} diff --git a/packages/anonymize/src/data/false-positive-shapes.json b/packages/anonymize/src/data/false-positive-shapes.json new file mode 100644 index 00000000..e30eb764 --- /dev/null +++ b/packages/anonymize/src/data/false-positive-shapes.json @@ -0,0 +1,20 @@ +{ + "_comment": "Language-keyed lexical markers used by false-positive shape guards.", + "addressComponentTerms": { + "cs": ["č.p.", "č.ev.", "č.", "sídliště"] + }, + "ambiguousStreetTypeTerms": { + "fr": ["cours"] + }, + "numberAbbrevPrefixes": { + "cs": ["čís.", "č."], + "de": ["nr."], + "en": ["no.", "n."] + }, + "documentHeadingOrdinalMarkers": { + "cs": ["č.", "č"], + "de": ["nr.", "nr"], + "en": ["no.", "no", "n.", "n"], + "global": ["#"] + } +} diff --git a/packages/anonymize/src/data/language-scopes.json b/packages/anonymize/src/data/language-scopes.json new file mode 100644 index 00000000..5d9b85a5 --- /dev/null +++ b/packages/anonymize/src/data/language-scopes.json @@ -0,0 +1,73 @@ +{ + "_comment": "Default dictionary scopes for content language hints. 
Lower-level caller config can still override name corpus languages and deny-list countries independently.", + "languages": { + "cs": { + "nameCorpusLanguages": ["cs", "sk"], + "denyListCountries": ["CZ", "SK"] + }, + "de": { + "nameCorpusLanguages": ["de"], + "denyListCountries": ["DE", "AT", "CH"] + }, + "en": { + "nameCorpusLanguages": ["en"], + "denyListCountries": ["US", "GB", "CA", "AU", "IE"] + }, + "es": { + "nameCorpusLanguages": ["es"], + "denyListCountries": [ + "ES", + "MX", + "AR", + "CL", + "CO", + "PE", + "EC", + "VE", + "UY", + "PY", + "BO", + "CR", + "PA", + "DO", + "GT", + "HN", + "SV", + "NI", + "CU" + ] + }, + "fr": { + "nameCorpusLanguages": ["fr"], + "denyListCountries": ["FR", "BE", "CH", "CA", "LU", "MC"] + }, + "hu": { + "nameCorpusLanguages": ["hu"], + "denyListCountries": ["HU"] + }, + "it": { + "nameCorpusLanguages": ["it"], + "denyListCountries": ["IT", "CH"] + }, + "pl": { + "nameCorpusLanguages": ["pl"], + "denyListCountries": ["PL"] + }, + "pt-br": { + "nameCorpusLanguages": ["pt-br"], + "denyListCountries": ["BR"] + }, + "ro": { + "nameCorpusLanguages": ["ro"], + "denyListCountries": ["RO", "MD"] + }, + "sk": { + "nameCorpusLanguages": ["sk", "cs"], + "denyListCountries": ["SK", "CZ"] + }, + "sv": { + "nameCorpusLanguages": ["sv"], + "denyListCountries": ["SE", "FI"] + } + } +} diff --git a/packages/anonymize/src/data/legal-form-rule-words.json b/packages/anonymize/src/data/legal-form-rule-words.json new file mode 100644 index 00000000..d2d1c4fc --- /dev/null +++ b/packages/anonymize/src/data/legal-form-rule-words.json @@ -0,0 +1,27 @@ +{ + "connectorWords": ["a", "and", "und", "et", "e", "y", "i", "&"], + "andConnectorWords": ["and", "und", "et"], + "inNamePrepositions": ["of", "the"], + "companySuffixWords": [ + "Company", + "Co", + "Bank", + "Brothers", + "Bros", + "Sons", + "Group", + "Holdings", + "Trust", + "Partners", + "Associates", + "Corporation", + "Industries", + "Enterprises", + "Solutions", + "Systems", + "Services", + 
"Foundation", + "Institute" + ], + "commaGatedDirectPrefixes": ["among", "amongst", "between"] +} diff --git a/packages/anonymize/src/data/legal-role-heads.cs.json b/packages/anonymize/src/data/legal-role-heads.cs.json index ffab15d9..8c3debe5 100644 --- a/packages/anonymize/src/data/legal-role-heads.cs.json +++ b/packages/anonymize/src/data/legal-role-heads.cs.json @@ -28,6 +28,12 @@ "dodavatele", "odběratel", "odběratele", + "plátce", + "příjemce", + "uchazeč", + "uchazeče", + "zadavatel", + "zadavatele", "smluvní", "strana", "strany" diff --git a/packages/anonymize/src/data/name-corpus-cjk.json b/packages/anonymize/src/data/name-corpus-cjk.json new file mode 100644 index 00000000..aa9c3bb4 --- /dev/null +++ b/packages/anonymize/src/data/name-corpus-cjk.json @@ -0,0 +1,86 @@ +{ + "_comment": "CJK name-corpus heuristics organised by script language. Used by supplemental name-corpus detection.", + "zh": { + "nonPersonTerms": [ + "中国", + "中國", + "中文", + "人民", + "公司", + "香港", + "台湾", + "臺灣" + ], + "surnameStarters": [ + "王", + "李", + "张", + "張", + "刘", + "劉", + "陈", + "陳", + "杨", + "楊", + "黄", + "黃", + "赵", + "趙", + "吴", + "吳", + "周", + "徐", + "孙", + "孫", + "马", + "馬", + "朱", + "胡", + "郭", + "何", + "林", + "高", + "梁", + "郑", + "鄭", + "罗", + "羅", + "宋", + "谢", + "謝", + "唐", + "韩", + "韓", + "曹", + "许", + "許", + "邓", + "鄧", + "萧", + "蕭", + "田" + ] + }, + "ja": { + "nonPersonTerms": ["日本"], + "surnameStarters": ["山", "佐", "鈴", "渡", "伊", "中", "小", "吉"] + }, + "ko": { + "nonPersonTerms": ["韩国", "韓國"], + "surnameStarters": [ + "金", + "朴", + "박", + "김", + "이", + "최", + "정", + "강", + "조", + "윤", + "장", + "임", + "한" + ] + } +} diff --git a/packages/anonymize/src/data/name-corpus-particles.json b/packages/anonymize/src/data/name-corpus-particles.json new file mode 100644 index 00000000..da0f14c3 --- /dev/null +++ b/packages/anonymize/src/data/name-corpus-particles.json @@ -0,0 +1,13 @@ +{ + "_comment": "Language-specific particles and suffixes for supplemental name-corpus 
detection.", + "ar": { + "connectors": ["bin", "bint", "ibn", "al", "el"], + "hyphenatedPrefixes": ["al", "el"] + }, + "in": { + "relationConnectors": ["s/o", "d/o", "w/o", "r/o"] + }, + "ja-latn": { + "suffixes": ["san", "sama", "sensei"] + } +} diff --git a/packages/anonymize/src/data/organization-indicators.json b/packages/anonymize/src/data/organization-indicators.json new file mode 100644 index 00000000..033479c5 --- /dev/null +++ b/packages/anonymize/src/data/organization-indicators.json @@ -0,0 +1,36 @@ +{ + "_comment": "Organisation indicator words used to suppress person-name spans.", + "en": [ + "Group", + "Company", + "LLC", + "LLP", + "LP", + "Inc", + "Ltd", + "Corp", + "Corporation", + "Holdings", + "Partners", + "Association", + "University", + "Bank", + "Fund", + "Trust", + "Agency", + "Government", + "Ministry", + "Office", + "Department", + "Council", + "Board", + "Committee", + "Commission", + "Services", + "Solutions", + "Technologies", + "Systems", + "Analytics", + "Software" + ] +} diff --git a/packages/anonymize/src/data/organization-unit-heads.json b/packages/anonymize/src/data/organization-unit-heads.json new file mode 100644 index 00000000..78e4c8f9 --- /dev/null +++ b/packages/anonymize/src/data/organization-unit-heads.json @@ -0,0 +1,13 @@ +{ + "_comment": "Administrative or organizational unit nouns that can appear in legal prose without denoting a person or a street/city suffix. Lowercased and organized per language.", + "cs": [ + "agentura", + "inspekce", + "kancelář", + "odbor", + "oddělení", + "sekretariát", + "správa", + "úřad" + ] +} diff --git a/packages/anonymize/src/data/person-stopwords.json b/packages/anonymize/src/data/person-stopwords.json index ae496fd5..6fd797b8 100644 --- a/packages/anonymize/src/data/person-stopwords.json +++ b/packages/anonymize/src/data/person-stopwords.json @@ -1,5 +1,7 @@ { "_comment": "Words that are valid in other labels (address, org) but should never be classified as person. 
Checked only in person chain scoring.", + "cs": ["cena"], + "en": ["dodd-frank"], "words": [ "addendum", "agent", diff --git a/packages/anonymize/src/data/signing-clauses.json b/packages/anonymize/src/data/signing-clauses.json index e8c31718..c4a72d55 100644 --- a/packages/anonymize/src/data/signing-clauses.json +++ b/packages/anonymize/src/data/signing-clauses.json @@ -1,53 +1,69 @@ { - "_comment": "Signing clause patterns. Captures the place name from contract signing locations. Each entry: prefix (before place), suffix (after place), prepositions (allowed inside multi-word place names).", + "_comment": "Signing clause patterns. Captures the place name from contract signing locations. Each entry: prefix/suffix build regexes; guardPrefixPhrases/guardSuffixPhrases suppress deny-list place hits in the same signing context.", "patterns": [ { "lang": "cs", "prefix": "(?:V|Ve)\\s+", "suffix": "\\s*,?\\s*dne", - "prepositions": ["nad", "pod", "u", "ve", "na"] + "prepositions": ["nad", "pod", "u", "ve", "na"], + "guardPrefixPhrases": ["v", "ve"], + "guardSuffixPhrases": ["dne"] }, { "lang": "sk", "prefix": "(?:V|Vo)\\s+", "suffix": "\\s*,?\\s*dňa", - "prepositions": ["nad", "pod", "pri"] + "prepositions": ["nad", "pod", "pri"], + "guardPrefixPhrases": ["v", "vo"], + "guardSuffixPhrases": ["dňa"] }, { "lang": "de", "prefix": "", "suffix": "\\s*,\\s*den", - "prepositions": ["am", "an", "im"] + "prepositions": ["am", "an", "im"], + "guardPrefixPhrases": [""], + "guardSuffixPhrases": ["den"] }, { "lang": "fr", "prefix": "(?:Fait\\s+)?[Àà]\\s+", "suffix": "\\s*,?\\s*le", - "prepositions": [] + "prepositions": [], + "guardPrefixPhrases": ["à", "fait à"], + "guardSuffixPhrases": ["le"] }, { "lang": "en", "prefix": "(?:Signed|Executed)\\s+in\\s+", "suffix": "", - "prepositions": [] + "prepositions": [], + "guardPrefixPhrases": ["signed in", "executed in"], + "guardSuffixPhrases": [""] }, { "lang": "pl", "prefix": "(?:W|We)\\s+", "suffix": "\\s*,?\\s*dnia", - "prepositions": 
["nad", "pod", "przy"] + "prepositions": ["nad", "pod", "przy"], + "guardPrefixPhrases": ["w", "we"], + "guardSuffixPhrases": ["dnia"] }, { "lang": "it", "prefix": "(?:Fatto\\s+)?[Aa]\\s+", "suffix": "\\s*,?\\s*(?:il|lì)", - "prepositions": [] + "prepositions": [], + "guardPrefixPhrases": ["a", "fatto a"], + "guardSuffixPhrases": ["il", "lì"] }, { "lang": "es", "prefix": "(?:Firmado\\s+)?[Ee]n\\s+", "suffix": "\\s*,?\\s*(?:a|el)", - "prepositions": ["de", "del"] + "prepositions": ["de", "del"], + "guardPrefixPhrases": ["en", "firmado en"], + "guardSuffixPhrases": ["a", "el"] } ] } diff --git a/packages/anonymize/src/detectors/address-seeds.ts b/packages/anonymize/src/detectors/address-seeds.ts index 45878665..3e2c0bea 100644 --- a/packages/anonymize/src/detectors/address-seeds.ts +++ b/packages/anonymize/src/detectors/address-seeds.ts @@ -80,7 +80,14 @@ type Seed = { type DictionaryConfig = Record; +export type AddressSeedData = { + boundary_words: string[]; + br_cep_cue_words: string[]; + unit_abbreviations: string[]; +}; + let cachedBoundaryRe: RegExp | null = null; +let addressSeedDataPromise: Promise | null = null; const loadBoundaryWords = async (): Promise => { try { @@ -91,6 +98,24 @@ const loadBoundaryWords = async (): Promise => { } }; +const loadFieldStopWords = async (): Promise => { + try { + const mod = await import("../data/address-stop-keywords.json"); + return mod.default as DictionaryConfig; + } catch { + return {}; + } +}; + +const loadUnitAbbreviations = async (): Promise => { + try { + const mod = await import("../data/address-unit-abbreviations.json"); + return mod.default as DictionaryConfig; + } catch { + return {}; + } +}; + // ── pt-BR CEP context gating ──────────────────────── // // The bare `\d{5}-\d{3}` CEP shape collides with non- @@ -339,6 +364,64 @@ export const buildStreetTypePatterns = async (): Promise => { return streetTypePatternsPromise; }; +export const getAddressSeedData = async (): Promise => { + addressSeedDataPromise ??= 
(async () => { + const [boundaryWords, fieldStopWords, unitAbbreviations, brCueWords] = + await Promise.all([ + loadBoundaryWords(), + loadFieldStopWords(), + loadUnitAbbreviations(), + loadBrCueWords(), + ]); + return { + boundary_words: flattenDictionaries([boundaryWords, fieldStopWords]), + br_cep_cue_words: [...brCueWords], + unit_abbreviations: flattenDictionaries([unitAbbreviations]), + }; + })(); + return addressSeedDataPromise; +}; + +const flattenDictionaries = ( + configs: readonly DictionaryConfig[], +): string[] => { + const words: string[] = []; + const seen = new Set(); + for (const config of configs) { + for (const word of flattenDictionary(config)) { + const key = word.toLowerCase(); + if (seen.has(key)) { + continue; + } + seen.add(key); + words.push(word); + } + } + return words; +}; + +const flattenDictionary = (config: DictionaryConfig): string[] => { + const words: string[] = []; + const seen = new Set(); + for (const values of Object.values(config)) { + if (!Array.isArray(values)) { + continue; + } + for (const word of values) { + if (typeof word !== "string" || word.length === 0) { + continue; + } + const key = word.toLowerCase(); + if (seen.has(key)) { + continue; + } + seen.add(key); + words.push(word); + } + } + return words; +}; + // ── Seed collection ───────────────────────────────── const collectSeeds = ( diff --git a/packages/anonymize/src/detectors/countries.ts b/packages/anonymize/src/detectors/countries.ts index a21834aa..385a932f 100644 --- a/packages/anonymize/src/detectors/countries.ts +++ b/packages/anonymize/src/detectors/countries.ts @@ -4,6 +4,7 @@ import { DETECTION_SOURCES } from "../constants"; import type { Entity } from "../types"; import { normalizeForSearch } from "../util/normalize"; +import ambiguousCountrySurfaces from "../data/ambiguous-country-surfaces.json" with { type: "json" }; import countriesData from "../data/countries.json" with { type: "json" }; const ENTITY_LABEL = "country"; @@ -28,7 +29,7 @@ const 
INCLUDE_ALPHA2 = false; * All would flag every English occurrence as a country. */ const NAME_BLOCKLIST: ReadonlySet = new Set( - ["man", "island", "indie"].map((s) => s.toLowerCase()), + ambiguousCountrySurfaces.words.map((surface) => surface.toLowerCase()), ); /** diff --git a/packages/anonymize/src/detectors/deny-list.ts b/packages/anonymize/src/detectors/deny-list.ts index b1ea6075..3ea43ae1 100644 --- a/packages/anonymize/src/detectors/deny-list.ts +++ b/packages/anonymize/src/detectors/deny-list.ts @@ -18,9 +18,16 @@ import type { import type { PipelineContext } from "../context"; import { defaultContext } from "../context"; import { loadGenericRoles } from "../filters/false-positives"; +import { buildStreetTypePatterns } from "./address-seeds"; +import { + getClauseNounHeadsSync, + getLegalRoleHeadsSync, + warmLegalRoleHeads, +} from "./legal-forms"; import { normalizeForSearch } from "../util/normalize"; import { ALL_UPPER_RE, UPPER_START_RE } from "../util/text"; import { DASH } from "../util/char-groups"; +import denyListFiltersByLanguage from "../data/deny-list-filters.json"; export type DenyListConfig = Pick< PipelineConfig, @@ -35,6 +42,33 @@ export type DenyListConfig = Pick< | "enableCountries" >; +const lowerSortedUnique = (values: Iterable): string[] => + [...new Set([...values].map((value) => value.toLowerCase()))].toSorted(); + +const collectLanguageWordValues = (data: Record): string[] => { + const words: string[] = []; + const append = (value: unknown): void => { + if (!Array.isArray(value)) { + return; + } + for (const word of value) { + if (typeof word === "string" && word.length > 0) { + words.push(word); + } + } + }; + + append(data["words"]); + for (const [key, value] of Object.entries(data)) { + if (key === "words" || key.startsWith("_")) { + continue; + } + append(value); + } + + return lowerSortedUnique(words); +}; + // ── Allow list (lazy-loaded from JSON) ─────────────── const loadAllowList = (ctx: PipelineContext): Promise> => { @@ 
-268,10 +302,12 @@ const loadPersonStopwords = ( } ctx.personStopwordsPromise = (async () => { try { - const mod: { - default?: { words?: string[] }; - } = await import("../data/person-stopwords.json"); - const set: ReadonlySet = new Set(mod.default?.words ?? []); + const mod = await import("../data/person-stopwords.json"); + const parsed = + (mod as { default?: Record }).default ?? mod; + const set: ReadonlySet = new Set( + collectLanguageWordValues(parsed as Record), + ); ctx.personStopwords = set; return set; } catch { @@ -289,6 +325,37 @@ const EMPTY_PERSON_STOPWORDS: ReadonlySet = new Set(); export const getPersonStopwords = (ctx: PipelineContext): ReadonlySet => ctx.personStopwords ?? EMPTY_PERSON_STOPWORDS; +export const loadDefinedTermHeads = ( + ctx: PipelineContext, +): Promise> => { + if (ctx.definedTermHeadsPromise) { + return ctx.definedTermHeadsPromise; + } + ctx.definedTermHeadsPromise = (async () => { + try { + const mod = await import("../data/defined-term-heads.json"); + const parsed = + (mod as { default?: Record }).default ?? mod; + const set: ReadonlySet = new Set( + collectLanguageWordValues(parsed as Record), + ); + ctx.definedTermHeads = set; + return set; + } catch { + const empty: ReadonlySet = new Set(); + ctx.definedTermHeads = empty; + return empty; + } + })(); + return ctx.definedTermHeadsPromise; +}; + +const EMPTY_DEFINED_TERM_HEADS: ReadonlySet = new Set(); + +export const getDefinedTermHeads = ( + ctx: PipelineContext, +): ReadonlySet => ctx.definedTermHeads ?? EMPTY_DEFINED_TERM_HEADS; + // ── Address stopwords (single-token city collisions) ── const loadAddressStopwords = ( @@ -420,46 +487,90 @@ const hasAdjacentAddressEvidence = ( return streetRe !== null && streetRe.test(window); }; -/** - * Capitalised words that almost never start a person name. 
When a - * single-token surname candidate is immediately followed by one of - * these, the "next-word is uppercase" promotion heuristic would - * otherwise turn section headings ("Purchase Price↵The Purchaser - * undertakes…") into spurious person hits. Kept narrow on purpose; - * the surrounding pipeline still chains real names via the deny-list - * cascade when both halves are surnames. - */ -const SENTENCE_STARTER_WORDS: ReadonlySet = new Set([ - "The", - "This", - "These", - "Those", - "An", - "Any", - "All", - "Each", - "Every", - "No", - "Now", - "Whereas", - "Whereby", - "Wherein", - "Whereof", - "Notwithstanding", - "Subject", - "In", - "On", - "At", - "By", - "For", - "If", - "Upon", - "Unless", - "Until", - "Provided", - "Pursuant", - "Such", -]); +type DenyListLanguageFilters = { + sentenceStarters?: readonly string[]; + definedTermCues?: readonly string[]; +}; + +type FalsePositiveShapeFilters = { + addressComponentTerms: string[]; + ambiguousStreetTypeTerms: string[]; + numberAbbrevPrefixes: string[]; + documentHeadingOrdinalMarkers: string[]; +}; + +type SigningClauseData = { + patterns: readonly { + guardPrefixPhrases?: readonly string[]; + guardSuffixPhrases?: readonly string[]; + }[]; +}; + +export type DenyListFilterData = { + stopwords: string[]; + allowList: string[]; + personStopwords: string[]; + personTrailingNouns: string[]; + addressStopwords: string[]; + addressJurisdictionPrefixes: string[]; + streetTypes: string[]; + addressComponentTerms: string[]; + ambiguousStreetTypeTerms: string[]; + firstNames: string[]; + genericRoles: string[]; + numberAbbrevPrefixes: string[]; + sentenceStarters: string[]; + trailingAddressWordExclusions: string[]; + documentHeadingWords: string[]; + documentHeadingOrdinalMarkers: string[]; + definedTermCues: string[]; + signingPlaceGuards: DenyListSigningPlaceGuardData[]; +}; + +export type DenyListSigningPlaceGuardData = { + prefixPhrases: string[]; + suffixPhrases: string[]; +}; + +const 
DENY_LIST_FILTER_GROUPS: readonly DenyListLanguageFilters[] = + Object.values(denyListFiltersByLanguage); + +const escapeRegExp = (value: string): string => + value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + +const collectLanguageFilterValues = ( + selector: (filters: DenyListLanguageFilters) => readonly string[] | undefined, +): string[] => + lowerSortedUnique( + DENY_LIST_FILTER_GROUPS.flatMap((filters) => selector(filters) ?? []), + ); + +const DENY_LIST_STATIC_FILTERS = { + definedTermCues: collectLanguageFilterValues( + (filters) => filters.definedTermCues, + ), + sentenceStarters: collectLanguageFilterValues( + (filters) => filters.sentenceStarters, + ), +}; + +const SENTENCE_STARTER_WORDS: ReadonlySet = new Set( + DENY_LIST_STATIC_FILTERS.sentenceStarters, +); + +const buildDefinedTermCueRe = (): RegExp => { + const cues = DENY_LIST_STATIC_FILTERS.definedTermCues.toSorted( + (left, right) => right.length - left.length, + ); + if (cues.length === 0) { + return /$(?!)/; + } + const pattern = cues + .map((cue) => escapeRegExp(cue).replace(/\s+/g, "\\s+")) + .join("|"); + return new RegExp(`^[\\s,]*(?:${pattern})\\b`, "iu"); +}; +const DEFINED_TERM_CUE_RE = buildDefinedTermCueRe(); const PERSON_CHAIN_BREAK_RE = /[!?;:]|,/u; const WORD_CHAR_RE = /[\p{L}\p{N}]/u; @@ -601,6 +712,7 @@ export type DenyListData = { originals: string[]; /** Maps pattern index → source types (plural). 
*/ sources: PatternSources[]; + filters: DenyListFilterData; }; const getCityEntries = ( @@ -660,14 +772,18 @@ export const buildDenyList = async ( loadStopwords(ctx), loadAllowList(ctx), loadPersonStopwords(ctx), + loadDefinedTermHeads(ctx), loadAddressStopwords(ctx), loadCommonWords(), loadMonthNames(), loadStreetTypeRe(), loadGenericRoles(ctx), + warmLegalRoleHeads(), + loadTrailingAddressWordExclusions(), ]); const commonWords = await loadCommonWords(); const monthNames = await loadMonthNames(); + const filters = await buildDenyListFilterData(ctx); const dictionaries = config.dictionaries; const hasDenyList = dictionaries?.denyList && dictionaries?.denyListMeta; @@ -683,7 +799,7 @@ export const buildDenyList = async ( // No dictionary data available — skip deny-list building if (!hasDenyList && !hasCities && !hasCustomDenyList) { // Still build name corpus entries if available - return buildNameCorpusOnly(config, ctx); + return buildNameCorpusOnly(config, ctx, filters); } const excluded = config.denyListExcludeCategories; @@ -835,6 +951,7 @@ export const buildDenyList = async ( customLabels: customLabelList, originals: patternList, sources: sourceList, + filters, }; }; @@ -847,6 +964,7 @@ export const buildDenyList = async ( const buildNameCorpusOnly = ( config: DenyListConfig, ctx: PipelineContext, + filters: DenyListFilterData, ): DenyListData | null => { if (!config.enableNameCorpus) { return null; @@ -882,6 +1000,7 @@ const buildNameCorpusOnly = ( customLabels: customLabelList, originals: patternList, sources: sourceList, + filters, }; }; @@ -979,6 +1098,207 @@ type RawMatch = { patternIdx: number; }; +const buildStreetTypeFilterValues = async (): Promise => + lowerSortedUnique(await buildStreetTypePatterns()); + +type SigningPlaceFilters = { + guards: DenyListSigningPlaceGuardData[]; +}; + +let signingPlaceFiltersPromise: Promise | null = null; + +const loadSigningPlaceFilters = (): Promise => { + if (signingPlaceFiltersPromise) { + return 
signingPlaceFiltersPromise; + } + + signingPlaceFiltersPromise = (async () => { + const mod = await import("../data/signing-clauses.json"); + const data: SigningClauseData = mod.default ?? mod; + return { + guards: data.patterns + .map((entry) => ({ + prefixPhrases: lowerSortedUnique(entry.guardPrefixPhrases ?? []), + suffixPhrases: lowerSortedUnique(entry.guardSuffixPhrases ?? []), + })) + .filter( + (entry) => + entry.prefixPhrases.length > 0 && entry.suffixPhrases.length > 0, + ), + }; + })().catch((error) => { + signingPlaceFiltersPromise = null; + throw error; + }); + + return signingPlaceFiltersPromise; +}; + +let trailingAddressWordExclusionsPromise: Promise> | null = + null; +let documentHeadingWordsPromise: Promise | null = null; +let addressJurisdictionPrefixesPromise: Promise | null = null; + +const loadLanguageWordFile = async ( + importer: () => Promise, +): Promise => { + const mod = await importer(); + const parsed = (mod as { default?: Record }).default ?? mod; + return collectLanguageWordValues(parsed as Record); +}; + +const isRecord = (value: unknown): value is Record => + typeof value === "object" && value !== null && !Array.isArray(value); + +const languageWordValues = (value: unknown): string[] => + isRecord(value) ? collectLanguageWordValues(value) : []; + +let falsePositiveShapeFiltersPromise: Promise | null = + null; + +const loadFalsePositiveShapeFilters = + (): Promise => { + if (falsePositiveShapeFiltersPromise) { + return falsePositiveShapeFiltersPromise; + } + + falsePositiveShapeFiltersPromise = (async () => { + const mod = await import("../data/false-positive-shapes.json"); + const defaultValue = isRecord(mod) ? 
mod.default : undefined; + let data: Record = {}; + if (isRecord(defaultValue)) { + data = defaultValue; + } else if (isRecord(mod)) { + data = mod; + } + return { + addressComponentTerms: languageWordValues( + data["addressComponentTerms"], + ), + ambiguousStreetTypeTerms: languageWordValues( + data["ambiguousStreetTypeTerms"], + ), + numberAbbrevPrefixes: languageWordValues(data["numberAbbrevPrefixes"]), + documentHeadingOrdinalMarkers: languageWordValues( + data["documentHeadingOrdinalMarkers"], + ), + }; + })().catch((error) => { + falsePositiveShapeFiltersPromise = null; + throw error; + }); + + return falsePositiveShapeFiltersPromise; + }; + +const loadDocumentHeadingWords = (): Promise => { + if (documentHeadingWordsPromise) { + return documentHeadingWordsPromise; + } + + documentHeadingWordsPromise = loadLanguageWordFile( + () => import("../data/document-structure-headings.json"), + ).catch((error) => { + documentHeadingWordsPromise = null; + throw error; + }); + + return documentHeadingWordsPromise; +}; + +const loadTrailingAddressWordExclusions = async (): Promise< + ReadonlySet +> => { + if (trailingAddressWordExclusionsPromise) { + return trailingAddressWordExclusionsPromise; + } + + trailingAddressWordExclusionsPromise = (async () => { + await warmLegalRoleHeads(); + const [organizationUnits, documentHeadings] = await Promise.all([ + loadLanguageWordFile( + () => import("../data/organization-unit-heads.json"), + ), + loadDocumentHeadingWords(), + ]); + return new Set( + lowerSortedUnique([ + ...getLegalRoleHeadsSync(), + ...getClauseNounHeadsSync(), + ...organizationUnits, + ...documentHeadings, + ]), + ); + })().catch((error) => { + trailingAddressWordExclusionsPromise = null; + throw error; + }); + + return trailingAddressWordExclusionsPromise; +}; + +const loadAddressJurisdictionPrefixes = (): Promise => { + if (addressJurisdictionPrefixesPromise) { + return addressJurisdictionPrefixesPromise; + } + + addressJurisdictionPrefixesPromise = 
loadLanguageWordFile( + () => import("../data/address-jurisdiction-prefixes.json"), + ).catch((error) => { + addressJurisdictionPrefixesPromise = null; + throw error; + }); + + return addressJurisdictionPrefixesPromise; +}; + +export const buildDenyListFilterData = async ( + ctx: PipelineContext, +): Promise => { + const [ + signingPlaceFilters, + trailingAddressWordExclusions, + addressJurisdictionPrefixes, + falsePositiveShapeFilters, + documentHeadingWords, + ] = await Promise.all([ + loadSigningPlaceFilters(), + loadTrailingAddressWordExclusions(), + loadAddressJurisdictionPrefixes(), + loadFalsePositiveShapeFilters(), + loadDocumentHeadingWords(), + ]); + + return { + stopwords: [...getStopwords(ctx)], + allowList: [...getAllowList(ctx)], + personStopwords: [...getPersonStopwords(ctx)], + personTrailingNouns: [...getDefinedTermHeads(ctx)], + addressStopwords: [...getAddressStopwords(ctx)], + addressJurisdictionPrefixes, + streetTypes: await buildStreetTypeFilterValues(), + addressComponentTerms: falsePositiveShapeFilters.addressComponentTerms, + ambiguousStreetTypeTerms: + falsePositiveShapeFilters.ambiguousStreetTypeTerms, + firstNames: [...getNameCorpusFirstNames(ctx)], + genericRoles: [ + ...(ctx.genericRoles ?? 
EMPTY_GENERIC_ROLES), + ...getLegalRoleHeadsSync(), + ], + numberAbbrevPrefixes: falsePositiveShapeFilters.numberAbbrevPrefixes, + sentenceStarters: [...DENY_LIST_STATIC_FILTERS.sentenceStarters], + trailingAddressWordExclusions: [...trailingAddressWordExclusions], + documentHeadingWords, + documentHeadingOrdinalMarkers: + falsePositiveShapeFilters.documentHeadingOrdinalMarkers, + definedTermCues: [...DENY_LIST_STATIC_FILTERS.definedTermCues], + signingPlaceGuards: signingPlaceFilters.guards.map((entry) => ({ + prefixPhrases: [...entry.prefixPhrases], + suffixPhrases: [...entry.suffixPhrases], + })), + }; +}; + const customMatchHasValidEdges = ( fullText: string, start: number, @@ -1020,9 +1340,13 @@ export const ensureDenyListData = async ( loadStopwords(ctx), loadAllowList(ctx), loadPersonStopwords(ctx), + loadDefinedTermHeads(ctx), loadAddressStopwords(ctx), loadStreetTypeRe(), loadGenericRoles(ctx), + warmLegalRoleHeads(), + loadTrailingAddressWordExclusions(), + loadAddressJurisdictionPrefixes(), ]); }; @@ -1262,14 +1586,9 @@ export const processDenyListMatches = ( continue; } - // Skip the trailing-capitalised-word extension when the - // chain sits inside a defined-term quote - // (`"Bond Hedge Transactions"`, `"Blue Sky Laws"`). - // Legal prose uses curly or straight quotes to introduce - // capitalised noun phrases that are not personal names; - // chaining beyond the name corpus inside that bracketed - // context produces unstable spans like - // `"Bond Hedge Transactions"`-as-person. + // Skip extension inside quoted defined-term contexts: + // legal prose often uses quoted capitalised noun phrases + // that are not personal names. const insideDefinedTermQuote = isSuppressibleDefinedTermQuote( fullText, first.start, @@ -1285,28 +1604,22 @@ export const processDenyListMatches = ( // Score: chained names get 0.9, single names 0.5 const score = chain.length >= 2 ? 
0.9 : 0.5; - // Single-word deny-list matches are too noisy: - // "Rate", "Server", "Code" etc. are surnames but - // also common English words. Only accept single- - // word matches when the next word is also uppercase - // (likely a full name: "Alena Zemanová"). Skip - // sentence-starter articles ("The Purchaser…") - // which otherwise turn section headings like - // "Purchase Price↵The Purchaser…" into person hits. + // Single-word deny-list matches are noisy. Only accept + // them when the next token has the shape of a name word, + // while excluding language-data sentence starters. if (chain.length === 1) { const afterEnd = last.end; const rest = fullText.slice(afterEnd).trimStart(); - // Require Cap + lowercase: filters out acronyms like - // "EU", "USA" so "Rady EU" doesn't read as a name. + // Require Cap + lowercase so acronym-shaped tokens + // do not promote a single-token hit. const nextIsUpper = rest.length > 1 && /^\p{Lu}\p{Ll}/u.test(rest); if (!nextIsUpper) { continue; } - // Reject sentence-starter articles ("The Purchaser…") - // so section headings followed by a sentence don't - // get promoted to person hits. + // Reject sentence starters so headings followed by + // prose do not get promoted to person hits. const nextWord = /^\p{L}+/u.exec(rest)?.[0] ?? ""; - if (SENTENCE_STARTER_WORDS.has(nextWord)) { + if (SENTENCE_STARTER_WORDS.has(nextWord.toLowerCase())) { continue; } } @@ -1326,7 +1639,11 @@ export const processDenyListMatches = ( // "Praha 1", "Brno 2"). Czech and Slovak cities // commonly have numbered districts that are part of // the address. - extendCityDistricts(results, fullText); + extendCityDistricts( + results, + fullText, + new Set(data.filters.trailingAddressWordExclusions), + ); return results; }; @@ -1358,45 +1675,11 @@ const POSTAL_PREFIX_RE = new RegExp( `(?:\\d{5}|\\d{3}\\s\\d{2})\\s*${DASH}?\\s*$`, ); -// Words that must NOT be absorbed into an address span -// when they follow a postal-code + city pattern. 
Party -// roles, organizational nouns, and common legal terms. -const TRAILING_WORD_EXCLUSIONS: ReadonlySet = new Set([ - // CZ/SK party roles - "nájemce", - "pronajímatel", - "kupující", - "prodávající", - "objednatel", - "zhotovitel", - "dodavatel", - "odběratel", - "věřitel", - "dlužník", - "zadavatel", - "uchazeč", - "příjemce", - "plátce", - // Organizational nouns - "správa", - "sekretariát", - "kancelář", - "odbor", - "oddělení", - "úřad", - "inspekce", - "agentura", - // Legal clause starters - "článek", - "smlouva", - "dodatek", - "příloha", - "předmět", - "podmínky", - "ustanovení", -]); - -const extendCityDistricts = (entities: Entity[], fullText: string): void => { +const extendCityDistricts = ( + entities: Entity[], + fullText: string, + trailingAddressWordExclusions: ReadonlySet, +): void => { for (const entity of entities) { if (entity.label !== "address") { continue; @@ -1446,7 +1729,7 @@ const extendCityDistricts = (entities: Entity[], fullText: string): void => { const trailingWordM = /^[\s]{1,4}(\p{Lu}\p{Ll}+)/u.exec(afterExt); if (trailingWordM && !trailingWordM[0].includes("\n")) { const candidate = (trailingWordM[1] ?? "").toLowerCase(); - if (!TRAILING_WORD_EXCLUSIONS.has(candidate)) { + if (!trailingAddressWordExclusions.has(candidate)) { entity.end += trailingWordM[0].length; entity.text = fullText.slice(entity.start, entity.end); } @@ -1456,30 +1739,20 @@ const extendCityDistricts = (entities: Entity[], fullText: string): void => { /** * Extend a person name match to include subsequent - * capitalized words. "Pavel" + " Heřmánek" → "Pavel - * Heřmánek". Stops at lowercase words, punctuation, - * or end of text. Also extends backward if preceded - * by a capitalized word (for "Miroslav Braňka" when - * only "Braňka" matched). + * capitalized words. Stops at lowercase words, + * punctuation, or end of text. 
*/ /** * Defined-term marker: an opening typographic or straight * quote enclosing the chain start, AND a closing quote - * within a short window followed by a - * definitional cue (`means`, `shall mean`, `shall have - * the meaning(s)`, `refers to`). Legal documents reserve - * this construction for defined terms; the contents are - * not personal names even when individual tokens collide - * with the name corpus. - * - * Plain quotations like `"John Unknown" said ...` do NOT - * count: there is no definitional cue, so the trailing - * surname extension is still allowed to absorb `Unknown`. + * within a short window followed by a language-data + * definitional cue. Legal documents reserve this + * construction for defined terms; the contents are not + * personal names even when individual tokens collide with + * the name corpus. */ const OPENING_QUOTES = new Set(['"', "'", "“", "„", "‟", "‘", "‛", "«"]); const CLOSING_QUOTES = new Set(['"', "'", "”", "’", "»", "“"]); -const DEFINED_TERM_CUE_RE = - /^[\s,]*(?:means?|shall\s+means?|shall\s+have\s+the\s+meanings?|refers?\s+to|has\s+the\s+meanings?|is\s+defined)\b/iu; const DEFINED_TERM_LOOKAHEAD = 120; const DEFINED_TERM_LOOKBEHIND = 80; const EMPTY_GENERIC_ROLES: ReadonlySet = new Set(); @@ -1591,12 +1864,9 @@ const isSuppressibleDefinedTermQuote = ( const words = definedTermQuote.content.match(WORD_RE) ?? []; - // A quoted defined term can itself be a real person: - // `"John Smith" shall mean the employee...`. Preserve those - // when the definition itself points at a legal/business role - // from dictionary data. Legal terms such as `"Bond Hedge"` - // stay suppressible even if their first token collides with - // a given-name corpus entry. + // A quoted defined term can itself be a real person. + // Preserve those when the definition points at a role from + // dictionary data. 
if ( words.length >= 2 && startsWithKnownFirstName(definedTermQuote.content, ctx) && @@ -1638,31 +1908,17 @@ const extendPersonName = ( wordEnd++; } - // Skip trailing punctuation (commas, periods, - // typographic closing quotes). Curly quotes survive - // normalisation because they often appear inside - // defined-term clauses (`"Blue Sky Laws"`); strip - // them so the allow-list / stopword check sees the - // bare word. + // Skip trailing punctuation and typographic closing + // quotes so stopword checks see the bare word. const word = text.slice(wordStart, wordEnd); const stripped = word.replace(/[,;.”"’'“»]+$/, ""); if (stripped.length < 2) { break; } - // Don't extend into stopwords or person stopwords. - // The global allow list is intentionally NOT consulted - // here: real surnames such as `Law`, `Tesla`, or - // `Vote` are common English words and live on the - // allow list to suppress single-token noise, but they - // are legitimate name extensions when preceded by a - // first name in plain prose (`John Law`, `Elon - // Tesla`). Defined-term contexts (`"Blue Sky Laws"`, - // `"Bond Hedge Transactions"`) are filtered earlier by - // `isInsideDefinedTermQuote`, so by the time - // `extendPersonName` runs we are in ordinary prose and - // the allow-list block would only swallow real - // surnames. + // Do not consult the global allow list here: common + // words can be legitimate name extensions once a first + // name has established person context. 
const lower = stripped.toLowerCase(); if (getStopwords(ctx).has(lower) || getPersonStopwords(ctx).has(lower)) { break; diff --git a/packages/anonymize/src/detectors/legal-forms.ts b/packages/anonymize/src/detectors/legal-forms.ts index bfd0dad3..1e74d055 100644 --- a/packages/anonymize/src/detectors/legal-forms.ts +++ b/packages/anonymize/src/detectors/legal-forms.ts @@ -157,7 +157,7 @@ const loadLeadingClauseTrims = async (): Promise => { return leadingClauseTrimsPromise; }; -const getLeadingClauseTrimsSync = (): LeadingClauseTrims => +export const getLeadingClauseTrimsSync = (): LeadingClauseTrims => leadingClauseTrimsCache ?? EMPTY_LEADING_CLAUSE_TRIMS; // Generic legal/contract role words that should never appear @@ -301,7 +301,7 @@ const loadAllLegalSuffixes = async (): Promise => { const getAllLegalSuffixesSync = (): readonly string[] => allLegalSuffixesCache ?? LEGAL_SUFFIXES; -const getNormalizedLegalBoundarySuffixesSync = (): ReadonlySet => +export const getNormalizedLegalBoundarySuffixesSync = (): ReadonlySet => normalizedLegalBoundarySuffixesCache ?? new Set( LEGAL_SUFFIXES.map(normalizeLegalSuffixToken).filter( @@ -309,7 +309,7 @@ const getNormalizedLegalBoundarySuffixesSync = (): ReadonlySet => ), ); -const getNormalizedInNameLegalFormWordsSync = (): ReadonlySet => +export const getNormalizedInNameLegalFormWordsSync = (): ReadonlySet => normalizedInNameLegalFormWordsCache ?? new Set(); /** @@ -418,7 +418,7 @@ const loadConnectorProseHeads = async (): Promise> => { return connectorProseHeadsPromise; }; -const getConnectorProseHeadsSync = (): ReadonlySet => +export const getConnectorProseHeadsSync = (): ReadonlySet => connectorProseHeadsCache ?? 
new Set(); let structuralSingleCapPrefixesCache: ReadonlySet | null = null; @@ -473,7 +473,7 @@ const loadStructuralSingleCapPrefixes = async (): Promise< return structuralSingleCapPrefixesPromise; }; -const getStructuralSingleCapPrefixesSync = (): ReadonlySet => +export const getStructuralSingleCapPrefixesSync = (): ReadonlySet => structuralSingleCapPrefixesCache ?? new Set(); // Used by the trim helpers below to escape literal suffix tokens diff --git a/packages/anonymize/src/detectors/regex.ts b/packages/anonymize/src/detectors/regex.ts index 71a6bcab..9c119e8e 100644 --- a/packages/anonymize/src/detectors/regex.ts +++ b/packages/anonymize/src/detectors/regex.ts @@ -1,4 +1,4 @@ -import type { Match } from "@stll/text-search"; +import type { Match, PatternEntry } from "@stll/text-search"; import type { Validator } from "@stll/stdnum"; import { at, @@ -76,6 +76,26 @@ const escapeRegexPhrase = (s: string): string => /** Escape for use inside a regex character class. */ const escapeCharClass = (s: string): string => s.replace(/[\]\\^-]/g, "\\$&"); +const utf8ByteLength = (text: string): number => { + let length = 0; + for (const char of text) { + const codePoint = char.codePointAt(0); + if (codePoint === undefined) { + continue; + } + if (codePoint <= 0x7f) { + length += 1; + } else if (codePoint <= 0x7ff) { + length += 2; + } else if (codePoint <= 0xffff) { + length += 3; + } else { + length += 4; + } + } + return length; +}; + const toSortedAlternation = (values: readonly string[]): string => [ ...new Set( @@ -141,21 +161,44 @@ export type RegexMeta = { label: string; score: number; sourceDetail?: Entity["sourceDetail"]; + minByteLength?: number; /** Post-match stdnum validator for confirmation. */ validator?: Validator; + validatorId?: string; /** Extract the identifier portion when context is part of the regex span. 
*/ validatorInput?: (text: string) => string; + validatorInputKind?: "digits-only" | "crypto-wallet-candidate"; }; type RegexDef = { pattern: string; label: string; score: number; + minByteLength?: number; + lazy?: true; + prefilterAny?: readonly string[]; + prefilterCaseInsensitive?: boolean; + prefilterRegex?: RegExp; validator?: Validator; + validatorId?: string; validatorInput?: (text: string) => string; + validatorInputKind?: "digits-only" | "crypto-wallet-candidate"; +}; + +type RegexPatternEntry = { + pattern: string; + literal?: false; + lazy?: true; + prefilterAny?: readonly string[]; + prefilterCaseInsensitive?: boolean; + prefilterRegex?: RegExp; }; type AmountWordsConfig = { + patterns?: Array<{ + lang: string; + keywords: string[]; + }>; percentages?: Array<{ lang: string; keywords: string[]; @@ -180,6 +223,144 @@ type AmountWordsConfig = { const AMOUNT_WORDS = amountWordsConfig as AmountWordsConfig; +const DIGITS_ONLY_VALIDATOR_INPUT = (text: string): string => + text.replace(/\D/g, ""); + +const VALIDATOR_IDS = new Map([ + [at.businessid, "at.businessid"], + [at.tin, "at.tin"], + [at.uid, "at.uid"], + [au.abn, "au.abn"], + [au.acn, "au.acn"], + [be.nn, "be.nn"], + [be.vat, "be.vat"], + [bg.vat, "bg.vat"], + [br.cnpj, "br.cnpj"], + [br.cpf, "br.cpf"], + [ch.uid, "ch.uid"], + [cn.ric, "cn.ric"], + [crypto.wallet, "crypto.wallet"], + [cy.vat, "cy.vat"], + [cz.dic, "cz.dic"], + [cz.rc, "cz.rc"], + [de.idnr, "de.idnr"], + [de.stnr, "de.stnr"], + [de.svnr, "de.svnr"], + [de.vat, "de.vat"], + [dk.cpr, "dk.cpr"], + [dk.vat, "dk.vat"], + [ee.ik, "ee.ik"], + [ee.vat, "ee.vat"], + [es.cif, "es.cif"], + [es.dni, "es.dni"], + [es.nie, "es.nie"], + [es.nss, "es.nss"], + [es.vat, "es.vat"], + [fi.hetu, "fi.hetu"], + [fi.vat, "fi.vat"], + [fi.ytunnus, "fi.ytunnus"], + [fr.nir, "fr.nir"], + [fr.siren, "fr.siren"], + [fr.siret, "fr.siret"], + [fr.tva, "fr.tva"], + [gb.nhs, "gb.nhs"], + [gb.nino, "gb.nino"], + [gb.vat, "gb.vat"], + [gr.vat, "gr.vat"], + [hr.vat, 
"hr.vat"], + [hu.vat, "hu.vat"], + [ie.pps, "ie.pps"], + [ie.vat, "ie.vat"], + [it.codiceFiscale, "it.codiceFiscale"], + [it.iva, "it.iva"], + [lt.asmens, "lt.asmens"], + [lt.vat, "lt.vat"], + [lu.vat, "lu.vat"], + [lv.vat, "lv.vat"], + [mt.vat, "mt.vat"], + [nl.vat, "nl.vat"], + [no.mva, "no.mva"], + [no.orgnr, "no.orgnr"], + [pl.nip, "pl.nip"], + [pl.pesel, "pl.pesel"], + [pt.cc, "pt.cc"], + [pt.vat, "pt.vat"], + [ro.cnp, "ro.cnp"], + [ro.vat, "ro.vat"], + [se.personnummer, "se.personnummer"], + [si.vat, "si.vat"], + [sk.dic, "sk.dic"], + [us.ein, "us.ein"], +]); + +export const NATIVE_REGEX_VALIDATOR_IDS: ReadonlySet = new Set([ + "au.abn", + "au.acn", + "at.businessid", + "at.tin", + "at.uid", + "be.nn", + "be.vat", + "bg.vat", + "br.cnpj", + "br.cpf", + "ch.uid", + "cn.ric", + "crypto.wallet", + "cy.vat", + "cz.dic", + "cz.rc", + "de.idnr", + "de.stnr", + "de.svnr", + "de.vat", + "dk.cpr", + "dk.vat", + "ee.ik", + "ee.vat", + "es.cif", + "es.dni", + "es.nie", + "es.nss", + "es.vat", + "fi.hetu", + "fi.vat", + "fi.ytunnus", + "fr.nir", + "fr.siren", + "fr.siret", + "fr.tva", + "gb.nhs", + "gb.nino", + "gb.vat", + "gr.vat", + "hr.vat", + "hu.vat", + "ie.pps", + "ie.vat", + "it.codiceFiscale", + "it.iva", + "lt.asmens", + "lt.vat", + "lu.vat", + "lv.vat", + "mt.vat", + "nl.vat", + "no.mva", + "no.orgnr", + "pl.nip", + "pl.pesel", + "pt.cc", + "pt.vat", + "ro.cnp", + "ro.vat", + "se.personnummer", + "si.vat", + "sk.dic", + "us.ein", + "us.rtn", +]); + // ── stdnum validator entries ──────────────────────── // Each entry pairs a @stll/stdnum validator with a // label and confidence score. 
The pattern derived via @@ -457,6 +638,9 @@ const EMAIL: RegexDef = { pattern: `\\b[\\w.+\\-]+@[\\w\\-]+(?:\\.[\\w\\-]+)+\\b`, label: "email address", score: 1, + lazy: true, + prefilterAny: ["@"], + prefilterCaseInsensitive: false, }; // [^\S\n] instead of \s: separators must not @@ -468,6 +652,7 @@ const INTL_PHONE: RegexDef = { `(?:[^\\S\\n]|[.\\-])?\\d{0,4}\\b`, label: "phone number", score: 1, + minByteLength: MIN_PHONE_LENGTH, }; // Czech phone numbers: mobiles start with 6/7, @@ -483,6 +668,7 @@ const CZ_PHONE: RegexDef = { `(?![^\\S\\n]*(?:Kč|,-|korun|EUR|USD|€|\\$))\\b`, label: "phone number", score: 0.85, + minByteLength: MIN_PHONE_LENGTH, }; /** @@ -498,6 +684,10 @@ const TEL_PREFIX_PHONE: RegexDef = { `(?:[^\\S\\n]|[.\\-])?\\d{3}\\b`, label: "phone number", score: 0.95, + minByteLength: MIN_PHONE_LENGTH, + lazy: true, + prefilterAny: ["tel", "telefon"], + prefilterCaseInsensitive: true, }; /** @@ -518,6 +708,7 @@ const US_PAREN_PHONE: RegexDef = { `\\(\\d{3}\\)(?:[^\\S\\n]|[.\\-])?\\d{3}` + `(?:[^\\S\\n]|[.\\-])\\d{4}\\b`, label: "phone number", score: 0.9, + minByteLength: MIN_PHONE_LENGTH, }; const CREDIT_CARD: RegexDef = { @@ -597,6 +788,7 @@ const HU_LANDLINE: RegexDef = { `(?:[^\\S\\n]|[.\\-])?\\d{4}\\b`, label: "phone number", score: 0.9, + minByteLength: MIN_PHONE_LENGTH, }; // Czech license plates (SPZ/RZ). @@ -627,6 +819,9 @@ const ES_POSTAL: RegexDef = { `[^\\S\\n]{0,3}:?[^\\S\\n]{0,3}\\d{5}\\b`, label: "address", score: 0.7, + lazy: true, + prefilterAny: ["C.P", "CP", "código postal", "codigo postal"], + prefilterCaseInsensitive: true, }; // Spanish DNI: 8 digits + 1 letter. 
Letter is a @@ -668,7 +863,11 @@ const NHS_NUMBER_CONTEXT: RegexDef = { label: "national identification number", score: 0.95, validator: gb.nhs, - validatorInput: (text) => text.replace(/\D/g, ""), + validatorInput: DIGITS_ONLY_VALIDATOR_INPUT, + validatorInputKind: "digits-only", + lazy: true, + prefilterAny: ["NHS", "National Health Service"], + prefilterCaseInsensitive: true, }; const PASSPORT_CONTEXT: RegexDef = { @@ -682,6 +881,9 @@ const PASSPORT_CONTEXT: RegexDef = { `(?:[A-Za-z]{1,2}\\d{6,8}|\\d{2}[A-Za-z]{2}\\d{5}|\\d{7,9})\\b`, label: "passport number", score: 0.96, + lazy: true, + prefilterAny: ["passport"], + prefilterCaseInsensitive: true, }; const FR_CNI_CONTEXT: RegexDef = { @@ -696,6 +898,9 @@ const FR_CNI_CONTEXT: RegexDef = { `)\\b`, label: "identity card number", score: 0.96, + lazy: true, + prefilterAny: ["CNI", "carte nationale", "French national identity card"], + prefilterCaseInsensitive: true, }; const CY_TIC_CONTEXT: RegexDef = { @@ -707,6 +912,9 @@ const CY_TIC_CONTEXT: RegexDef = { `\\d{8}[A-Za-z]\\b`, label: "tax identification number", score: 0.96, + lazy: true, + prefilterAny: ["TIC", "tax identification code"], + prefilterCaseInsensitive: true, }; const CY_ID_CARD_CONTEXT: RegexDef = { @@ -718,6 +926,9 @@ const CY_ID_CARD_CONTEXT: RegexDef = { `\\d{6,8}\\b`, label: "identity card number", score: 0.96, + lazy: true, + prefilterAny: ["Cyprus", "Cypriot", "identity card", "ID card"], + prefilterCaseInsensitive: true, }; const UK_DRIVING_LICENCE_CONTEXT: RegexDef = { @@ -729,6 +940,9 @@ const UK_DRIVING_LICENCE_CONTEXT: RegexDef = { `[A-Za-z9]{5}\\d{6}[A-Za-z0-9]{2}\\d[A-Za-z]{2}\\b`, label: "identity card number", score: 0.96, + lazy: true, + prefilterAny: ["driving licence", "driving license"], + prefilterCaseInsensitive: true, }; const US_DRIVER_LICENSE_CONTEXT: RegexDef = { @@ -743,6 +957,20 @@ const US_DRIVER_LICENSE_CONTEXT: RegexDef = { `)\\b`, label: "identity card number", score: 0.8, + lazy: true, + prefilterAny: [ + "driver 
license", + "driver licence", + "drivers license", + "drivers licence", + "driver's license", + "driver's licence", + "driver’s license", + "driver’s licence", + "driving license", + "driving licence", + ], + prefilterCaseInsensitive: true, }; const MEDICAL_LICENSE_CONTEXT: RegexDef = { @@ -757,6 +985,18 @@ const MEDICAL_LICENSE_CONTEXT: RegexDef = { `(?:[A-Za-z]{0,3}\\d{5,8}|\\d{2}[A-Za-z]\\d{4}[A-Za-z])\\b`, label: "registration number", score: 0.85, + lazy: true, + prefilterAny: [ + "GMC", + "NMC", + "medical", + "physician", + "doctor", + "surgeon", + "nursing", + "nurse", + ], + prefilterCaseInsensitive: true, }; const CRYPTO_WALLET_CANDIDATE = crypto.wallet.candidatePattern ?? "(?!)"; @@ -775,6 +1015,10 @@ const CRYPTO_WALLET_ADDRESS: RegexDef = { score: 0.85, validator: crypto.wallet, validatorInput: getCryptoWalletCandidate, + validatorInputKind: "crypto-wallet-candidate", + lazy: true, + prefilterAny: ["0x", "bc1", "BTC", "Bitcoin", "crypto", "wallet", "address"], + prefilterCaseInsensitive: true, }; const AU_ABN_FORMATTED: RegexDef = { @@ -798,6 +1042,9 @@ const NO_MVA_FORMATTED: RegexDef = { label: "tax identification number", score: 0.95, validator: no.mva, + lazy: true, + prefilterAny: ["MVA"], + prefilterCaseInsensitive: false, }; const US_EIN_FORMATTED: RegexDef = { @@ -860,6 +1107,9 @@ const BR_RG_WITH_SSP: RegexDef = { `[^\\S\\n]+SSP(?:/[A-Z]{2})?\\b`, label: "national identification number", score: 0.95, + lazy: true, + prefilterAny: ["SSP"], + prefilterCaseInsensitive: false, }; // Brazilian OAB (lawyer registration). 
Format: @@ -872,6 +1122,9 @@ const BR_OAB: RegexDef = { `(?:\\d{1,3}(?:\\.\\d{3})+|\\d{4,6})\\b`, label: "registration number", score: 0.95, + lazy: true, + prefilterAny: ["OAB/"], + prefilterCaseInsensitive: false, }; // URL: scheme + host + optional port + path + query + @@ -889,6 +1142,9 @@ const URL: RegexDef = { `(?:[/?#][^\\s)\\]>]*[^\\s.,;:!?)\\]>])?`, label: "url", score: 1, + lazy: true, + prefilterAny: ["http://", "https://", "http:", "https:", "www."], + prefilterCaseInsensitive: false, }; // Bare domain: no protocol/www prefix, ends with a @@ -1002,6 +1258,9 @@ const TIME_12H: RegexDef = { `(?=[\\s,;!?)]|$)`, label: "date", score: 0.9, + lazy: true, + prefilterAny: ["am", "pm", "a.m", "p.m"], + prefilterCaseInsensitive: true, }; const PERCENT_NUMBER_BODY = `(?:\\d{1,3}(?:[.,]\\d{3})+(?:[.,]\\d{1,4})?|\\d+(?:[.,]\\d{1,4})?)`; @@ -1054,6 +1313,9 @@ const PERCENT_RATE: RegexDef = { `)(?![\\p{L}\\p{N}_])`, label: "monetary amount", score: 0.85, + lazy: true, + prefilterAny: ["%"], + prefilterCaseInsensitive: false, }; // ── Collected definitions ──────────────────────────── @@ -1129,6 +1391,36 @@ export const REGEX_PATTERNS: readonly string[] = ALL_REGEX_DEFS.map( (d) => d.pattern, ); +const toRegexPatternEntry = (definition: RegexDef): PatternEntry => { + if ( + definition.lazy === undefined && + definition.prefilterAny === undefined && + definition.prefilterCaseInsensitive === undefined && + definition.prefilterRegex === undefined + ) { + return definition.pattern; + } + + const entry: RegexPatternEntry = { pattern: definition.pattern }; + if (definition.lazy !== undefined) { + entry.lazy = definition.lazy; + } + if (definition.prefilterAny !== undefined) { + entry.prefilterAny = definition.prefilterAny; + } + if (definition.prefilterCaseInsensitive !== undefined) { + entry.prefilterCaseInsensitive = definition.prefilterCaseInsensitive; + } + if (definition.prefilterRegex !== undefined) { + entry.prefilterRegex = definition.prefilterRegex; + } + return 
entry; +}; + +/** Static regex entries with compile-time prefilter hints. */ +export const REGEX_PATTERN_ENTRIES: readonly PatternEntry[] = + ALL_REGEX_DEFS.map(toRegexPatternEntry); + /** Parallel metadata. Index = pattern index. */ export const REGEX_META: readonly RegexMeta[] = ALL_REGEX_DEFS.map( (d): RegexMeta => { @@ -1138,9 +1430,21 @@ export const REGEX_META: readonly RegexMeta[] = ALL_REGEX_DEFS.map( }; if (d.validator) { meta.validator = d.validator; + const validatorId = d.validatorId ?? VALIDATOR_IDS.get(d.validator); + if (!validatorId) { + throw new Error(`Missing regex validator id for ${d.label}`); + } + meta.validatorId = validatorId; + } + if (d.minByteLength) { + meta.minByteLength = d.minByteLength; } if (d.validatorInput) { meta.validatorInput = d.validatorInput; + if (!d.validatorInputKind) { + throw new Error(`Missing regex validator input kind for ${d.label}`); + } + meta.validatorInputKind = d.validatorInputKind; } return meta; }, @@ -1155,6 +1459,9 @@ export const REGEX_META: readonly RegexMeta[] = ALL_REGEX_DEFS.map( */ type DateMonths = Record; +export type DateMonthData = Record; +export type YearWordData = Record; + /** * Build month-name alternation from date-months.json. * Deduplicates across all 22 languages, filters names @@ -1183,6 +1490,18 @@ const buildMonthAlternation = (months: DateMonths): string => { .join("|"); }; +const buildDateMonthData = (months: DateMonths): DateMonthData => { + const result: DateMonthData = {}; + for (const [key, value] of Object.entries(months)) { + if (key.startsWith("_")) continue; + const names = Array.isArray(value) ? value : [value]; + result[key] = names.filter( + (name) => name.replace(/\.$/, "").length >= MIN_MONTH_NAME_LENGTH, + ); + } + return result; +}; + /** * Build date patterns from a month-name alternation. * Returns 6 patterns covering the major written-date @@ -1218,13 +1537,19 @@ const buildDatePatternsFromMonths = (alt: string): string[] => { /** Cached promise for date patterns. 
Loaded once. */ let datePatternPromise: Promise | null = null; +let dateMonthDataPromise: Promise | null = null; +let yearWordDataPromise: Promise | null = null; -const loadDatePatterns = async (): Promise => { +const loadDateMonths = async (): Promise => { const mod = await import("../data/date-months.json"); // Dynamic import of JSON returns { default, ...keys }. // Use `default` if present (ESM wrapper), else the // module itself. - const months: DateMonths = mod.default ?? mod; + return mod.default ?? mod; +}; + +const loadDatePatterns = async (): Promise => { + const months = await loadDateMonths(); const alt = buildMonthAlternation(months); return buildDatePatternsFromMonths(alt); }; @@ -1244,6 +1569,35 @@ export const getDatePatterns = (): Promise => { return datePatternPromise; }; +export const getDateMonthData = (): Promise => { + if (!dateMonthDataPromise) { + dateMonthDataPromise = loadDateMonths() + .then(buildDateMonthData) + .catch((err) => { + dateMonthDataPromise = null; + throw err; + }); + } + return dateMonthDataPromise; +}; + +export const getYearWordData = (): Promise => { + yearWordDataPromise ??= import("../data/year-words.json").then((mod) => { + const data = (mod.default ?? mod) as Record; + const result: YearWordData = {}; + for (const [key, words] of Object.entries(data)) { + if (key.startsWith("_") || !Array.isArray(words)) { + continue; + } + result[key] = words.filter( + (word): word is string => typeof word === "string" && word.length > 0, + ); + } + return result; + }); + return yearWordDataPromise; +}; + /** Date pattern metadata (all are score 1 dates). 
*/ export const DATE_PATTERN_META: Readonly = Object.freeze({ label: "date", @@ -1262,6 +1616,28 @@ type CurrenciesData = { localNames?: string[]; }; +export type MonetaryData = { + currencies: { + codes: string[]; + symbols: string[]; + local_names: string[]; + }; + amount_words: { + written_amount_patterns: Array<{ + keywords: string[]; + }>; + magnitude_suffixes: Array<{ + words: string[]; + abbreviations_case_insensitive: string[]; + abbreviations_case_sensitive: string[]; + }>; + share_quantity_terms: Array<{ + modifiers: string[]; + nouns: string[]; + }>; + }; +}; + type FinancialLexicons = { magnitudeOptional: string; magnitudeRequired: string; @@ -1698,6 +2074,7 @@ const buildCurrencyPatternEntries = ( /** Cached promise for currency patterns. Loaded once. */ let currencyPatternPromise: Promise | null = null; let currencyPatternEntryPromise: Promise | null = null; +let monetaryDataPromise: Promise | null = null; const loadCurrencyPatternEntries = async (): Promise< CurrencyPatternEntry[] @@ -1710,6 +2087,37 @@ const loadCurrencyPatternEntries = async (): Promise< const loadCurrencyPatterns = async (): Promise => (await loadCurrencyPatternEntries()).map((entry) => entry.pattern); +const loadMonetaryData = async (): Promise => { + const mod = await import("../data/currencies.json"); + const currencies: CurrenciesData = mod.default ?? mod; + return { + currencies: { + codes: currencies.codes, + symbols: currencies.symbols, + local_names: currencies.localNames ?? [], + }, + amount_words: { + written_amount_patterns: (AMOUNT_WORDS.patterns ?? []).map((entry) => ({ + keywords: entry.keywords, + })), + magnitude_suffixes: (AMOUNT_WORDS.magnitudeSuffixes ?? []).map( + (entry) => ({ + words: entry.words ?? [], + abbreviations_case_insensitive: + entry.abbreviationsCaseInsensitive ?? [], + abbreviations_case_sensitive: entry.abbreviationsCaseSensitive ?? [], + }), + ), + share_quantity_terms: (AMOUNT_WORDS.shareQuantityTerms ?? 
[]).map( + (entry) => ({ + modifiers: entry.modifiers ?? [], + nouns: entry.nouns, + }), + ), + }, + }; +}; + /** * Get dynamically built monetary amount patterns from * currencies.json. Returns a cached promise; the JSON @@ -1737,6 +2145,16 @@ export const getCurrencyPatternEntries = (): Promise< return currencyPatternEntryPromise; }; +export const getMonetaryData = (): Promise => { + if (!monetaryDataPromise) { + monetaryDataPromise = loadMonetaryData().catch((err) => { + monetaryDataPromise = null; + throw err; + }); + } + return monetaryDataPromise; +}; + /** Currency pattern metadata (score 0.9). */ export const CURRENCY_PATTERN_META: Readonly = Object.freeze({ label: "monetary amount", @@ -1777,8 +2195,8 @@ export const processRegexMatches = ( } if ( meta.sourceDetail !== "custom-regex" && - meta.label === "phone number" && - match.text.length < MIN_PHONE_LENGTH + meta.minByteLength !== undefined && + utf8ByteLength(match.text) < meta.minByteLength ) { continue; } @@ -1824,6 +2242,8 @@ type SigningClauseConfig = { prefix: string; suffix: string; prepositions: string[]; + guardPrefixPhrases?: string[]; + guardSuffixPhrases?: string[]; }>; }; @@ -1872,6 +2292,7 @@ export const SIGNING_CLAUSE_META: Readonly = { }; let signingPatternPromise: Promise | null = null; +let nativeSigningPatternPromise: Promise | null = null; const loadSigningPatterns = async (): Promise => { const mod = await import("../data/signing-clauses.json"); @@ -1879,6 +2300,12 @@ const loadSigningPatterns = async (): Promise => { return buildSigningClausePatterns(data); }; +const loadNativeSigningPatterns = async (): Promise => { + const mod = await import("../data/signing-clauses.json"); + const data: SigningClauseConfig = mod.default ?? 
mod; + return buildSigningClausePatterns(data); +}; + export const getSigningClausePatterns = (): Promise => { if (!signingPatternPromise) { signingPatternPromise = loadSigningPatterns().catch((err) => { @@ -1888,3 +2315,13 @@ export const getSigningClausePatterns = (): Promise => { } return signingPatternPromise; }; + +export const getNativeSigningClausePatterns = (): Promise => { + if (!nativeSigningPatternPromise) { + nativeSigningPatternPromise = loadNativeSigningPatterns().catch((err) => { + nativeSigningPatternPromise = null; + throw err; + }); + } + return nativeSigningPatternPromise; +}; diff --git a/packages/anonymize/src/detectors/triggers.ts b/packages/anonymize/src/detectors/triggers.ts index a81d9ac1..3510328c 100644 --- a/packages/anonymize/src/detectors/triggers.ts +++ b/packages/anonymize/src/detectors/triggers.ts @@ -168,6 +168,7 @@ const compileValidations = ( } return { type: "valid-id", + validator: v.validator, check: (value) => { // stdnum validators expect compact digits only; // strip formatting (spaces, dots, dashes, @@ -637,14 +638,13 @@ const loadAddressStopKeywords = async (): Promise => { return addressStopKeywordsPromise; }; -const getAddressStopKeywordsSync = (): readonly string[] => +export const getAddressStopKeywordsSync = (): readonly string[] => addressStopKeywordsCache ?? ADDRESS_STOP_KEYWORDS_SEED; /** - * Warm the address-stop-keywords cache. Pipeline callers + * Warm address support data. Pipeline callers * await this before invoking trigger detection so the - * synchronous `extractValue` path uses the merged list - * instead of the seed fallback. + * synchronous `extractValue` path uses merged data. */ export const warmAddressStopKeywords = async (): Promise => { await loadAddressStopKeywords(); @@ -1128,7 +1128,10 @@ const extractValue = ( const idRaw = trailingLetterMatch ? 
idMatch[0] + trailingLetterMatch[0] : idMatch[0]; - const idText = idRaw.trim(); + const idText = idRaw.trim().replace(/[.,;:!?]+$/u, ""); + if (idText.length === 0) { + return null; + } const leadingSpaces = idMatch[0].length - idMatch[0].trimStart().length; const idStart = triggerEnd + diff --git a/packages/anonymize/src/filters/confidence-boost.ts b/packages/anonymize/src/filters/confidence-boost.ts index 9e383e2e..1cda53fe 100644 --- a/packages/anonymize/src/filters/confidence-boost.ts +++ b/packages/anonymize/src/filters/confidence-boost.ts @@ -1,82 +1,9 @@ +import addressContextJson from "../data/address-context.json"; +import addressPrepositionsJson from "../data/address-prepositions.json"; import addressStreetTypesJson from "../data/address-street-types.json"; import type { Entity } from "../types"; import { isCallerOwnedEntity } from "../util/entity-source"; -// Capitalised words that look like the start of an -// `[Uppercase] [number]` address (Czech: "Vinohradská 12") -// but in contract prose introduce a section, clause, or -// document reference instead ("Section 6", "Article 9"). -// Listed here so the `bareHouseRe` near-address scan does -// not promote them to address spans. Module-level to avoid -// allocation in a hot loop. 
-const BARE_STOPWORDS = new Set([ - // ── Czech ──────────────────────────────────────── - "Příloha", - "Smlouva", - "Článek", - "Dodatek", - "Celkem", - "Strana", - "Faktura", - "Částka", - "Položka", - "Kapitola", - "Zákon", - "Vyhláška", - "Nařízení", - "Usnesení", - "Rozsudek", - "Bod", - "Odstavec", - "Záloha", - "Zbývá", - "Dne", - "Platba", - "Datum", - "Splatnost", - "Variabilní", - "Konstantní", - "Specifický", - // ── English ────────────────────────────────────── - "Section", - "Sections", - "Article", - "Articles", - "Schedule", - "Schedules", - "Exhibit", - "Exhibits", - "Annex", - "Annexes", - "Appendix", - "Appendices", - "Clause", - "Clauses", - "Chapter", - "Chapters", - "Paragraph", - "Paragraphs", - "Subsection", - "Subsections", - "Form", - "Page", - "Pages", - "Item", - "Items", - "Note", - "Notes", - "Rule", - "Rules", - "Attachment", - "Attachments", - "Volume", - "Volumes", - "Book", - "Books", - "Part", - "Parts", -]); - const NEAR_MISS_BAND = 0.15; const BOOST_PER_NEIGHBOUR = 0.05; const CONTEXT_WINDOW_CHARS = 150; @@ -155,29 +82,61 @@ type PrepositionData = { temporal: Record; }; +type AddressContextJson = { + bareHouseStopwords: Record; +}; + +export type AddressContextData = { + address_prepositions: string[]; + temporal_prepositions: string[]; + street_abbreviations: string[]; + bare_house_stopwords: string[]; +}; + +const languageRecordValues = ( + record: Record, + transform: (value: string) => string = (value) => value, +): string[] => { + const values: string[] = []; + for (const [language, words] of Object.entries(record)) { + if (language.startsWith("_") || !Array.isArray(words)) { + continue; + } + for (const word of words) { + values.push(transform(word)); + } + } + return values; +}; + +const buildPrepositionSets = ( + data: PrepositionData, +): { + address: ReadonlySet; + temporal: ReadonlySet; +} => ({ + address: new Set( + languageRecordValues(data.address, (word) => word.toLowerCase()), + ), + temporal: new Set( + 
languageRecordValues(data.temporal, (word) => word.toLowerCase()), + ), +}); + +const buildBareStopwords = (data: AddressContextJson): ReadonlySet => + new Set(languageRecordValues(data.bareHouseStopwords)); + +const BARE_STOPWORDS = buildBareStopwords(addressContextJson); + let _addressPreps: ReadonlySet | null = null; let _temporalPreps: ReadonlySet | null = null; let _prepsPromise: Promise | null = null; const loadPrepositions = async (): Promise => { try { - const mod = await import("../data/address-prepositions.json"); - const data: PrepositionData = mod.default ?? mod; - // Merge all languages into flat sets - const addr = new Set(); - const temp = new Set(); - for (const words of Object.values(data.address)) { - if (Array.isArray(words)) { - for (const w of words) addr.add(w.toLowerCase()); - } - } - for (const words of Object.values(data.temporal)) { - if (Array.isArray(words)) { - for (const w of words) temp.add(w.toLowerCase()); - } - } - _addressPreps = addr; - _temporalPreps = temp; + const prepositions = buildPrepositionSets(addressPrepositionsJson); + _addressPreps = prepositions.address; + _temporalPreps = prepositions.temporal; } catch { _addressPreps = new Set(); _temporalPreps = new Set(); @@ -240,6 +199,16 @@ export const initStreetAbbrevs = (): Promise => { export const getStreetAbbrevs = (): ReadonlySet => _streetAbbrevs ?? new Set(); +export const getAddressContextData = (): AddressContextData => { + const prepositions = buildPrepositionSets(addressPrepositionsJson); + return { + address_prepositions: [...prepositions.address], + temporal_prepositions: [...prepositions.temporal], + street_abbreviations: [...buildStreetAbbrevs(addressStreetTypesJson)], + bare_house_stopwords: [...buildBareStopwords(addressContextJson)], + }; +}; + /** * Scan backwards from known address entities and * house number patterns to find street names. 
diff --git a/packages/anonymize/src/filters/false-positives.ts b/packages/anonymize/src/filters/false-positives.ts index c9966061..f73a7879 100644 --- a/packages/anonymize/src/filters/false-positives.ts +++ b/packages/anonymize/src/filters/false-positives.ts @@ -2,7 +2,10 @@ import type { Entity } from "../types"; import type { PipelineContext } from "../context"; import { defaultContext } from "../context"; import { isCallerOwnedEntity } from "../util/entity-source"; -import { getPersonStopwords } from "../detectors/deny-list"; +import { + getDefinedTermHeads, + getPersonStopwords, +} from "../detectors/deny-list"; import { normalizeHomoglyphs } from "../util/homoglyphs"; const TEMPLATE_PLACEHOLDER_RE = /^(?:\.{3,}|_{3,}|\[[\w\s]+\]|\{[\w\s]+\})$/; @@ -182,11 +185,6 @@ const STANDALONE_YEAR_RE = /^(?:19|20)\d{2}$/; // by one of these, it's a reference number, not PII. const NUMBER_ABBREV_RE = /(?:^|[\s(])(?:č|čís|nr|no|n)\.\s*$/i; const SIGNING_CLAUSE_ADDRESS_RE = /^(?:v|ve)\s+[^\d,\n]{1,40},?\s+dne$/iu; -const PERSON_TRAILING_NOUNS: ReadonlySet = new Set([ - "association", - "period", - "reform", -]); const LEGAL_FORM_HEADING_RE = /\b(?:agreement|amendment|contract|exhibit)\b/iu; const LEADING_ARTIFACT_RE = /^(?:\.\s)+/u; const ADDRESS_ROLE_PREFIX_RE = @@ -506,6 +504,7 @@ export const filterFalsePositives = ( ): Entity[] => { const filtered: Entity[] = []; const roles = getGenericRoles(ctx); + const definedTermHeads = getDefinedTermHeads(ctx); for (const entity of entities) { if (isCallerOwnedEntity(entity)) { @@ -663,11 +662,7 @@ export const filterFalsePositives = ( const lastFolded = last ? 
normalizeHomoglyphs(last).toLowerCase() : undefined; - if ( - tokens.length > 1 && - lastFolded && - PERSON_TRAILING_NOUNS.has(lastFolded) - ) { + if (tokens.length > 1 && lastFolded && definedTermHeads.has(lastFolded)) { continue; } } diff --git a/packages/anonymize/src/filters/hotword-rules.ts b/packages/anonymize/src/filters/hotword-rules.ts index e4640579..b80e53c9 100644 --- a/packages/anonymize/src/filters/hotword-rules.ts +++ b/packages/anonymize/src/filters/hotword-rules.ts @@ -20,7 +20,7 @@ type HotwordRulesConfig = { // ── Lazy-loaded state ─────────────────────────────── -let rules: HotwordRule[] | null = null; +let rules: readonly HotwordRule[] | null = null; let search: { findIter: (text: string) => Match[] } | null = null; /** * Maps each TextSearch pattern index back to the @@ -28,14 +28,27 @@ let search: { findIter: (text: string) => Match[] } | null = null; * resolves all hotword hits to their rule. */ let patternToRule: number[] | null = null; +let ruleSetPromise: Promise | null = null; let initPromise: Promise | null = null; // ── Init ──────────────────────────────────────────── +export const loadHotwordRuleSet = (): Promise => { + if (ruleSetPromise !== null) return ruleSetPromise; + ruleSetPromise = import("../data/hotword-rules.json") + .then((mod) => { + const data: HotwordRulesConfig = mod.default ?? mod; + return data.rules; + }) + .catch((err) => { + ruleSetPromise = null; + throw err; + }); + return ruleSetPromise; +}; + const loadRules = async (): Promise => { - const mod = await import("../data/hotword-rules.json"); - const data: HotwordRulesConfig = mod.default ?? mod; - const loaded = data.rules; + const loaded = await loadHotwordRuleSet(); // Build a flat pattern list and the reverse map. 
const patterns: PatternEntry[] = []; @@ -102,11 +115,20 @@ export const expandLabelsForHotwordRules = ( if (rules === null || requestedLabels.length === 0) { return requestedLabels; } + return expandLabelsForHotwordRuleSet(requestedLabels, rules); +}; +export const expandLabelsForHotwordRuleSet = ( + requestedLabels: readonly string[], + ruleSet: readonly HotwordRule[], +): readonly string[] => { + if (requestedLabels.length === 0) { + return requestedLabels; + } const requested = new Set(requestedLabels); const expanded = new Set(requestedLabels); - for (const rule of rules) { + for (const rule of ruleSet) { if (rule.reclassifyTo === undefined || !requested.has(rule.reclassifyTo)) { continue; } diff --git a/packages/anonymize/src/index-shared.ts b/packages/anonymize/src/index-shared.ts index 511355fb..287d084f 100644 --- a/packages/anonymize/src/index-shared.ts +++ b/packages/anonymize/src/index-shared.ts @@ -49,6 +49,63 @@ export type { PipelineSearchOptions, } from "./pipeline"; +// ── Native Adapter ─────────────────────────────── +export { + PreparedSearch, + PreparedNativeAnonymizer, + assertNativeBindingVersion, + createNativeAnonymizerFromConfig, + createNativeAnonymizerFromPackage, + encodeNativeSearchConfig, + encodeNativeSearchConfigInput, + getNativeBindingVersion, + diagnostics_json, + load_prepared_package, + native_package_version, + normalize_for_search, + prepareNativeSearchPackage, + prepare_search_package, + redact_text, + redact_text_json, +} from "./native"; +export type { + NativeAnonymizeBinding, + NativeAnonymizerFromConfigOptions, + NativeAnonymizerFromPackageOptions, + NativeBindingVersionOptions, + NativeNormalizeOptions, + NativeOperatorConfig, + NativePipelineEntity, + NativePipelineFromPackageOptions, + NativePreparedSearchBinding, + NativeRedactionResult, + NativeSearchPackageInput, + NativeSearchPackageOptions, + NativeStaticRedactionResult, + PreparedSearch as PreparedSearchInstance, + SharedNativePreparedPackageOptions, + 
SharedNativeDiagnosticsJsonOptions, + SharedNativeRedactTextOptions, + SharedNativeRedactTextJsonOptions, + SharedNativeSearchPackageOptions, +} from "./native"; +export { DEFAULT_NATIVE_PIPELINE_CONFIG } from "./native-default-config"; +export { + PreparedNativePipeline, + assertNativePipelineSupported, + createNativePipelineFromConfig, + createNativePipelineFromPackage, + getNativePipelineCompatibility, + prepareNativePipelineConfig, + prepareNativePipelinePackage, +} from "./native-pipeline"; +export type { + NativePipelineBuildOptions, + NativePipelineCompatibility, + NativePipelinePackageOptions, + NativePipelineUnsupportedFeature, +} from "./native-pipeline"; + // ── Redaction ───────────────────────────────────── export { redactText, diff --git a/packages/anonymize/src/language-scope.ts b/packages/anonymize/src/language-scope.ts new file mode 100644 index 00000000..7283345d --- /dev/null +++ b/packages/anonymize/src/language-scope.ts @@ -0,0 +1,86 @@ +import languageScopes from "./data/language-scopes.json"; + +import type { PipelineConfig } from "./types"; + +type LanguageScope = { + nameCorpusLanguages?: readonly string[]; + denyListCountries?: readonly string[]; +}; + +type LanguageScopeData = { + languages: Record; +}; + +const scopeData = languageScopes as LanguageScopeData; + +const normalizeLanguage = (language: string): string => + language.trim().toLowerCase(); + +const fallbackLanguage = (language: string): string | null => { + const index = language.indexOf("-"); + return index === -1 ? 
null : language.slice(0, index); +}; + +const uniquePush = (target: string[], values: readonly string[]): void => { + const seen = new Set(target); + for (const value of values) { + if (seen.has(value)) { + continue; + } + seen.add(value); + target.push(value); + } +}; + +const resolveLanguageScope = (language: string): LanguageScope | null => { + const normalized = normalizeLanguage(language); + if (normalized.length === 0) { + return null; + } + const exact = scopeData.languages[normalized]; + if (exact !== undefined) { + return exact; + } + const fallback = fallbackLanguage(normalized); + return fallback === null ? null : (scopeData.languages[fallback] ?? null); +}; + +const configuredLanguages = (config: PipelineConfig): readonly string[] => { + if (config.languages !== undefined) { + return config.languages; + } + return config.language === undefined ? [] : [config.language]; +}; + +export const applyPipelineLanguageScope = ( + config: PipelineConfig, +): PipelineConfig => { + const languages = configuredLanguages(config); + if (languages.length === 0) { + return config; + } + + const nameCorpusLanguages: string[] = []; + const denyListCountries: string[] = []; + for (const language of languages) { + const scope = resolveLanguageScope(language); + if (scope === null) { + continue; + } + uniquePush(nameCorpusLanguages, scope.nameCorpusLanguages ?? []); + uniquePush(denyListCountries, scope.denyListCountries ?? []); + } + + const next: Partial = {}; + if ( + config.nameCorpusLanguages === undefined && + nameCorpusLanguages.length > 0 + ) { + next.nameCorpusLanguages = nameCorpusLanguages; + } + if (config.denyListCountries === undefined && denyListCountries.length > 0) { + next.denyListCountries = denyListCountries; + } + + return Object.keys(next).length === 0 ? 
config : { ...config, ...next }; +}; diff --git a/packages/anonymize/src/native-default-config.ts b/packages/anonymize/src/native-default-config.ts new file mode 100644 index 00000000..54118ca3 --- /dev/null +++ b/packages/anonymize/src/native-default-config.ts @@ -0,0 +1,20 @@ +import { DEFAULT_ENTITY_LABELS } from "./constants"; +import type { PipelineConfig } from "./types"; + +export const DEFAULT_NATIVE_PIPELINE_CONFIG: PipelineConfig = { + threshold: 0.3, + enableTriggerPhrases: true, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: true, + enableDenyList: true, + enableGazetteer: false, + enableCountries: true, + enableNer: false, + enableConfidenceBoost: true, + enableCoreference: true, + enableHotwordRules: true, + enableZoneClassification: true, + labels: [...DEFAULT_ENTITY_LABELS], + workspaceId: "native-pipeline-default", +}; diff --git a/packages/anonymize/src/native-node.ts b/packages/anonymize/src/native-node.ts new file mode 100644 index 00000000..af98e2e6 --- /dev/null +++ b/packages/anonymize/src/native-node.ts @@ -0,0 +1,558 @@ +import { createRequire } from "node:module"; +import { readFileSync } from "node:fs"; +import { readFile } from "node:fs/promises"; +import process from "node:process"; + +import { + assertNativeBindingVersion, + createNativePipelineFromPackage, + type NativeOperatorConfig, + type NativeAnonymizeBinding, + type NativeNormalizeOptions, + type NativeSearchPackageInput, + type PreparedNativePipeline, + type NativeStaticRedactionResult, + diagnostics_json as diagnosticsJsonWithBinding, + load_prepared_package as loadPreparedPackageWithBinding, + native_package_version as nativePackageVersionWithBinding, + normalize_for_search as normalizeForSearchWithBinding, + prepare_search_package as prepareSearchPackageWithBinding, + redact_text as redactTextWithBinding, + redact_text_json as redactTextJsonWithBinding, +} from "./native"; + +export * from "./native"; + +export type NativeRequire = (specifier: string) => 
unknown; + +export type NativeLibc = "gnu" | "musl"; + +export type LoadNativeBindingOptions = { + expectedVersion?: string; + platform?: string; + arch?: string; + libc?: NativeLibc; + env?: Record; + requireModule?: NativeRequire; +}; + +export type NativePipelinePackageFileOptions = LoadNativeBindingOptions & { + binding?: NativeAnonymizeBinding; + packagePath: string; +}; + +export type NativeSdkOptions = LoadNativeBindingOptions & { + binding?: NativeAnonymizeBinding; +}; + +export type NativeSdkPackageOptions = NativeSdkOptions & { + compressed?: boolean; +}; + +export type DefaultNativePipelinePackageOptions = LoadNativeBindingOptions & { + binding?: NativeAnonymizeBinding; + language?: string; + packagePath?: string; +}; + +type ResolvedDefaultNativePipelineOptions = { + binding: NativeAnonymizeBinding; + language?: string; + packagePath?: string; +}; + +export type DefaultNativePipelinePackageFileOptions = { + language?: string; +}; + +const LOCAL_NATIVE_LOADER = "../index.cjs"; +const PACKAGE_SPECIFIC_NATIVE_PATH = "STELLA_ANONYMIZE_NATIVE_LIBRARY_PATH"; +const DEFAULT_NATIVE_PIPELINE_PACKAGE_URL = new URL( + "../native-pipeline.stlanonpkg", + import.meta.url, +); +const DEFAULT_NATIVE_PIPELINE_LANGUAGE_PATTERN = /^[a-z0-9]+(?:-[a-z0-9]+)*$/u; +const DEFAULT_NATIVE_PIPELINE_PACKAGE_CACHE_KEY = ""; +const defaultNativePipelineCache = new WeakMap< + NativeAnonymizeBinding, + Map +>(); +const defaultNativePipelineInflightCache = new WeakMap< + NativeAnonymizeBinding, + Map> +>(); + +export { DEFAULT_NATIVE_PIPELINE_CONFIG } from "./native-default-config"; + +export const loadNativeAnonymizeBinding = ( + options: LoadNativeBindingOptions = {}, +): NativeAnonymizeBinding => { + const requireModule = options.requireModule ?? createRequire(import.meta.url); + const platform = options.platform ?? process.platform; + const arch = options.arch ?? process.arch; + const env = options.env ?? 
process.env; + const specifiers = nativeBindingSpecifiers({ env }); + const errors: string[] = []; + + for (const specifier of specifiers) { + const binding = tryLoadNativeBinding({ + specifier, + requireModule, + errors, + }); + if (!binding) { + continue; + } + if (options.expectedVersion !== undefined) { + assertNativeBindingVersion({ + binding, + expectedVersion: options.expectedVersion, + }); + } + return binding; + } + + throw new Error( + `Unable to load native anonymize binding for ${platform}/${arch}:\n${errors.join("\n")}`, + ); +}; + +export const readNativePipelinePackageFile = ( + packagePath: string, +): Uint8Array => new Uint8Array(readFileSync(packagePath)); + +export const readNativePipelinePackageFileAsync = async ( + packagePath: string, +): Promise => new Uint8Array(await readFile(packagePath)); + +export const native_package_version = ( + options: NativeSdkOptions = {}, +): string => nativePackageVersionWithBinding(resolveNativeSdkBinding(options)); + +export const normalize_for_search = ( + text: string, + options: NativeSdkOptions = {}, +): string => { + const args: NativeNormalizeOptions = { + binding: resolveNativeSdkBinding(options), + text, + }; + return normalizeForSearchWithBinding(args); +}; + +export const prepare_search_package = ( + config: NativeSearchPackageInput, + { compressed = true, ...options }: NativeSdkPackageOptions = {}, +): Uint8Array => + prepareSearchPackageWithBinding({ + binding: resolveNativeSdkBinding(options), + config, + compressed, + }); + +export const load_prepared_package = ( + packageBytes: Uint8Array, + options: NativeSdkOptions = {}, +) => + loadPreparedPackageWithBinding({ + binding: resolveNativeSdkBinding(options), + packageBytes, + }); + +export const load_prepared_package_file = ( + packagePath: string, + options: NativeSdkOptions = {}, +) => load_prepared_package(readNativePipelinePackageFile(packagePath), options); + +export const redact_text = ( + config: NativeSearchPackageInput, + fullText: 
string, + operators?: NativeOperatorConfig, + options: NativeSdkOptions = {}, +): NativeStaticRedactionResult => + redactTextWithBinding({ + binding: resolveNativeSdkBinding(options), + config, + fullText, + ...(operators !== undefined ? { operators } : {}), + }); + +export const redact_text_json = ( + config: NativeSearchPackageInput, + fullText: string, + operators?: NativeOperatorConfig, + options: NativeSdkOptions = {}, +): string => + redactTextJsonWithBinding({ + binding: resolveNativeSdkBinding(options), + config, + fullText, + ...(operators !== undefined ? { operators } : {}), + }); + +export const diagnostics_json = ( + config: NativeSearchPackageInput, + fullText: string, + operators?: NativeOperatorConfig, + options: NativeSdkOptions = {}, +): string | null => + diagnosticsJsonWithBinding({ + binding: resolveNativeSdkBinding(options), + config, + fullText, + ...(operators !== undefined ? { operators } : {}), + }); + +export const readDefaultNativePipelinePackageFile = ({ + language, +}: DefaultNativePipelinePackageFileOptions = {}): Uint8Array => { + const packageUrl = defaultNativePipelinePackageUrl(language); + try { + return new Uint8Array(readFileSync(packageUrl)); + } catch (error) { + throw new Error( + `${defaultNativePipelinePackageDescription(language)} is unavailable: ${formatLoadError(error)}`, + ); + } +}; + +export const readDefaultNativePipelinePackageFileAsync = async ({ + language, +}: DefaultNativePipelinePackageFileOptions = {}): Promise => { + const packageUrl = defaultNativePipelinePackageUrl(language); + try { + return new Uint8Array(await readFile(packageUrl)); + } catch (error) { + throw new Error( + `${defaultNativePipelinePackageDescription(language)} is unavailable: ${formatLoadError(error)}`, + ); + } +}; + +export const createNativePipelineFromPackageFile = ({ + binding, + packagePath, + expectedVersion, + ...loadOptions +}: NativePipelinePackageFileOptions): PreparedNativePipeline => { + const resolvedBinding = + binding ?? 
+ loadNativeAnonymizeBinding({ + ...loadOptions, + ...(expectedVersion !== undefined ? { expectedVersion } : {}), + }); + if (binding && expectedVersion !== undefined) { + assertNativeBindingVersion({ binding, expectedVersion }); + } + return createNativePipelineFromPackage({ + binding: resolvedBinding, + packageBytes: readNativePipelinePackageFile(packagePath), + }); +}; + +export const createNativePipelineFromDefaultPackage = ( + options: DefaultNativePipelinePackageOptions = {}, +): PreparedNativePipeline => + createNativePipelineFromResolvedDefaultPackage( + resolveDefaultNativePipelineOptions(options), + ); + +export const getDefaultNativePipeline = ( + options: DefaultNativePipelinePackageOptions = {}, +): PreparedNativePipeline => { + const resolvedOptions = resolveDefaultNativePipelineOptions(options); + const cache = defaultPipelineCacheFor(resolvedOptions.binding); + const key = defaultPipelineCacheKey(resolvedOptions); + const cached = cache.get(key); + if (cached !== undefined) { + return cached; + } + const pipeline = + createNativePipelineFromResolvedDefaultPackage(resolvedOptions); + cache.set(key, pipeline); + return pipeline; +}; + +export const preloadDefaultNativePipeline = ( + options: DefaultNativePipelinePackageOptions = {}, +): PreparedNativePipeline => { + const pipeline = getDefaultNativePipeline(options); + pipeline.warmLazyRegex(); + return pipeline; +}; + +export const preloadDefaultNativePipelineAsync = ( + options: DefaultNativePipelinePackageOptions = {}, +): Promise => { + const resolvedOptions = resolveDefaultNativePipelineOptions(options); + const cache = defaultPipelineCacheFor(resolvedOptions.binding); + const key = defaultPipelineCacheKey(resolvedOptions); + const cached = cache.get(key); + if (cached !== undefined) { + return Promise.resolve(cached); + } + + const inflightCache = defaultPipelineInflightCacheFor( + resolvedOptions.binding, + ); + const inflight = inflightCache.get(key); + if (inflight !== undefined) { + return 
inflight; + } + + const promise = createNativePipelineFromResolvedDefaultPackageAsync( + resolvedOptions, + ) + .then((pipeline) => { + pipeline.warmLazyRegex(); + cache.set(key, pipeline); + return pipeline; + }) + .finally(() => { + inflightCache.delete(key); + }); + inflightCache.set(key, promise); + return promise; +}; + +const resolveDefaultNativePipelineOptions = ({ + binding, + language, + packagePath, + expectedVersion, + ...loadOptions +}: DefaultNativePipelinePackageOptions = {}): ResolvedDefaultNativePipelineOptions => { + if (language !== undefined && packagePath !== undefined) { + throw new Error("Use either language or packagePath, not both"); + } + const resolvedBinding = + binding ?? + loadNativeAnonymizeBinding({ + ...loadOptions, + ...(expectedVersion !== undefined ? { expectedVersion } : {}), + }); + if (binding && expectedVersion !== undefined) { + assertNativeBindingVersion({ binding, expectedVersion }); + } + return { + binding: resolvedBinding, + ...(language !== undefined + ? { language: normalizeDefaultNativePipelineLanguage(language) } + : {}), + ...(packagePath !== undefined ? { packagePath } : {}), + }; +}; + +const createNativePipelineFromResolvedDefaultPackage = ({ + binding, + language, + packagePath, +}: ResolvedDefaultNativePipelineOptions): PreparedNativePipeline => { + const packageBytes = + packagePath === undefined + ? readDefaultNativePipelinePackageFile( + defaultPackageFileOptions(language), + ) + : readNativePipelinePackageFile(packagePath); + return createNativePipelineFromPackage({ + binding, + packageBytes, + }); +}; + +const createNativePipelineFromResolvedDefaultPackageAsync = async ({ + binding, + language, + packagePath, +}: ResolvedDefaultNativePipelineOptions): Promise => { + const packageBytes = + packagePath === undefined + ? 
await readDefaultNativePipelinePackageFileAsync( + defaultPackageFileOptions(language), + ) + : await readNativePipelinePackageFileAsync(packagePath); + return createNativePipelineFromPackage({ + binding, + packageBytes, + }); +}; + +const defaultPackageFileOptions = ( + language: string | undefined, +): DefaultNativePipelinePackageFileOptions => + language === undefined ? {} : { language }; + +const resolveNativeSdkBinding = ({ + binding, + expectedVersion, + ...loadOptions +}: NativeSdkOptions): NativeAnonymizeBinding => { + const resolvedBinding = + binding ?? + loadNativeAnonymizeBinding({ + ...loadOptions, + ...(expectedVersion !== undefined ? { expectedVersion } : {}), + }); + if (binding && expectedVersion !== undefined) { + assertNativeBindingVersion({ binding, expectedVersion }); + } + return resolvedBinding; +}; + +const defaultPipelineCacheFor = ( + binding: NativeAnonymizeBinding, +): Map => { + const cached = defaultNativePipelineCache.get(binding); + if (cached !== undefined) { + return cached; + } + const created = new Map(); + defaultNativePipelineCache.set(binding, created); + return created; +}; + +const defaultPipelineInflightCacheFor = ( + binding: NativeAnonymizeBinding, +): Map> => { + const cached = defaultNativePipelineInflightCache.get(binding); + if (cached !== undefined) { + return cached; + } + const created = new Map>(); + defaultNativePipelineInflightCache.set(binding, created); + return created; +}; + +const defaultPipelineCacheKey = ({ + binding, + language, + packagePath, +}: ResolvedDefaultNativePipelineOptions): string => + [ + binding.nativePackageVersion(), + packagePath ?? + (language === undefined + ? 
DEFAULT_NATIVE_PIPELINE_PACKAGE_CACHE_KEY + : `language:${language}`), + ].join("\0"); + +const defaultNativePipelinePackageUrl = (language: string | undefined): URL => { + if (language === undefined) { + return DEFAULT_NATIVE_PIPELINE_PACKAGE_URL; + } + const normalized = normalizeDefaultNativePipelineLanguage(language); + return new URL( + `../native-pipeline.${normalized}.stlanonpkg`, + import.meta.url, + ); +}; + +const defaultNativePipelinePackageDescription = ( + language: string | undefined, +): string => + language === undefined + ? "Default native pipeline package" + : `Default native pipeline package for language "${normalizeDefaultNativePipelineLanguage(language)}"`; + +const normalizeDefaultNativePipelineLanguage = (language: string): string => { + const normalized = language.trim().toLowerCase(); + if (!DEFAULT_NATIVE_PIPELINE_LANGUAGE_PATTERN.test(normalized)) { + throw new Error( + `Default native pipeline language must match ${DEFAULT_NATIVE_PIPELINE_LANGUAGE_PATTERN.source}`, + ); + } + return normalized; +}; + +type NativeBindingSpecifiersOptions = { + env: Record; +}; + +const nativeBindingSpecifiers = ({ + env, +}: NativeBindingSpecifiersOptions): string[] => { + const specifiers: string[] = []; + const overridePath = env[PACKAGE_SPECIFIC_NATIVE_PATH]; + if (overridePath) { + specifiers.push(overridePath); + } + specifiers.push(LOCAL_NATIVE_LOADER); + return specifiers; +}; + +type TryLoadNativeBindingOptions = { + specifier: string; + requireModule: NativeRequire; + errors: string[]; +}; + +const tryLoadNativeBinding = ({ + specifier, + requireModule, + errors, +}: TryLoadNativeBindingOptions): NativeAnonymizeBinding | null => { + try { + const loaded = requireModule(specifier); + const binding = toNativeAnonymizeBinding(loaded); + if (binding) { + return binding; + } + errors.push(`${specifier}: module does not match native binding shape`); + } catch (error) { + errors.push(`${specifier}: ${formatLoadError(error)}`); + } + return null; +}; + 
+const toNativeAnonymizeBinding = ( + value: unknown, +): NativeAnonymizeBinding | null => { + const candidate = + isPropertyBag(value) && isPropertyBag(value["default"]) + ? value["default"] + : value; + return isNativeAnonymizeBinding(candidate) ? candidate : null; +}; + +const isNativeAnonymizeBinding = ( + candidate: unknown, +): candidate is NativeAnonymizeBinding => { + if (!isPropertyBag(candidate)) { + return false; + } + if (typeof candidate["nativePackageVersion"] !== "function") { + return false; + } + if (typeof candidate["normalizeForSearch"] !== "function") { + return false; + } + if (typeof candidate["prepareStaticSearchPackageBytes"] !== "function") { + return false; + } + if ( + typeof candidate["prepareStaticSearchCompressedPackageBytes"] !== "function" + ) { + return false; + } + const preparedSearch = candidate["NativePreparedSearch"]; + if (!isPropertyBag(preparedSearch)) { + return false; + } + if (typeof preparedSearch["fromConfigJsonBytes"] !== "function") { + return false; + } + if (typeof preparedSearch["fromPreparedPackageBytes"] !== "function") { + return false; + } + return true; +}; + +const isPropertyBag = (value: unknown): value is Record => + (typeof value === "object" && value !== null) || typeof value === "function"; + +const formatLoadError = (error: unknown): string => { + if (error instanceof Error) { + return error.message; + } + return String(error); +}; diff --git a/packages/anonymize/src/native-pipeline.ts b/packages/anonymize/src/native-pipeline.ts new file mode 100644 index 00000000..aea10c46 --- /dev/null +++ b/packages/anonymize/src/native-pipeline.ts @@ -0,0 +1,269 @@ +import { + buildNativeStaticSearchBundle, + type NativePreparedSearchConfig, +} from "./build-unified-search"; +import type { PipelineContext } from "./context"; +import { defaultContext } from "./context"; +import { applyPipelineLanguageScope } from "./language-scope"; +import { pipelineConfigKey } from "./pipeline-cache-key"; +import type { 
Dictionaries, GazetteerEntry, PipelineConfig } from "./types"; +import { + createNativePipelineFromPackage, + prepareNativeSearchPackage, + PreparedNativePipeline, + type NativeAnonymizeBinding, +} from "./native"; + +export { + PreparedNativePipeline, + createNativePipelineFromPackage, +} from "./native"; + +export type NativePipelineUnsupportedFeature = "enableNer" | "enableNameCorpus"; + +export type NativePipelineCompatibility = + | { status: "supported" } + | { + status: "unsupported"; + unsupportedFeatures: NativePipelineUnsupportedFeature[]; + }; + +export type NativePipelineBuildOptions = { + binding: NativeAnonymizeBinding; + config: PipelineConfig; + gazetteerEntries?: GazetteerEntry[]; + context?: PipelineContext; +}; + +export type NativePipelinePackageOptions = NativePipelineBuildOptions & { + compressed?: boolean; +}; + +export type { NativePipelineFromPackageOptions } from "./native"; + +type NativePipelinePackageCacheValue = Promise | Uint8Array; + +const sharedPackageByDictionaries = new WeakMap< + Dictionaries, + Map +>(); +const sharedPackageWithoutDictionaries = new Map< + string, + NativePipelinePackageCacheValue +>(); +const dictionaryCacheIds = new WeakMap(); +let nextDictionaryCacheId = 0; + +const dictionaryCacheKey = (dictionaries: Dictionaries | undefined): string => { + if (dictionaries === undefined) { + return "none"; + } + const existing = dictionaryCacheIds.get(dictionaries); + if (existing !== undefined) { + return `dict:${existing}`; + } + nextDictionaryCacheId += 1; + dictionaryCacheIds.set(dictionaries, nextDictionaryCacheId); + return `dict:${nextDictionaryCacheId}`; +}; + +const sharedPackageCacheFor = ( + dictionaries: Dictionaries | undefined, +): Map => { + if (dictionaries === undefined) { + return sharedPackageWithoutDictionaries; + } + const cached = sharedPackageByDictionaries.get(dictionaries); + if (cached !== undefined) { + return cached; + } + const created = new Map(); + sharedPackageByDictionaries.set(dictionaries, 
created); + return created; +}; + +export const getNativePipelineCompatibility = ( + config: PipelineConfig, +): NativePipelineCompatibility => { + const unsupportedFeatures: NativePipelineUnsupportedFeature[] = []; + + if (config.enableNer) unsupportedFeatures.push("enableNer"); + if (config.enableNameCorpus && !config.enableDenyList) { + unsupportedFeatures.push("enableNameCorpus"); + } + if (unsupportedFeatures.length === 0) { + return { status: "supported" }; + } + return { status: "unsupported", unsupportedFeatures }; +}; + +export const assertNativePipelineSupported = (config: PipelineConfig): void => { + const compatibility = getNativePipelineCompatibility(config); + if (compatibility.status === "supported") { + return; + } + throw new Error( + `Native pipeline does not yet support: ${compatibility.unsupportedFeatures.join(", ")}`, + ); +}; + +export const prepareNativePipelineConfig = async ({ + config, + gazetteerEntries = [], + context, +}: Omit< + NativePipelineBuildOptions, + "binding" +>): Promise => { + assertNativePipelineSupported(config); + const bundle = await buildNativeStaticSearchBundle( + config, + gazetteerEntries, + context ?? defaultContext, + ); + return bundle.nativeStaticConfig; +}; + +export const prepareNativePipelinePackage = async ({ + binding, + config, + gazetteerEntries = [], + context, + compressed = true, +}: NativePipelinePackageOptions): Promise => { + const packageBytes = await getCachedNativePipelinePackage({ + config, + binding, + gazetteerEntries, + ...(context ? { context } : {}), + compressed, + }); + return packageBytes.slice(); +}; + +export const createNativePipelineFromConfig = async ({ + binding, + config, + gazetteerEntries = [], + context, +}: NativePipelineBuildOptions): Promise => { + const packageBytes = await getCachedNativePipelinePackage({ + binding, + config, + gazetteerEntries, + ...(context ? 
{ context } : {}), + }); + return createNativePipelineFromPackage({ binding, packageBytes }); +}; + +const getCachedNativePipelinePackage = async ({ + binding, + config, + gazetteerEntries = [], + context, + compressed = true, +}: NativePipelinePackageOptions): Promise => { + const scopedConfig = applyPipelineLanguageScope(config); + assertNativePipelineSupported(scopedConfig); + const ctx = context ?? defaultContext; + const key = nativePackageCacheKey({ + binding, + config: scopedConfig, + gazetteerEntries, + compressed, + }); + if (ctx.nativePipelinePackage && ctx.nativePipelinePackageKey === key) { + return ctx.nativePipelinePackage; + } + if ( + ctx.nativePipelinePackagePromise && + ctx.nativePipelinePackageKey === key + ) { + return ctx.nativePipelinePackagePromise; + } + + const sharedCache = sharedPackageCacheFor(scopedConfig.dictionaries); + const shared = sharedCache.get(key); + if (shared !== undefined) { + const packageBytes = await shared; + ctx.nativePipelinePackage = packageBytes; + ctx.nativePipelinePackageKey = key; + ctx.nativePipelinePackagePromise = null; + return packageBytes; + } + + ctx.nativePipelinePackage = null; + ctx.nativePipelinePackageKey = key; + const promise = buildNativePipelinePackage({ + binding, + config: scopedConfig, + gazetteerEntries, + context: ctx, + compressed, + }); + ctx.nativePipelinePackagePromise = promise; + sharedCache.set(key, promise); + let packageBytes: Uint8Array; + try { + packageBytes = await promise; + } catch (error) { + if (sharedCache.get(key) === promise) { + sharedCache.delete(key); + } + if ( + ctx.nativePipelinePackageKey === key && + ctx.nativePipelinePackagePromise === promise + ) { + ctx.nativePipelinePackage = null; + ctx.nativePipelinePackagePromise = null; + } + throw error; + } + if (sharedCache.get(key) === promise) { + sharedCache.set(key, packageBytes); + } + if (ctx.nativePipelinePackageKey === key) { + ctx.nativePipelinePackage = packageBytes; + ctx.nativePipelinePackagePromise = null; + 
} + return packageBytes; +}; + +const buildNativePipelinePackage = async ({ + binding, + config, + gazetteerEntries, + context, + compressed, +}: Required): Promise => { + const bundle = await buildNativeStaticSearchBundle( + config, + gazetteerEntries, + context, + ); + return prepareNativeSearchPackage({ + binding, + config: bundle.nativeStaticConfig, + compressed, + }); +}; + +type NativePackageCacheKeyOptions = { + binding: NativeAnonymizeBinding; + config: PipelineConfig; + gazetteerEntries: readonly GazetteerEntry[]; + compressed: boolean; +}; + +const nativePackageCacheKey = ({ + binding, + config, + gazetteerEntries, + compressed, +}: NativePackageCacheKeyOptions): string => + [ + binding.nativePackageVersion(), + compressed ? "compressed" : "raw", + dictionaryCacheKey(config.dictionaries), + pipelineConfigKey(config, gazetteerEntries), + ].join(":"); diff --git a/packages/anonymize/src/native-sdk-contract.ts b/packages/anonymize/src/native-sdk-contract.ts new file mode 100644 index 00000000..1c2df34c --- /dev/null +++ b/packages/anonymize/src/native-sdk-contract.ts @@ -0,0 +1,27 @@ +export const SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS = [ + "prepare_search_package", + "load_prepared_package", + "native_package_version", + "normalize_for_search", + "redact_text", + "redact_text_json", + "diagnostics_json", +] as const; + +export const SHARED_NATIVE_SDK_TOP_LEVEL_FUNCTIONS = [ + ...SHARED_NATIVE_SDK_CORE_TOP_LEVEL_FUNCTIONS, + "load_prepared_package_file", +] as const; + +export const SHARED_NATIVE_SDK_PREPARED_METHODS = [ + "redact_text", + "redact_text_json", + "diagnostics_json", + "prepare_diagnostics_json", + "warm_lazy_regex", +] as const; + +export const SHARED_NATIVE_SDK_CLASS_NAMES = [ + "PreparedAnonymizer", + "PreparedSearch", +] as const; diff --git a/packages/anonymize/src/native.ts b/packages/anonymize/src/native.ts new file mode 100644 index 00000000..afbe37a3 --- /dev/null +++ b/packages/anonymize/src/native.ts @@ -0,0 +1,545 @@ +import 
type { NativePreparedSearchConfig } from "./build-unified-search"; +import type { OperatorType } from "./types"; + +type NativeBindingOperatorConfig = { + operators?: Record; + redactString?: string; +}; + +type NativeBindingRedactionEntry = { + placeholder: string; + original: string; +}; + +type NativeBindingOperatorEntry = { + placeholder: string; + operator: OperatorType; +}; + +type NativeBindingPipelineEntity = { + start: number; + end: number; + label: string; + text: string; + score: number; + source: string; + sourceDetail?: string | null; +}; + +type NativeBindingRedactionResult = { + redactedText: string; + redactionMap: NativeBindingRedactionEntry[]; + operatorMap: NativeBindingOperatorEntry[]; + entityCount: number; +}; + +type NativeBindingStaticRedactionResult = { + resolvedEntities: NativeBindingPipelineEntity[]; + redaction: NativeBindingRedactionResult; +}; + +type CanonicalPipelineEntity = { + start: number; + end: number; + label: string; + text: string; + score: number; + source: string; + source_detail?: string | null; +}; + +type CanonicalStaticRedactionResult = { + resolved_entities: CanonicalPipelineEntity[]; + redaction: { + redacted_text: string; + redaction_map: NativeBindingRedactionEntry[]; + operator_map: NativeBindingOperatorEntry[]; + entity_count: number; + }; +}; + +export type NativePreparedSearchBinding = { + prepareDiagnosticsJson?: () => string; + warmLazyRegex?: () => void; + warm_lazy_regex?: () => void; + redactStaticEntities: ( + fullText: string, + operators?: NativeBindingOperatorConfig, + ) => NativeBindingStaticRedactionResult; + redactStaticEntitiesJson?: ( + fullText: string, + operators?: NativeBindingOperatorConfig, + ) => string; + redactStaticEntitiesDiagnosticsJson?: ( + fullText: string, + operators?: NativeBindingOperatorConfig, + ) => string; +}; + +export type NativeAnonymizeBinding = { + normalizeForSearch: (text: string) => string; + nativePackageVersion: () => string; + NativePreparedSearch: { + 
fromConfigJsonBytes: ( + configJson: Uint8Array, + ) => NativePreparedSearchBinding; + fromPreparedPackageBytes: ( + packageBytes: Uint8Array, + ) => NativePreparedSearchBinding; + }; + prepareStaticSearchPackageBytes: (configJson: Uint8Array) => Uint8Array; + prepareStaticSearchCompressedPackageBytes: ( + configJson: Uint8Array, + ) => Uint8Array; +}; + +export type NativeOperatorConfig = { + operators?: Record; + redactString?: string; +}; + +export type NativePipelineEntity = { + start: number; + end: number; + label: string; + text: string; + score: number; + source: string; + sourceDetail?: string; +}; + +export type NativeRedactionResult = { + redactedText: string; + redactionMap: Map; + operatorMap: Map; + entityCount: number; +}; + +export type NativeStaticRedactionResult = { + resolvedEntities: NativePipelineEntity[]; + redaction: NativeRedactionResult; +}; + +export type NativeSearchPackageOptions = { + binding: NativeAnonymizeBinding; + config: NativePreparedSearchConfig; + compressed?: boolean; +}; + +export type NativeSearchPackageInput = + | NativePreparedSearchConfig + | string + | Uint8Array; + +export type SharedNativeSearchPackageOptions = { + binding: NativeAnonymizeBinding; + config: NativeSearchPackageInput; + compressed?: boolean; +}; + +export type SharedNativePreparedPackageOptions = { + binding: NativeAnonymizeBinding; + packageBytes: Uint8Array; +}; + +export type SharedNativeRedactTextJsonOptions = { + binding: NativeAnonymizeBinding; + config: NativeSearchPackageInput; + fullText: string; + operators?: NativeOperatorConfig; +}; + +export type SharedNativeRedactTextOptions = SharedNativeRedactTextJsonOptions; + +export type SharedNativeDiagnosticsJsonOptions = + SharedNativeRedactTextJsonOptions; + +export type NativeNormalizeOptions = { + binding: NativeAnonymizeBinding; + text: string; +}; + +export type NativeAnonymizerFromConfigOptions = { + binding: NativeAnonymizeBinding; + config: NativePreparedSearchConfig; +}; + +export type 
NativeAnonymizerFromPackageOptions = { + binding: NativeAnonymizeBinding; + packageBytes: Uint8Array; +}; + +export type NativePipelineFromPackageOptions = + NativeAnonymizerFromPackageOptions; + +export type NativeBindingVersionOptions = { + binding: NativeAnonymizeBinding; + expectedVersion: string; +}; + +export class PreparedNativeAnonymizer { + readonly #prepared: NativePreparedSearchBinding; + + constructor(prepared: NativePreparedSearchBinding) { + this.#prepared = prepared; + } + + prepareDiagnosticsJson(): string | null { + return this.#prepared.prepareDiagnosticsJson?.() ?? null; + } + + prepare_diagnostics_json(): string | null { + return this.prepareDiagnosticsJson(); + } + + warmLazyRegex(): void { + if (this.#prepared.warmLazyRegex) { + this.#prepared.warmLazyRegex(); + return; + } + this.#prepared.warm_lazy_regex?.(); + } + + warm_lazy_regex(): void { + this.warmLazyRegex(); + } + + redactStaticEntities( + fullText: string, + operators?: NativeOperatorConfig, + ): NativeStaticRedactionResult { + return toNativeStaticRedactionResult( + this.#prepared.redactStaticEntities( + fullText, + toBindingOperatorConfig(operators), + ), + ); + } + + redact_text( + fullText: string, + operators?: NativeOperatorConfig, + ): NativeStaticRedactionResult { + return this.redactStaticEntities(fullText, operators); + } + + redact_text_json(fullText: string, operators?: NativeOperatorConfig): string { + const bindingOperators = toBindingOperatorConfig(operators); + if (this.#prepared.redactStaticEntitiesJson) { + return this.#prepared.redactStaticEntitiesJson( + fullText, + bindingOperators, + ); + } + return JSON.stringify( + toBindingStaticRedactionResult( + toNativeStaticRedactionResult( + this.#prepared.redactStaticEntities(fullText, bindingOperators), + ), + ), + ); + } + + redactStaticEntitiesDiagnosticsJson( + fullText: string, + operators?: NativeOperatorConfig, + ): string | null { + if (!this.#prepared.redactStaticEntitiesDiagnosticsJson) { + return null; + } + 
return this.#prepared.redactStaticEntitiesDiagnosticsJson( + fullText, + toBindingOperatorConfig(operators), + ); + } + + diagnostics_json( + fullText: string, + operators?: NativeOperatorConfig, + ): string | null { + return this.redactStaticEntitiesDiagnosticsJson(fullText, operators); + } +} + +export class PreparedNativePipeline { + readonly #anonymizer: PreparedNativeAnonymizer; + + constructor(anonymizer: PreparedNativeAnonymizer) { + this.#anonymizer = anonymizer; + } + + prepareDiagnosticsJson(): string | null { + return this.#anonymizer.prepareDiagnosticsJson(); + } + + prepare_diagnostics_json(): string | null { + return this.prepareDiagnosticsJson(); + } + + warmLazyRegex(): void { + this.#anonymizer.warmLazyRegex(); + } + + warm_lazy_regex(): void { + this.warmLazyRegex(); + } + + redactText( + fullText: string, + operators?: NativeOperatorConfig, + ): NativeStaticRedactionResult { + return this.#anonymizer.redactStaticEntities(fullText, operators); + } + + redact_text( + fullText: string, + operators?: NativeOperatorConfig, + ): NativeStaticRedactionResult { + return this.redactText(fullText, operators); + } + + redact_text_json(fullText: string, operators?: NativeOperatorConfig): string { + return JSON.stringify( + toBindingStaticRedactionResult(this.redactText(fullText, operators)), + ); + } + + redactTextDiagnosticsJson( + fullText: string, + operators?: NativeOperatorConfig, + ): string | null { + return this.#anonymizer.redactStaticEntitiesDiagnosticsJson( + fullText, + operators, + ); + } + + diagnostics_json( + fullText: string, + operators?: NativeOperatorConfig, + ): string | null { + return this.redactTextDiagnosticsJson(fullText, operators); + } +} + +export const encodeNativeSearchConfig = ( + config: NativePreparedSearchConfig, +): Uint8Array => new TextEncoder().encode(JSON.stringify(config)); + +export const encodeNativeSearchConfigInput = ( + config: NativeSearchPackageInput, +): Uint8Array => { + if (typeof config === "string") { + 
return new TextEncoder().encode(config); + } + if (config instanceof Uint8Array) { + return config; + } + return encodeNativeSearchConfig(config); +}; + +export const getNativeBindingVersion = ( + binding: NativeAnonymizeBinding, +): string => binding.nativePackageVersion(); + +export const native_package_version = getNativeBindingVersion; + +export const normalize_for_search = ({ + binding, + text, +}: NativeNormalizeOptions): string => binding.normalizeForSearch(text); + +export const assertNativeBindingVersion = ({ + binding, + expectedVersion, +}: NativeBindingVersionOptions): void => { + const actualVersion = getNativeBindingVersion(binding); + if (actualVersion !== expectedVersion) { + throw new Error( + `Native anonymize binding version ${actualVersion} does not match ${expectedVersion}`, + ); + } +}; + +export const prepareNativeSearchPackage = ({ + binding, + config, + compressed = true, +}: NativeSearchPackageOptions): Uint8Array => { + const configBytes = encodeNativeSearchConfig(config); + return compressed + ? binding.prepareStaticSearchCompressedPackageBytes(configBytes) + : binding.prepareStaticSearchPackageBytes(configBytes); +}; + +export const prepare_search_package = ({ + binding, + config, + compressed = true, +}: SharedNativeSearchPackageOptions): Uint8Array => { + const configBytes = encodeNativeSearchConfigInput(config); + return compressed + ? 
binding.prepareStaticSearchCompressedPackageBytes(configBytes) + : binding.prepareStaticSearchPackageBytes(configBytes); +}; + +export const createNativeAnonymizerFromConfig = ({ + binding, + config, +}: NativeAnonymizerFromConfigOptions): PreparedNativeAnonymizer => + new PreparedNativeAnonymizer( + binding.NativePreparedSearch.fromConfigJsonBytes( + encodeNativeSearchConfig(config), + ), + ); + +export const createNativeAnonymizerFromPackage = ({ + binding, + packageBytes, +}: NativeAnonymizerFromPackageOptions): PreparedNativeAnonymizer => + new PreparedNativeAnonymizer( + binding.NativePreparedSearch.fromPreparedPackageBytes(packageBytes), + ); + +export const load_prepared_package = ({ + binding, + packageBytes, +}: SharedNativePreparedPackageOptions): PreparedNativeAnonymizer => + createNativeAnonymizerFromPackage({ binding, packageBytes }); + +export const redact_text_json = ({ + binding, + config, + fullText, + operators, +}: SharedNativeRedactTextJsonOptions): string => + new PreparedNativeAnonymizer( + binding.NativePreparedSearch.fromConfigJsonBytes( + encodeNativeSearchConfigInput(config), + ), + ).redact_text_json(fullText, operators); + +export const redact_text = ({ + binding, + config, + fullText, + operators, +}: SharedNativeRedactTextOptions): NativeStaticRedactionResult => + new PreparedNativeAnonymizer( + binding.NativePreparedSearch.fromConfigJsonBytes( + encodeNativeSearchConfigInput(config), + ), + ).redact_text(fullText, operators); + +export const diagnostics_json = ({ + binding, + config, + fullText, + operators, +}: SharedNativeDiagnosticsJsonOptions): string | null => + new PreparedNativeAnonymizer( + binding.NativePreparedSearch.fromConfigJsonBytes( + encodeNativeSearchConfigInput(config), + ), + ).diagnostics_json(fullText, operators); + +export const createNativePipelineFromPackage = ({ + binding, + packageBytes, +}: NativePipelineFromPackageOptions): PreparedNativePipeline => + new PreparedNativePipeline( + 
createNativeAnonymizerFromPackage({ binding, packageBytes }), + ); + +export const PreparedSearch = PreparedNativeAnonymizer; +export type PreparedSearch = PreparedNativeAnonymizer; +export const PreparedAnonymizer = PreparedNativeAnonymizer; +export type PreparedAnonymizer = PreparedNativeAnonymizer; + +const toBindingOperatorConfig = ( + config: NativeOperatorConfig | undefined, +): NativeBindingOperatorConfig | undefined => { + if (!config) { + return undefined; + } + const bindingConfig: NativeBindingOperatorConfig = {}; + if (config.operators !== undefined) { + bindingConfig.operators = config.operators; + } + if (config.redactString !== undefined) { + bindingConfig.redactString = config.redactString; + } + return bindingConfig; +}; + +const toNativeStaticRedactionResult = ( + result: NativeBindingStaticRedactionResult, +): NativeStaticRedactionResult => ({ + resolvedEntities: result.resolvedEntities.map(toNativePipelineEntity), + redaction: toNativeRedactionResult(result.redaction), +}); + +const toBindingStaticRedactionResult = ( + result: NativeStaticRedactionResult, +): CanonicalStaticRedactionResult => ({ + resolved_entities: result.resolvedEntities.map(toBindingPipelineEntity), + redaction: { + redacted_text: result.redaction.redactedText, + redaction_map: [...result.redaction.redactionMap.entries()].map( + ([placeholder, original]) => ({ placeholder, original }), + ), + operator_map: [...result.redaction.operatorMap.entries()].map( + ([placeholder, operator]) => ({ placeholder, operator }), + ), + entity_count: result.redaction.entityCount, + }, +}); + +const toNativePipelineEntity = ( + entity: NativeBindingPipelineEntity, +): NativePipelineEntity => ({ + start: entity.start, + end: entity.end, + label: entity.label, + text: entity.text, + score: entity.score, + source: entity.source, + ...(entity.sourceDetail ? 
{ sourceDetail: entity.sourceDetail } : {}), +}); + +const toBindingPipelineEntity = ({ + sourceDetail, + ...entity +}: NativePipelineEntity): CanonicalPipelineEntity => ({ + ...entity, + source_detail: sourceDetail ?? null, +}); + +const toNativeRedactionResult = ( + result: NativeBindingRedactionResult, +): NativeRedactionResult => ({ + redactedText: result.redactedText, + redactionMap: toRedactionMap(result.redactionMap), + operatorMap: toOperatorMap(result.operatorMap), + entityCount: result.entityCount, +}); + +const toRedactionMap = ( + entries: readonly NativeBindingRedactionEntry[], +): Map => { + const map = new Map(); + for (const entry of entries) { + map.set(entry.placeholder, entry.original); + } + return map; +}; + +const toOperatorMap = ( + entries: readonly NativeBindingOperatorEntry[], +): Map => { + const map = new Map(); + for (const entry of entries) { + map.set(entry.placeholder, entry.operator); + } + return map; +}; diff --git a/packages/anonymize/src/pipeline-cache-key.ts b/packages/anonymize/src/pipeline-cache-key.ts new file mode 100644 index 00000000..99415a86 --- /dev/null +++ b/packages/anonymize/src/pipeline-cache-key.ts @@ -0,0 +1,76 @@ +import { + isLegalFormsEnabled, + type GazetteerEntry, + type PipelineConfig, +} from "./types"; + +const DEFAULT_CUSTOM_REGEX_SCORE = 0.9; + +export const pipelineConfigKey = ( + config: PipelineConfig, + gazetteerEntries: readonly GazetteerEntry[], +): string => { + const legalFormsEnabled = isLegalFormsEnabled(config); + const customDenyFingerprint = + config.enableDenyList && config.customDenyList + ? config.customDenyList + .map((entry) => + JSON.stringify({ + label: entry.label, + value: entry.value, + variants: [...(entry.variants ?? [])].sort(), + }), + ) + .sort() + .join("\n") + : ""; + const customRegexFingerprint = + config.enableRegex && config.customRegexes + ? 
config.customRegexes + .map((entry) => + JSON.stringify({ + label: entry.label, + pattern: entry.pattern, + score: entry.score ?? DEFAULT_CUSTOM_REGEX_SCORE, + }), + ) + .sort() + .join("\n") + : ""; + const gazFingerprint = + config.enableGazetteer && gazetteerEntries.length > 0 + ? gazetteerEntries + .map( + (entry) => + `${entry.id}:${entry.canonical}:${entry.label}:${[ + ...entry.variants, + ] + .sort() + .join(",")}`, + ) + .toSorted() + .join(";") + : ""; + + return ( + `${config.enableDenyList}:` + + `${config.enableTriggerPhrases}:` + + `${legalFormsEnabled}:` + + `${config.enableNameCorpus}:` + + `${config.nameCorpusLanguages?.toSorted().join(",") ?? ""}:` + + `${config.enableRegex}:` + + `${config.threshold}:` + + `${config.enableConfidenceBoost}:` + + `${config.enableHotwordRules === true}:` + + `${config.enableCoreference === true}:` + + `${config.enableZoneClassification === true}:` + + `${config.labels.toSorted().join(",")}:` + + `${config.denyListCountries?.toSorted().join(",") ?? ""}:` + + `${config.denyListRegions?.toSorted().join(",") ?? ""}:` + + `${config.denyListExcludeCategories?.toSorted().join(",") ?? 
""}:` + + `${customDenyFingerprint}:` + + `${customRegexFingerprint}:` + + `${config.enableGazetteer}:${gazFingerprint}:` + + `${config.enableCountries !== false}` + ); +}; diff --git a/packages/anonymize/src/pipeline.ts b/packages/anonymize/src/pipeline.ts index b0773d9d..e3339f30 100644 --- a/packages/anonymize/src/pipeline.ts +++ b/packages/anonymize/src/pipeline.ts @@ -19,6 +19,7 @@ import { } from "./detectors/triggers"; import { ensureDenyListData, + loadDefinedTermHeads, processDenyListMatches, } from "./detectors/deny-list"; import { processAddressSeeds } from "./detectors/address-seeds"; @@ -69,6 +70,8 @@ import { runUnifiedSearch } from "./unified-search"; import { maskDetectedSpans, unmaskNerEntities } from "./util/entity-masking"; import type { PipelineContext } from "./context"; import { defaultContext } from "./context"; +import { applyPipelineLanguageScope } from "./language-scope"; +import { pipelineConfigKey } from "./pipeline-cache-key"; /** * Sources backed by curated literal dictionaries. @@ -801,8 +804,6 @@ const createAllowedLabelSetFromLabels = ( const createAllowedLabelSet = (config: PipelineConfig): AllowedLabelSet => createAllowedLabelSetFromLabels(config.labels); -const DEFAULT_CUSTOM_REGEX_SCORE = 0.9; - const filterAllowedLabels = ( entities: Entity[], allowedLabels: AllowedLabelSet, @@ -841,69 +842,6 @@ const checkAbort = (signal?: AbortSignal): void => { } }; -const configKey = ( - config: PipelineConfig, - gazetteerEntries: GazetteerEntry[], -): string => { - const legalFormsEnabled = isLegalFormsEnabled(config); - const customDenyFingerprint = - config.enableDenyList && config.customDenyList - ? config.customDenyList - .map((entry) => - JSON.stringify({ - label: entry.label, - value: entry.value, - variants: [...(entry.variants ?? [])].sort(), - }), - ) - .sort() - .join("\n") - : ""; - const customRegexFingerprint = - config.enableRegex && config.customRegexes - ? 
config.customRegexes - .map((entry) => - JSON.stringify({ - label: entry.label, - pattern: entry.pattern, - score: entry.score ?? DEFAULT_CUSTOM_REGEX_SCORE, - }), - ) - .sort() - .join("\n") - : ""; - // Gazetteer fingerprint: sorted entry IDs, - // canonical forms, labels, and variants. - // Skip when gazetteer is disabled to avoid - // unnecessary cache misses. - const gazFingerprint = - config.enableGazetteer && gazetteerEntries.length > 0 - ? gazetteerEntries - .map( - (e) => - `${e.id}:${e.canonical}:${e.label}:${[...e.variants].sort().join(",")}`, - ) - .toSorted() - .join(";") - : ""; - return ( - `${config.enableDenyList}:` + - `${config.enableTriggerPhrases}:` + - `${legalFormsEnabled}:` + - `${config.enableNameCorpus}:` + - `${config.nameCorpusLanguages?.toSorted().join(",") ?? ""}:` + - `${config.enableRegex}:` + - `${config.labels.toSorted().join(",")}:` + - `${config.denyListCountries?.toSorted().join(",") ?? ""}:` + - `${config.denyListRegions?.toSorted().join(",") ?? ""}:` + - `${config.denyListExcludeCategories?.toSorted().join(",") ?? ""}:` + - `${customDenyFingerprint}:` + - `${customRegexFingerprint}:` + - `${config.enableGazetteer}:${gazFingerprint}:` + - `${config.enableCountries !== false}` - ); -}; - type SharedSearchCacheValue = | Promise | UnifiedSearchInstance; @@ -963,7 +901,7 @@ const getCachedSearch = async ( gazetteerEntries: GazetteerEntry[], ctx: PipelineContext, ): Promise => { - const key = configKey(config, gazetteerEntries); + const key = pipelineConfigKey(config, gazetteerEntries); if (ctx.search && ctx.searchKey === key) { return ctx.search; } @@ -1027,7 +965,11 @@ export const preparePipelineSearch = ({ gazetteerEntries = [], context, }: PipelineSearchOptions): Promise => - getCachedSearch(config, gazetteerEntries, context ?? defaultContext); + getCachedSearch( + applyPipelineLanguageScope(config), + gazetteerEntries, + context ?? defaultContext, + ); /** * Options for {@link runPipeline}. 
@@ -1068,7 +1010,7 @@ export const runPipeline = async ( ): Promise => { const { fullText, - config, + config: inputConfig, gazetteerEntries, nerInference = null, onProgress, @@ -1076,6 +1018,7 @@ export const runPipeline = async ( signal, context, } = options; + const config = applyPipelineLanguageScope(inputConfig); const ctx = context ?? defaultContext; const allowedLabels = createAllowedLabelSet(config); const legalFormsEnabled = isLegalFormsEnabled(config); @@ -1113,6 +1056,7 @@ export const runPipeline = async ( }); await Promise.all([ loadGenericRoles(ctx), + loadDefinedTermHeads(ctx), loadDocumentStructureHeadings(), initPrepositions(), initStreetAbbrevs(), @@ -1124,6 +1068,7 @@ export const runPipeline = async ( } else { await Promise.all([ loadGenericRoles(ctx), + loadDefinedTermHeads(ctx), loadDocumentStructureHeadings(), initPrepositions(), initStreetAbbrevs(), diff --git a/packages/anonymize/src/types.ts b/packages/anonymize/src/types.ts index eb0ae363..4eadd4e4 100644 --- a/packages/anonymize/src/types.ts +++ b/packages/anonymize/src/types.ts @@ -183,7 +183,11 @@ export type CompiledValidation = | { type: "no-digits"; re: RegExp } | { type: "has-digits"; re: RegExp } | { type: "matches-pattern"; re: RegExp } - | { type: "valid-id"; check: (value: string) => boolean }; + | { + type: "valid-id"; + validator: ValidIdValidator; + check: (value: string) => boolean; + }; /** * Runtime rule — one per trigger string after @@ -355,6 +359,18 @@ export type PipelineConfig = { threshold: number; enableTriggerPhrases: boolean; enableRegex: boolean; + /** + * Expected content language codes. When present, these + * derive default dictionary scopes for name corpus and + * deny-list matching unless the lower-level scope fields + * below are set explicitly. + */ + languages?: string[]; + /** + * Convenience form for single-language documents. Ignored + * when `languages` is also provided. + */ + language?: string; /** * Enables legal-form organization detection. 
* Required for typed callers; legacy untyped diff --git a/packages/anonymize/tsdown.config.ts b/packages/anonymize/tsdown.config.ts index 57fff9ff..df148b70 100644 --- a/packages/anonymize/tsdown.config.ts +++ b/packages/anonymize/tsdown.config.ts @@ -2,7 +2,12 @@ import { defineConfig } from "tsdown"; export default defineConfig([ { - entry: ["src/index.ts", "src/constants.ts"], + entry: [ + "src/index.ts", + "src/constants.ts", + "src/native.ts", + "src/native-node.ts", + ], outDir: "dist", format: ["esm"], dts: true, diff --git a/packages/anonymize/wasm/LICENSE b/packages/anonymize/wasm/LICENSE index 503b2e8a..9dac24d2 100644 --- a/packages/anonymize/wasm/LICENSE +++ b/packages/anonymize/wasm/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2026 Stella +Copyright (c) 2026 stella Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated @@ -25,4 +25,3 @@ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- diff --git a/packages/anonymize/wasm/package.json b/packages/anonymize/wasm/package.json index 5a9d0543..7129124a 100644 --- a/packages/anonymize/wasm/package.json +++ b/packages/anonymize/wasm/package.json @@ -34,7 +34,7 @@ "dependencies": { "@huggingface/tokenizers": "^0.1.3", "@stll/stdnum": "^2.1.1", - "@stll/text-search-wasm": "^1.0.5" + "@stll/text-search-wasm": "^1.0.7" }, "peerDependencies": { "@stll/anonymize-data": "^0.0.6", diff --git a/packages/cli/package.json b/packages/cli/package.json index f638c473..8ff88b86 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -31,9 +31,9 @@ }, "devDependencies": { "@stll/anonymize-wasm": "workspace:*", - "@types/node": "^25.9.3", + "@types/node": "^25.9.4", "bun-types": "^1.3.14", - "tsdown": "^0.22.2", + "tsdown": "^0.22.3", "typescript": "^6.0.3" } } diff --git a/packages/cli/src/dictionary-scope.ts b/packages/cli/src/dictionary-scope.ts index bcb773e4..4e056aac 100644 --- a/packages/cli/src/dictionary-scope.ts +++ b/packages/cli/src/dictionary-scope.ts @@ -34,7 +34,7 @@ const pickKeys = ( }; /** Dictionaries with every section present (possibly empty). */ -export type ScopedDictionaries = Dictionaries & { +export type ScopedDictionaries = { firstNames: Record; surnames: Record; denyList: Record; diff --git a/packages/corpus/package.json b/packages/corpus/package.json index 5578a675..a367a9ff 100644 --- a/packages/corpus/package.json +++ b/packages/corpus/package.json @@ -15,7 +15,7 @@ "@stll/anonymize-data": "^0.0.6" }, "devDependencies": { - "@types/node": "^25.9.3", + "@types/node": "^25.9.4", "bun-types": "^1.3.14", "typescript": "^6.0.3" } diff --git a/packages/data/README.md b/packages/data/README.md index 6a7da55d..6a6ecc75 100644 --- a/packages/data/README.md +++ b/packages/data/README.md @@ -1,5 +1,5 @@

- Stella anonymize + stella anonymize

# @stll/anonymize-data diff --git a/packages/data/config/address-boundaries.json b/packages/data/config/address-boundaries.json index 12c5cc35..cbb8adc4 100644 --- a/packages/data/config/address-boundaries.json +++ b/packages/data/config/address-boundaries.json @@ -5,6 +5,7 @@ "jednajícím", "jejímž jménem", "kontaktní osoba", + "pokud", "zapsán", "zapsaná", "zapsané", @@ -35,6 +36,7 @@ "shall govern", "shall be governed", "to be enforced", + "with a copy", "with the intention", "without reference", "without regard" diff --git a/packages/data/config/address-context.json b/packages/data/config/address-context.json new file mode 100644 index 00000000..393a7981 --- /dev/null +++ b/packages/data/config/address-context.json @@ -0,0 +1,72 @@ +{ + "_comment": "Address context guard words by language. These words can appear as '[Word] [number]' near address-like text in legal documents, but usually denote structure, payments, dates, or references rather than a bare street and house number.", + "bareHouseStopwords": { + "cs": [ + "Příloha", + "Smlouva", + "Článek", + "Dodatek", + "Celkem", + "Strana", + "Faktura", + "Částka", + "Položka", + "Kapitola", + "Zákon", + "Vyhláška", + "Nařízení", + "Usnesení", + "Rozsudek", + "Bod", + "Odstavec", + "Záloha", + "Zbývá", + "Dne", + "Platba", + "Datum", + "Splatnost", + "Variabilní", + "Konstantní", + "Specifický" + ], + "en": [ + "Section", + "Sections", + "Article", + "Articles", + "Schedule", + "Schedules", + "Exhibit", + "Exhibits", + "Annex", + "Annexes", + "Appendix", + "Appendices", + "Clause", + "Clauses", + "Chapter", + "Chapters", + "Paragraph", + "Paragraphs", + "Subsection", + "Subsections", + "Form", + "Page", + "Pages", + "Item", + "Items", + "Note", + "Notes", + "Rule", + "Rules", + "Attachment", + "Attachments", + "Volume", + "Volumes", + "Book", + "Books", + "Part", + "Parts" + ] + } +} diff --git a/packages/data/config/address-jurisdiction-prefixes.json b/packages/data/config/address-jurisdiction-prefixes.json new 
file mode 100644 index 00000000..f26e4fc1 --- /dev/null +++ b/packages/data/config/address-jurisdiction-prefixes.json @@ -0,0 +1,4 @@ +{ + "_comment": "Address-like jurisdiction prefixes that are valid location/address spans without digits or street-type words. Lowercased and organized per language.", + "en": ["commonwealth of", "district of", "state of", "territory of"] +} diff --git a/packages/data/config/address-stop-keywords.json b/packages/data/config/address-stop-keywords.json index 98d9de5a..f7a0180c 100644 --- a/packages/data/config/address-stop-keywords.json +++ b/packages/data/config/address-stop-keywords.json @@ -17,6 +17,16 @@ "ičo", "ič" ], + "de": [ + "bank", + "bic", + "iban", + "steuer-id", + "steueridentifikationsnummer", + "steuernummer", + "ust-idnr", + "ust-idnr." + ], "en": ["e-mail", "email", "tel", "swift", "iban", "bic"], "pl": [ "nip", diff --git a/packages/data/config/address-unit-abbreviations.json b/packages/data/config/address-unit-abbreviations.json new file mode 100644 index 00000000..dfdf7cd7 --- /dev/null +++ b/packages/data/config/address-unit-abbreviations.json @@ -0,0 +1,4 @@ +{ + "_comment": "Dotted address unit abbreviations that should not terminate address seed expansion. Organised per language because abbreviations are locale-specific.", + "en": ["apt.", "bldg.", "fl.", "ste.", "unit."] +} diff --git a/packages/data/config/ambiguous-country-surfaces.json b/packages/data/config/ambiguous-country-surfaces.json new file mode 100644 index 00000000..04962dd5 --- /dev/null +++ b/packages/data/config/ambiguous-country-surfaces.json @@ -0,0 +1,4 @@ +{ + "_comment": "Country surface forms that collide with much more common non-country usage. 
Full country names and aliases remain registered separately when present.", + "words": ["indie", "island", "man", "norfolk"] +} diff --git a/packages/data/config/clause-noun-heads.json b/packages/data/config/clause-noun-heads.json index 937dc32a..b11bad4f 100644 --- a/packages/data/config/clause-noun-heads.json +++ b/packages/data/config/clause-noun-heads.json @@ -33,7 +33,11 @@ "přílohu", "dodatek", "dodatku", - "oznámení" + "článek", + "oznámení", + "podmínky", + "předmět", + "ustanovení" ], "de": [ "vertrag", diff --git a/packages/data/config/coreference-org-determiners.json b/packages/data/config/coreference-org-determiners.json new file mode 100644 index 00000000..c5579c22 --- /dev/null +++ b/packages/data/config/coreference-org-determiners.json @@ -0,0 +1,8 @@ +{ + "_comment": "Organization reference determiners used before propagated bare organization names. Values are regex fragments grouped by language.", + "cs": ["společnost(?:i|í|em|u)?", "spolecnost(?:i|em|u)?"], + "de": ["die\\s+(?:gesellschaft|firma)"], + "en": ["the\\s+(?:company|corporation|firm)"], + "es": ["la\\s+(?:empresa|sociedad)", "el\\s+(?:empresa|sociedad)"], + "fr": ["la\\s+société"] +} diff --git a/packages/data/config/defined-term-heads.json b/packages/data/config/defined-term-heads.json new file mode 100644 index 00000000..aa1fffc2 --- /dev/null +++ b/packages/data/config/defined-term-heads.json @@ -0,0 +1,4 @@ +{ + "_comment": "Common head nouns for capitalized defined/legal concepts. These are not person names by themselves; detector-specific filters assemble this vocabulary where needed. 
Lowercased and organized per language.", + "en": ["association", "period", "reform"] +} diff --git a/packages/data/config/deny-list-filters.json b/packages/data/config/deny-list-filters.json new file mode 100644 index 00000000..51b89152 --- /dev/null +++ b/packages/data/config/deny-list-filters.json @@ -0,0 +1,48 @@ +{ + "en": { + "definedTermCues": [ + "mean", + "means", + "shall mean", + "shall means", + "shall have the meaning", + "shall have the meanings", + "refer to", + "refers to", + "has the meaning", + "has the meanings", + "is defined" + ], + "sentenceStarters": [ + "the", + "this", + "these", + "those", + "an", + "any", + "all", + "each", + "every", + "no", + "now", + "whereas", + "whereby", + "wherein", + "whereof", + "notwithstanding", + "subject", + "in", + "on", + "at", + "by", + "for", + "if", + "upon", + "unless", + "until", + "provided", + "pursuant", + "such" + ] + } +} diff --git a/packages/data/config/false-positive-shapes.json b/packages/data/config/false-positive-shapes.json new file mode 100644 index 00000000..e30eb764 --- /dev/null +++ b/packages/data/config/false-positive-shapes.json @@ -0,0 +1,20 @@ +{ + "_comment": "Language-keyed lexical markers used by false-positive shape guards.", + "addressComponentTerms": { + "cs": ["č.p.", "č.ev.", "č.", "sídliště"] + }, + "ambiguousStreetTypeTerms": { + "fr": ["cours"] + }, + "numberAbbrevPrefixes": { + "cs": ["čís.", "č."], + "de": ["nr."], + "en": ["no.", "n."] + }, + "documentHeadingOrdinalMarkers": { + "cs": ["č.", "č"], + "de": ["nr.", "nr"], + "en": ["no.", "no", "n.", "n"], + "global": ["#"] + } +} diff --git a/packages/data/config/language-scopes.json b/packages/data/config/language-scopes.json new file mode 100644 index 00000000..5d9b85a5 --- /dev/null +++ b/packages/data/config/language-scopes.json @@ -0,0 +1,73 @@ +{ + "_comment": "Default dictionary scopes for content language hints. 
Lower-level caller config can still override name corpus languages and deny-list countries independently.", + "languages": { + "cs": { + "nameCorpusLanguages": ["cs", "sk"], + "denyListCountries": ["CZ", "SK"] + }, + "de": { + "nameCorpusLanguages": ["de"], + "denyListCountries": ["DE", "AT", "CH"] + }, + "en": { + "nameCorpusLanguages": ["en"], + "denyListCountries": ["US", "GB", "CA", "AU", "IE"] + }, + "es": { + "nameCorpusLanguages": ["es"], + "denyListCountries": [ + "ES", + "MX", + "AR", + "CL", + "CO", + "PE", + "EC", + "VE", + "UY", + "PY", + "BO", + "CR", + "PA", + "DO", + "GT", + "HN", + "SV", + "NI", + "CU" + ] + }, + "fr": { + "nameCorpusLanguages": ["fr"], + "denyListCountries": ["FR", "BE", "CH", "CA", "LU", "MC"] + }, + "hu": { + "nameCorpusLanguages": ["hu"], + "denyListCountries": ["HU"] + }, + "it": { + "nameCorpusLanguages": ["it"], + "denyListCountries": ["IT", "CH"] + }, + "pl": { + "nameCorpusLanguages": ["pl"], + "denyListCountries": ["PL"] + }, + "pt-br": { + "nameCorpusLanguages": ["pt-br"], + "denyListCountries": ["BR"] + }, + "ro": { + "nameCorpusLanguages": ["ro"], + "denyListCountries": ["RO", "MD"] + }, + "sk": { + "nameCorpusLanguages": ["sk", "cs"], + "denyListCountries": ["SK", "CZ"] + }, + "sv": { + "nameCorpusLanguages": ["sv"], + "denyListCountries": ["SE", "FI"] + } + } +} diff --git a/packages/data/config/legal-form-rule-words.json b/packages/data/config/legal-form-rule-words.json new file mode 100644 index 00000000..d2d1c4fc --- /dev/null +++ b/packages/data/config/legal-form-rule-words.json @@ -0,0 +1,27 @@ +{ + "connectorWords": ["a", "and", "und", "et", "e", "y", "i", "&"], + "andConnectorWords": ["and", "und", "et"], + "inNamePrepositions": ["of", "the"], + "companySuffixWords": [ + "Company", + "Co", + "Bank", + "Brothers", + "Bros", + "Sons", + "Group", + "Holdings", + "Trust", + "Partners", + "Associates", + "Corporation", + "Industries", + "Enterprises", + "Solutions", + "Systems", + "Services", + "Foundation", + 
"Institute" + ], + "commaGatedDirectPrefixes": ["among", "amongst", "between"] +} diff --git a/packages/data/config/legal-role-heads.cs.json b/packages/data/config/legal-role-heads.cs.json index ffab15d9..8c3debe5 100644 --- a/packages/data/config/legal-role-heads.cs.json +++ b/packages/data/config/legal-role-heads.cs.json @@ -28,6 +28,12 @@ "dodavatele", "odběratel", "odběratele", + "plátce", + "příjemce", + "uchazeč", + "uchazeče", + "zadavatel", + "zadavatele", "smluvní", "strana", "strany" diff --git a/packages/data/config/name-corpus-cjk.json b/packages/data/config/name-corpus-cjk.json new file mode 100644 index 00000000..aa9c3bb4 --- /dev/null +++ b/packages/data/config/name-corpus-cjk.json @@ -0,0 +1,86 @@ +{ + "_comment": "CJK name-corpus heuristics organised by script language. Used by supplemental name-corpus detection.", + "zh": { + "nonPersonTerms": [ + "中国", + "中國", + "中文", + "人民", + "公司", + "香港", + "台湾", + "臺灣" + ], + "surnameStarters": [ + "王", + "李", + "张", + "張", + "刘", + "劉", + "陈", + "陳", + "杨", + "楊", + "黄", + "黃", + "赵", + "趙", + "吴", + "吳", + "周", + "徐", + "孙", + "孫", + "马", + "馬", + "朱", + "胡", + "郭", + "何", + "林", + "高", + "梁", + "郑", + "鄭", + "罗", + "羅", + "宋", + "谢", + "謝", + "唐", + "韩", + "韓", + "曹", + "许", + "許", + "邓", + "鄧", + "萧", + "蕭", + "田" + ] + }, + "ja": { + "nonPersonTerms": ["日本"], + "surnameStarters": ["山", "佐", "鈴", "渡", "伊", "中", "小", "吉"] + }, + "ko": { + "nonPersonTerms": ["韩国", "韓國"], + "surnameStarters": [ + "金", + "朴", + "박", + "김", + "이", + "최", + "정", + "강", + "조", + "윤", + "장", + "임", + "한" + ] + } +} diff --git a/packages/data/config/name-corpus-particles.json b/packages/data/config/name-corpus-particles.json new file mode 100644 index 00000000..da0f14c3 --- /dev/null +++ b/packages/data/config/name-corpus-particles.json @@ -0,0 +1,13 @@ +{ + "_comment": "Language-specific particles and suffixes for supplemental name-corpus detection.", + "ar": { + "connectors": ["bin", "bint", "ibn", "al", "el"], + 
"hyphenatedPrefixes": ["al", "el"] + }, + "in": { + "relationConnectors": ["s/o", "d/o", "w/o", "r/o"] + }, + "ja-latn": { + "suffixes": ["san", "sama", "sensei"] + } +} diff --git a/packages/data/config/organization-indicators.json b/packages/data/config/organization-indicators.json new file mode 100644 index 00000000..033479c5 --- /dev/null +++ b/packages/data/config/organization-indicators.json @@ -0,0 +1,36 @@ +{ + "_comment": "Organisation indicator words used to suppress person-name spans.", + "en": [ + "Group", + "Company", + "LLC", + "LLP", + "LP", + "Inc", + "Ltd", + "Corp", + "Corporation", + "Holdings", + "Partners", + "Association", + "University", + "Bank", + "Fund", + "Trust", + "Agency", + "Government", + "Ministry", + "Office", + "Department", + "Council", + "Board", + "Committee", + "Commission", + "Services", + "Solutions", + "Technologies", + "Systems", + "Analytics", + "Software" + ] +} diff --git a/packages/data/config/organization-unit-heads.json b/packages/data/config/organization-unit-heads.json new file mode 100644 index 00000000..78e4c8f9 --- /dev/null +++ b/packages/data/config/organization-unit-heads.json @@ -0,0 +1,13 @@ +{ + "_comment": "Administrative or organizational unit nouns that can appear in legal prose without denoting a person or a street/city suffix. Lowercased and organized per language.", + "cs": [ + "agentura", + "inspekce", + "kancelář", + "odbor", + "oddělení", + "sekretariát", + "správa", + "úřad" + ] +} diff --git a/packages/data/config/person-stopwords.json b/packages/data/config/person-stopwords.json index ae496fd5..6fd797b8 100644 --- a/packages/data/config/person-stopwords.json +++ b/packages/data/config/person-stopwords.json @@ -1,5 +1,7 @@ { "_comment": "Words that are valid in other labels (address, org) but should never be classified as person. 
Checked only in person chain scoring.", + "cs": ["cena"], + "en": ["dodd-frank"], "words": [ "addendum", "agent", diff --git a/packages/data/config/signing-clauses.json b/packages/data/config/signing-clauses.json index e8c31718..c4a72d55 100644 --- a/packages/data/config/signing-clauses.json +++ b/packages/data/config/signing-clauses.json @@ -1,53 +1,69 @@ { - "_comment": "Signing clause patterns. Captures the place name from contract signing locations. Each entry: prefix (before place), suffix (after place), prepositions (allowed inside multi-word place names).", + "_comment": "Signing clause patterns. Captures the place name from contract signing locations. Each entry: prefix/suffix build regexes; guardPrefixPhrases/guardSuffixPhrases suppress deny-list place hits in the same signing context.", "patterns": [ { "lang": "cs", "prefix": "(?:V|Ve)\\s+", "suffix": "\\s*,?\\s*dne", - "prepositions": ["nad", "pod", "u", "ve", "na"] + "prepositions": ["nad", "pod", "u", "ve", "na"], + "guardPrefixPhrases": ["v", "ve"], + "guardSuffixPhrases": ["dne"] }, { "lang": "sk", "prefix": "(?:V|Vo)\\s+", "suffix": "\\s*,?\\s*dňa", - "prepositions": ["nad", "pod", "pri"] + "prepositions": ["nad", "pod", "pri"], + "guardPrefixPhrases": ["v", "vo"], + "guardSuffixPhrases": ["dňa"] }, { "lang": "de", "prefix": "", "suffix": "\\s*,\\s*den", - "prepositions": ["am", "an", "im"] + "prepositions": ["am", "an", "im"], + "guardPrefixPhrases": [""], + "guardSuffixPhrases": ["den"] }, { "lang": "fr", "prefix": "(?:Fait\\s+)?[Àà]\\s+", "suffix": "\\s*,?\\s*le", - "prepositions": [] + "prepositions": [], + "guardPrefixPhrases": ["à", "fait à"], + "guardSuffixPhrases": ["le"] }, { "lang": "en", "prefix": "(?:Signed|Executed)\\s+in\\s+", "suffix": "", - "prepositions": [] + "prepositions": [], + "guardPrefixPhrases": ["signed in", "executed in"], + "guardSuffixPhrases": [""] }, { "lang": "pl", "prefix": "(?:W|We)\\s+", "suffix": "\\s*,?\\s*dnia", - "prepositions": ["nad", "pod", "przy"] + 
"prepositions": ["nad", "pod", "przy"], + "guardPrefixPhrases": ["w", "we"], + "guardSuffixPhrases": ["dnia"] }, { "lang": "it", "prefix": "(?:Fatto\\s+)?[Aa]\\s+", "suffix": "\\s*,?\\s*(?:il|lì)", - "prepositions": [] + "prepositions": [], + "guardPrefixPhrases": ["a", "fatto a"], + "guardSuffixPhrases": ["il", "lì"] }, { "lang": "es", "prefix": "(?:Firmado\\s+)?[Ee]n\\s+", "suffix": "\\s*,?\\s*(?:a|el)", - "prepositions": ["de", "del"] + "prepositions": ["de", "del"], + "guardPrefixPhrases": ["en", "firmado en"], + "guardSuffixPhrases": ["a", "el"] } ] } diff --git a/packages/data/dictionaries/index.ts b/packages/data/dictionaries/index.ts index f4f90629..71ab052e 100644 --- a/packages/data/dictionaries/index.ts +++ b/packages/data/dictionaries/index.ts @@ -4,7 +4,7 @@ * via dynamic imports and cached after first load. */ -type DenyListCategory = +export type DenyListCategory = | "Names" | "Places" | "Addresses" @@ -17,7 +17,7 @@ type DenyListCategory = | "Organizations" | "International"; -type DictionaryMeta = { +export type DictionaryMeta = { label: string; category: DenyListCategory; country: string | null; @@ -847,9 +847,9 @@ export const loadCityDictionary = async ( if (!/^[A-Z]{2}$/.test(cc)) { return []; } - const mod = (await import( - `../dictionaries/cities/${cc}.json` - )) as JsonModule; + const mod = (await import(`../dictionaries/cities/${cc}.json`, { + with: { type: "json" }, + })) as JsonModule; const entries = mod.default; cityCache.set(cc, entries); return entries; @@ -886,7 +886,7 @@ export const CITY_DICTIONARY_META: DictionaryMeta = { // ── Name dictionaries (first + surnames by language) ─ -const NAME_LANGUAGES = [ +export const NAME_LANGUAGES = [ "cs", "sk", "de", @@ -902,6 +902,96 @@ const NAME_LANGUAGES = [ export type NameLanguage = (typeof NAME_LANGUAGES)[number]; +export type DictionaryBundle = { + firstNames: Record; + surnames: Record; + denyList: Record; + denyListMeta: Record; + cities: readonly string[]; + citiesByCountry: Record; 
+}; + +export type LoadDictionaryBundleOptions = { + countries?: readonly string[]; + cityCountries?: readonly string[]; + nameLanguages?: readonly string[]; +}; + +const DEFAULT_CITY_COUNTRIES = [ + "AT", + "AU", + "BE", + "BG", + "BR", + "CA", + "CH", + "CZ", + "DE", + "DK", + "ES", + "FI", + "FR", + "GB", + "GR", + "HR", + "HU", + "IE", + "IT", + "LU", + "NL", + "NO", + "NZ", + "PL", + "PT", + "RO", + "SE", + "SI", + "SK", + "US", +] as const; + +const normalizeCountryCodes = ( + countries: readonly string[] | undefined, +): Set | null => { + if (countries === undefined || countries.length === 0) { + return null; + } + return new Set(countries.map((country) => country.toUpperCase())); +}; + +const isNameLanguage = (language: string): language is NameLanguage => + NAME_LANGUAGES.some((supported) => supported === language); + +const normalizeNameLanguages = ( + languages: readonly string[] | undefined, +): NameLanguage[] => { + if (languages === undefined || languages.length === 0) { + return [...NAME_LANGUAGES]; + } + const result: NameLanguage[] = []; + for (const language of languages) { + const normalized = language.toLowerCase(); + if (isNameLanguage(normalized)) { + result.push(normalized); + } + } + return result; +}; + +const dictionaryIdIsInScope = ( + id: DictionaryId, + countries: Set | null, + hasScopedNames: boolean, +): boolean => { + const meta = DICTIONARY_META[id]; + if (hasScopedNames && meta.category === "Names") { + return false; + } + return ( + countries === null || meta.country === null || countries.has(meta.country) + ); +}; + /** * Load first-name and surname dictionaries for the * requested languages. 
Returns the shape expected by @@ -934,3 +1024,60 @@ export const loadNameDictionaries = async ( return { firstNames, surnames }; }; + +export const loadDictionaryBundle = async ({ + countries, + cityCountries, + nameLanguages, +}: LoadDictionaryBundleOptions = {}): Promise => { + const countryScope = normalizeCountryCodes(countries); + const scopedNameLanguages = normalizeNameLanguages(nameLanguages); + const hasScopedNames = + nameLanguages !== undefined && nameLanguages.length > 0; + const dictionaryIds = ALL_DICTIONARY_IDS.filter((id) => + dictionaryIdIsInScope(id, countryScope, hasScopedNames), + ); + const dictionaryResults = await Promise.all( + dictionaryIds.map(async (id) => ({ + id, + entries: await loadDictionary(id), + })), + ); + const denyList: Record = {}; + const denyListMeta: Record = {}; + for (const { id, entries } of dictionaryResults) { + denyList[id] = entries; + denyListMeta[id] = DICTIONARY_META[id]; + } + + const nameDictionaries = await loadNameDictionaries( + hasScopedNames ? scopedNameLanguages : undefined, + ); + const requestedCityScope = cityCountries ?? countries; + const cityScope = + requestedCityScope === undefined || requestedCityScope.length === 0 + ? 
DEFAULT_CITY_COUNTRIES + : requestedCityScope; + const cityResults = await Promise.all( + cityScope.map(async (country) => ({ + country: country.toUpperCase(), + entries: await loadCityDictionary(country), + })), + ); + const citiesByCountry: Record = {}; + const cities: string[] = []; + for (const { country, entries } of cityResults) { + citiesByCountry[country] = entries; + for (const entry of entries) { + cities.push(entry); + } + } + + return { + ...nameDictionaries, + denyList, + denyListMeta, + cities, + citiesByCountry, + }; +}; diff --git a/packages/data/package.json b/packages/data/package.json index f992d4c8..cb6265b4 100644 --- a/packages/data/package.json +++ b/packages/data/package.json @@ -52,7 +52,7 @@ }, "devDependencies": { "stopwords-iso": "1.1.0", - "tsdown": "^0.22.2", + "tsdown": "^0.22.3", "typescript": "^6.0.3" } } diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 00000000..0e9dcbcd --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,4 @@ +[toolchain] +channel = "1.96.0" +components = ["rustfmt", "clippy"] +profile = "minimal" diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 00000000..b1b458b0 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,4 @@ +edition = "2024" +max_width = 80 +reorder_imports = true +tab_spaces = 2 diff --git a/turbo.json b/turbo.json index 5582dd51..0809af65 100644 --- a/turbo.json +++ b/turbo.json @@ -3,7 +3,13 @@ "globalDependencies": [".oxfmtrc.json"], "tasks": { "build": { - "outputs": ["dist/**", "wasm/dist/**"] + "dependsOn": ["^build"], + "outputs": [ + "dist/**", + "wasm/dist/**", + "stella_anonymize_napi.node", + "native-pipeline.stlanonpkg" + ] }, "typecheck": { "dependsOn": ["^build"],