stella · jan-kubica · Jun 12, 2026 · Jun 12, 2026
diff --git a/.github/tools/check-packlist.mjs b/.github/tools/check-packlist.mjs
@@ -7,6 +7,9 @@ const PACKAGES = [
     expected: [
       "dist/index.d.mts",
       "dist/index.mjs",
+      // Dynamically imported corpus chunk; missing means the
+      // bundler stopped resolving the non-Western name imports.
+      "dist/names-nw-in.mjs",
       "README.md",
       "LICENSE",
       "package.json",
@@ -41,6 +44,7 @@ const PACKAGES = [
     expected: [
       "dist/wasm.d.mts",
       "dist/wasm.mjs",
+      "dist/names-nw-in.mjs",
       "dist/vite.d.mts",
       "dist/vite.mjs",
       "README.md",

diff --git a/bun.lock b/bun.lock
diff --git a/packages/anonymize/src/detectors/names.ts b/packages/anonymize/src/detectors/names.ts
@@ -49,6 +49,28 @@ const NONWESTERN_LOCALE_KEYS = [
   "id",
 ] as const;
 
+type NonWesternNamesModule = Promise<{ default: { names: string[] } }>;
+
+// Literal import specifiers so the bundler resolves each corpus
+// file into the build output; a template-literal specifier survives
+// bundling as a runtime-relative path that does not exist in dist.
+const NONWESTERN_NAME_IMPORTS: Record<
+  (typeof NONWESTERN_LOCALE_KEYS)[number],
+  () => NonWesternNamesModule
+> = {
+  in: () => import("../data/names-nw-in.json") as NonWesternNamesModule,
+  ar: () => import("../data/names-nw-ar.json") as NonWesternNamesModule,
+  "ja-latn": () =>
+    import("../data/names-nw-ja-latn.json") as NonWesternNamesModule,
+  ko: () => import("../data/names-nw-ko.json") as NonWesternNamesModule,
+  "zh-latn": () =>
+    import("../data/names-nw-zh-latn.json") as NonWesternNamesModule,
+  th: () => import("../data/names-nw-th.json") as NonWesternNamesModule,
+  vi: () => import("../data/names-nw-vi.json") as NonWesternNamesModule,
+  fil: () => import("../data/names-nw-fil.json") as NonWesternNamesModule,
+  id: () => import("../data/names-nw-id.json") as NonWesternNamesModule,
+};
+
 const normalizeCorpusLanguage = (language: string): string =>
   language.toLowerCase();
 
@@ -219,12 +241,7 @@ export const initNameCorpus = (
       const nwLocaleKeys = getScopedNonWesternLocaleKeys(languages);
       const [nwNameMods, nwExcludedMod] = await Promise.all([
         Promise.all(
-          nwLocaleKeys.map(
-            (locale) =>
-              import(`../data/names-nw-${locale}.json`) as Promise<{
-                default: { names: string[] };
-              }>,
-          ),
+          nwLocaleKeys.map((locale) => NONWESTERN_NAME_IMPORTS[locale]()),
         ),
         import("../data/names-nw-excluded-allcaps.json") as Promise<{
           default: { words: string[] };

diff --git a/packages/bench/README.md b/packages/bench/README.md
@@ -0,0 +1,108 @@
+# @stll/anonymize-bench
+
+Reproducible quality and throughput benchmarks for `@stll/anonymize`.
+Private workspace package; nothing here is published to npm.
+
+## Running
+
+```sh
+bun install
+bun run build            # bench imports the built @stll/anonymize dist
+cd packages/bench
+bun run bench            # quality + throughput + render results/RESULTS.md
+```
+
+Individual steps: `bun run bench:quality`, `bun run bench:throughput`
+(`--iterations N --warmup N`), `bun run bench:render`. Results land in
+`results/` as JSON plus a rendered `results/RESULTS.md`.
+
+## Corpus
+
+The corpus is the contract fixture set in
+`packages/anonymize/src/__test__/fixtures/contracts/` (Czech, German,
+and English legal contracts; public or synthetic documents, several
+sourced from SEC EDGAR filings). The same fixtures gate releases via
+the regression suite, so the benchmark always describes the pipeline
+that actually ships.
+
+All measurements run the deterministic layers only (`enableNer:
+false`): regex, trigger phrases, legal forms, name corpus, deny
+lists, coreference, hotword rules, and zone classification, with the
+full published dictionary set from `@stll/anonymize-data` loaded the
+way a production consumer loads it.
+
+## Reference annotations, and what they can tell you
+
+Quality is scored against the `.snapshot.json` sidecars next to each
+fixture. These are produced by the pipeline itself and then human
+reviewed: every change to them is diffed in PRs, and
+`contract-snapshots.test.ts` plus `contract-quality.test.ts` pin
+specific true positives and false positives that reviewers have
+verified by hand.
+
+Because the reference derives from reviewed pipeline output, the
+pipeline's own score against it is close to perfect **by
+construction**. That number is a drift detector, not proof of
+accuracy. The honest uses of this harness are:
+
+- **Cross-tool comparison.** Other tools' outputs (see "Comparing
+  another tool" below) are scored against the same reference with the
+  same scorer; relative differences on identical documents are meaningful
+  even when the reference has our bias. Comparisons should be read
+  per label, restricted to labels both tools claim to detect
+  (`--labels person,organization,...`).
+- **Per-label and per-language coverage tracking** across releases.
+- **Throughput**, which does not depend on the reference at all.
+
+Independent third-party corpora are a planned extension; numbers on
+this corpus alone should not be quoted as absolute accuracy claims.
+
+## Scoring
+
+Span-level, per label, one-to-one matching:
+
+- **exact**: label, start, and end must all match.
+- **overlap**: label must match and spans must share at least one
+  character; gold spans claim the unmatched prediction with the
+  largest overlap. For anonymization a partial hit still redacts part
+  of the value, but exact mode is the honest headline metric.
+
+Precision, recall, and F1 are reported per label, per language, and
+micro-averaged. Offsets are UTF-16 code units; fixture text is
+CRLF-normalized to match the regression suite.
+
+## Comparing another tool
+
+Run the tool over the same fixture files and write a predictions file:
+
+```json
+{
+  "tool": "some-tool",
+  "docs": [
+    {
+      "id": "en/software-license-agreement.txt",
+      "entities": [{ "start": 100, "end": 117, "label": "date" }]
+    }
+  ]
+}
+```
+
+Labels must be mapped to the canonical `@stll/anonymize` labels
+(`person`, `organization`, `address`, `date`, ...) by the adapter
+producing the file. Then:
+
+```sh
+bun run bench:quality -- --predictions path/to/predictions.json \
+  --labels "person,organization,email address,phone number,date"
+bun run bench:render
+```
+
+## Throughput methodology
+
+One-time costs (dictionary load, search automaton preparation) are
+measured separately from steady-state latency. The corpus is run for
+`--warmup` full passes (default 2), then for `--iterations` measured
+passes (default 10); per-document medians and corpus chars/second are
+reported together with the Bun version and CPU model. Numbers in
+committed results come from a developer laptop; treat them as
+order-of-magnitude, and re-run locally for decisions.
diff --git a/packages/bench/package.json b/packages/bench/package.json
@@ -0,0 +1,26 @@
+{
+  "name": "@stll/anonymize-bench",
+  "version": "0.0.0",
+  "private": true,
+  "description": "Reproducible quality and throughput benchmarks for @stll/anonymize",
+  "type": "module",
+  "license": "MIT",
+  "scripts": {
+    "bench": "bun run bench:quality && bun run bench:throughput && bun run bench:render",
+    "bench:quality": "bun src/run-quality.ts",
+    "bench:throughput": "bun src/run-throughput.ts",
+    "bench:render": "bun src/render-results.ts",
+    "typecheck": "tsc --noEmit -p tsconfig.json",
+    "test": "bun test",
+    "format": "oxfmt ."
+  },
+  "dependencies": {
+    "@stll/anonymize": "workspace:*",
+    "@stll/anonymize-data": "workspace:*"
+  },
+  "devDependencies": {
+    "@types/node": "^25.9.2",
+    "bun-types": "^1.3.14",
+    "typescript": "^6.0.3"
+  }
+}
diff --git a/packages/bench/results/RESULTS.md b/packages/bench/results/RESULTS.md
@@ -0,0 +1,85 @@
+# Benchmark results
+
+Generated by `bun run bench` in `packages/bench`; see README.md for methodology, including how the reference annotations are produced and what they can and cannot tell you.
+
+## Throughput
+
+Environment: Bun 1.3.14, Apple M3 (darwin/arm64). 2 warmup + 10 measured passes; medians reported.
+
+One-time costs: dictionary load 202 ms, search preparation 777 ms.
+
+Corpus: 13 documents, 146,106 chars; median full pass 185.5 ms (787,480 chars/s).
+
+| Document                                     |  Chars | Median ms |  Min |  Max |   Chars/s |
+| -------------------------------------------- | -----: | --------: | ---: | ---: | --------: |
+| cs/asset-transfer-court-declensions.txt      |  1,517 |       4.1 |  3.5 |  7.0 |   371,745 |
+| cs/database-cz-service-contract.txt          |  7,924 |       9.9 |  8.6 | 13.1 |   801,766 |
+| cs/eagles-rental-agreement.txt               |  6,970 |       9.0 |  8.1 | 11.1 |   776,587 |
+| cs/nakit-legal-services-framework.txt        | 45,767 |      62.2 | 59.6 | 71.9 |   735,535 |
+| cs/patrik-nguyen-used-vehicle-sale.txt       |  8,391 |      15.5 | 14.6 | 18.5 |   541,236 |
+| cs/probo-frame-purchase-contract.txt         |  3,204 |       7.1 |  6.1 |  7.7 |   449,708 |
+| cs/sanofi-bonus-agreement.txt                |  1,740 |       3.6 |  3.4 |  4.7 |   478,430 |
+| cs/vinci-donation-agreement.txt              |  4,607 |       6.3 |  5.6 |  8.2 |   728,219 |
+| de/geschaeftsfuehrer-dienstvertrag.txt       |  1,912 |       4.7 |  4.2 |  9.9 |   405,789 |
+| en/gt-biopharma-employment-amendment.txt     |  4,806 |       5.1 |  4.8 |  8.7 |   942,646 |
+| en/healthcare-trust-employment-amendment.txt |  8,627 |      10.6 | 10.0 | 18.7 |   810,201 |
+| en/pra-group-employment-agreement.txt        | 48,324 |      33.1 | 30.9 | 46.3 | 1,461,483 |
+| en/software-license-agreement.txt            |  2,317 |       5.8 |  5.3 | 14.7 |   402,334 |
+
+## Quality vs. reference annotations
+
+The reference annotations derive from reviewed pipeline output, so the anonymize score against them is close to perfect by construction; it is a regression signal, not an accuracy claim. Cross-tool rows on the same corpus are the meaningful comparison.
+
+### anonymize
+
+13 documents, 332 reference entities.
+
+#### exact match
+
+| Label                     | Gold | Precision | Recall |     F1 |
+| ------------------------- | ---: | --------: | -----: | -----: |
+| address                   |   55 |    100.0% | 100.0% | 100.0% |
+| bank account number       |    4 |    100.0% | 100.0% | 100.0% |
+| country                   |   11 |    100.0% | 100.0% | 100.0% |
+| date                      |   52 |    100.0% | 100.0% | 100.0% |
+| date of birth             |    2 |    100.0% | 100.0% | 100.0% |
+| email address             |    4 |    100.0% | 100.0% | 100.0% |
+| iban                      |    1 |    100.0% | 100.0% | 100.0% |
+| monetary amount           |   54 |    100.0% | 100.0% | 100.0% |
+| organization              |   56 |    100.0% | 100.0% | 100.0% |
+| person                    |   48 |    100.0% | 100.0% | 100.0% |
+| phone number              |    3 |    100.0% | 100.0% | 100.0% |
+| registration number       |   27 |    100.0% | 100.0% | 100.0% |
+| tax identification number |   15 |    100.0% | 100.0% | 100.0% |
+| **all (micro)**           |  332 |    100.0% | 100.0% | 100.0% |
+
+| Language | Gold | Precision | Recall |     F1 |
+| -------- | ---: | --------: | -----: | -----: |
+| cs       |  207 |    100.0% | 100.0% | 100.0% |
+| de       |   24 |    100.0% | 100.0% | 100.0% |
+| en       |  101 |    100.0% | 100.0% | 100.0% |
+
+#### overlap match
+
+| Label                     | Gold | Precision | Recall |     F1 |
+| ------------------------- | ---: | --------: | -----: | -----: |
+| address                   |   55 |    100.0% | 100.0% | 100.0% |
+| bank account number       |    4 |    100.0% | 100.0% | 100.0% |
+| country                   |   11 |    100.0% | 100.0% | 100.0% |
+| date                      |   52 |    100.0% | 100.0% | 100.0% |
+| date of birth             |    2 |    100.0% | 100.0% | 100.0% |
+| email address             |    4 |    100.0% | 100.0% | 100.0% |
+| iban                      |    1 |    100.0% | 100.0% | 100.0% |
+| monetary amount           |   54 |    100.0% | 100.0% | 100.0% |
+| organization              |   56 |    100.0% | 100.0% | 100.0% |
+| person                    |   48 |    100.0% | 100.0% | 100.0% |
+| phone number              |    3 |    100.0% | 100.0% | 100.0% |
+| registration number       |   27 |    100.0% | 100.0% | 100.0% |
+| tax identification number |   15 |    100.0% | 100.0% | 100.0% |
+| **all (micro)**           |  332 |    100.0% | 100.0% | 100.0% |
+
+| Language | Gold | Precision | Recall |     F1 |
+| -------- | ---: | --------: | -----: | -----: |
+| cs       |  207 |    100.0% | 100.0% | 100.0% |
+| de       |   24 |    100.0% | 100.0% | 100.0% |
+| en       |  101 |    100.0% | 100.0% | 100.0% |