diff --git a/.github/tools/check-packlist.mjs b/.github/tools/check-packlist.mjs
index 573f6d18..3e039419 100644
--- a/.github/tools/check-packlist.mjs
+++ b/.github/tools/check-packlist.mjs
@@ -7,6 +7,9 @@ const PACKAGES = [
     expected: [
       "dist/index.d.mts",
       "dist/index.mjs",
+      // Dynamically imported corpus chunk; missing means the
+      // bundler stopped resolving the non-Western name imports.
+      "dist/names-nw-in.mjs",
       "README.md",
       "LICENSE",
       "package.json",
@@ -41,6 +44,7 @@ const PACKAGES = [
     expected: [
       "dist/wasm.d.mts",
       "dist/wasm.mjs",
+      "dist/names-nw-in.mjs",
       "dist/vite.d.mts",
       "dist/vite.mjs",
       "README.md",
diff --git a/bun.lock b/bun.lock
index af9f858e..c4d022c5 100644
--- a/bun.lock
+++ b/bun.lock
@@ -54,6 +54,19 @@
         "vite",
       ],
     },
+    "packages/bench": {
+      "name": "@stll/anonymize-bench",
+      "version": "0.0.0",
+      "dependencies": {
+        "@stll/anonymize": "workspace:*",
+        "@stll/anonymize-data": "workspace:*",
+      },
+      "devDependencies": {
+        "@types/node": "^25.9.2",
+        "bun-types": "^1.3.14",
+        "typescript": "^6.0.3",
+      },
+    },
     "packages/cli": {
       "name": "@stll/anonymize-cli",
       "version": "1.4.10",
@@ -253,6 +266,8 @@
 
     "@stll/anonymize": ["@stll/anonymize@workspace:packages/anonymize"],
 
+    "@stll/anonymize-bench": ["@stll/anonymize-bench@workspace:packages/bench"],
+
     "@stll/anonymize-cli": ["@stll/anonymize-cli@workspace:packages/cli"],
 
     "@stll/anonymize-data": ["@stll/anonymize-data@workspace:packages/data"],
diff --git a/packages/anonymize/src/detectors/names.ts b/packages/anonymize/src/detectors/names.ts
index 146912de..5732b711 100644
--- a/packages/anonymize/src/detectors/names.ts
+++ b/packages/anonymize/src/detectors/names.ts
@@ -49,6 +49,28 @@ const NONWESTERN_LOCALE_KEYS = [
   "id",
 ] as const;
 
+type NonWesternNamesModule = Promise<{ default: { names: string[] } }>;
+
+// Literal import specifiers so the bundler resolves each corpus
+// file into the build output; a template-literal specifier survives
+// bundling as a runtime-relative path that does not exist in dist.
+const NONWESTERN_NAME_IMPORTS: Record<
+  (typeof NONWESTERN_LOCALE_KEYS)[number],
+  () => NonWesternNamesModule
+> = {
+  in: () => import("../data/names-nw-in.json") as NonWesternNamesModule,
+  ar: () => import("../data/names-nw-ar.json") as NonWesternNamesModule,
+  "ja-latn": () =>
+    import("../data/names-nw-ja-latn.json") as NonWesternNamesModule,
+  ko: () => import("../data/names-nw-ko.json") as NonWesternNamesModule,
+  "zh-latn": () =>
+    import("../data/names-nw-zh-latn.json") as NonWesternNamesModule,
+  th: () => import("../data/names-nw-th.json") as NonWesternNamesModule,
+  vi: () => import("../data/names-nw-vi.json") as NonWesternNamesModule,
+  fil: () => import("../data/names-nw-fil.json") as NonWesternNamesModule,
+  id: () => import("../data/names-nw-id.json") as NonWesternNamesModule,
+};
+
 const normalizeCorpusLanguage = (language: string): string =>
   language.toLowerCase();
 
@@ -219,12 +241,7 @@ export const initNameCorpus = (
       const nwLocaleKeys = getScopedNonWesternLocaleKeys(languages);
       const [nwNameMods, nwExcludedMod] = await Promise.all([
         Promise.all(
-          nwLocaleKeys.map(
-            (locale) =>
-              import(`../data/names-nw-${locale}.json`) as Promise<{
-                default: { names: string[] };
-              }>,
-          ),
+          nwLocaleKeys.map((locale) => NONWESTERN_NAME_IMPORTS[locale]()),
         ),
         import("../data/names-nw-excluded-allcaps.json") as Promise<{
           default: { words: string[] };
diff --git a/packages/bench/README.md b/packages/bench/README.md
new file mode 100644
index 00000000..9adaf483
--- /dev/null
+++ b/packages/bench/README.md
@@ -0,0 +1,108 @@
+# @stll/anonymize-bench
+
+Reproducible quality and throughput benchmarks for `@stll/anonymize`.
+Private workspace package; nothing here is published to npm.
+
+## Running
+
+```sh
+bun install
+bun run build            # bench imports the built @stll/anonymize dist
+cd packages/bench
+bun run bench            # quality + throughput + render results/RESULTS.md
+```
+
+Individual steps: `bun run bench:quality`, `bun run bench:throughput`
+(`--iterations N --warmup N`), `bun run bench:render`. Results land in
+`results/` as JSON plus a rendered `results/RESULTS.md`.
+
+## Corpus
+
+The corpus is the contract fixture set in
+`packages/anonymize/src/__test__/fixtures/contracts/` (Czech, German,
+and English legal contracts; public or synthetic documents, several
+sourced from SEC EDGAR filings). The same fixtures gate releases via
+the regression suite, so the benchmark always describes the pipeline
+that actually ships.
+
+All measurements run the deterministic layers only (`enableNer: false`):
+regex, trigger phrases, legal forms, name corpus, deny lists, confidence
+boost, coreference, hotword rules, and zone classification, with the
+full published dictionary set from `@stll/anonymize-data` loaded the
+way a production consumer loads it.
+
+## Reference annotations, and what they can tell you
+
+Quality is scored against the `.snapshot.json` sidecars next to each
+fixture. These are produced by the pipeline itself and then
+human-reviewed: every change to them is diffed in PRs, and
+`contract-snapshots.test.ts` plus `contract-quality.test.ts` pin
+specific true positives and false positives that reviewers have
+verified by hand.
+
+Because the reference derives from reviewed pipeline output, the
+pipeline's own score against it is close to perfect **by
+construction**. That number is a drift detector, not proof of
+accuracy. The honest uses of this harness are:
+
+- **Cross-tool comparison.** Other tools' outputs (see interchange
+  format below) are scored against the same reference with the same
+  scorer; relative differences on identical documents are meaningful
+  even when the reference has our bias. Comparisons should be read
+  per label, restricted to labels both tools claim to detect
+  (`--labels person,organization,...`).
+- **Per-label and per-language coverage tracking** across releases.
+- **Throughput**, which does not depend on the reference at all.
+
+Independent third-party corpora are a planned extension; numbers on
+this corpus alone should not be quoted as absolute accuracy claims.
+
+## Scoring
+
+Span-level, per label, one-to-one matching:
+
+- **exact**: label, start, and end must all match.
+- **overlap**: label must match and spans must share at least one
+  character; gold spans claim the unmatched prediction with the
+  largest overlap. For anonymization a partial hit still redacts part
+  of the value, but exact mode is the honest headline metric.
+
+Precision, recall, and F1 are reported per label, per language, and
+micro-averaged. Offsets are UTF-16 code units; fixture text is
+CRLF-normalized to match the regression suite.
+
+## Comparing another tool
+
+Run the tool over the same fixture files and write a predictions file:
+
+```json
+{
+  "tool": "some-tool",
+  "docs": [
+    {
+      "id": "en/software-license-agreement.txt",
+      "entities": [{ "start": 100, "end": 117, "label": "date" }]
+    }
+  ]
+}
+```
+
+Labels must be mapped to the canonical `@stll/anonymize` labels
+(`person`, `organization`, `address`, `date`, ...) by the adapter
+producing the file. Then:
+
+```sh
+bun run bench:quality -- --predictions path/to/predictions.json \
+  --labels "person,organization,email address,phone number,date"
+bun run bench:render
+```
+
+## Throughput methodology
+
+One-time costs (dictionary load, search automaton preparation) are
+measured separately from steady-state latency. The corpus is run for
+`--warmup` full passes (default 2), then for `--iterations` measured
+passes (default 10); per-document medians and corpus chars/second are
+reported together with the Bun version and CPU model. Numbers in
+committed results come from a developer laptop; treat them as
+order-of-magnitude, and re-run locally for decisions.
diff --git a/packages/bench/package.json b/packages/bench/package.json
new file mode 100644
index 00000000..809f8904
--- /dev/null
+++ b/packages/bench/package.json
@@ -0,0 +1,26 @@
+{
+  "name": "@stll/anonymize-bench",
+  "version": "0.0.0",
+  "private": true,
+  "description": "Reproducible quality and throughput benchmarks for @stll/anonymize",
+  "type": "module",
+  "license": "MIT",
+  "scripts": {
+    "bench": "bun run bench:quality && bun run bench:throughput && bun run bench:render",
+    "bench:quality": "bun src/run-quality.ts",
+    "bench:throughput": "bun src/run-throughput.ts",
+    "bench:render": "bun src/render-results.ts",
+    "typecheck": "tsc --noEmit -p tsconfig.json",
+    "test": "bun test",
+    "format": "oxfmt ."
+  },
+  "dependencies": {
+    "@stll/anonymize": "workspace:*",
+    "@stll/anonymize-data": "workspace:*"
+  },
+  "devDependencies": {
+    "@types/node": "^25.9.2",
+    "bun-types": "^1.3.14",
+    "typescript": "^6.0.3"
+  }
+}
diff --git a/packages/bench/results/RESULTS.md b/packages/bench/results/RESULTS.md
new file mode 100644
index 00000000..96dcc5ec
--- /dev/null
+++ b/packages/bench/results/RESULTS.md
@@ -0,0 +1,85 @@
+# Benchmark results
+
+Generated by `bun run bench` in `packages/bench`; see README.md for methodology, including how the reference annotations are produced and what they can and cannot tell you.
+
+## Throughput
+
+Environment: Bun 1.3.14, Apple M3 (darwin/arm64). 2 warmup + 10 measured passes; medians reported.
+
+One-time costs: dictionary load 202 ms, search preparation 777 ms.
+
+Corpus: 13 documents, 146,106 chars; median full pass 185.5 ms (787,480 chars/s).
+
+| Document                                     |  Chars | Median ms |  Min |  Max |   Chars/s |
+| -------------------------------------------- | -----: | --------: | ---: | ---: | --------: |
+| cs/asset-transfer-court-declensions.txt      |  1,517 |       4.1 |  3.5 |  7.0 |   371,745 |
+| cs/database-cz-service-contract.txt          |  7,924 |       9.9 |  8.6 | 13.1 |   801,766 |
+| cs/eagles-rental-agreement.txt               |  6,970 |       9.0 |  8.1 | 11.1 |   776,587 |
+| cs/nakit-legal-services-framework.txt        | 45,767 |      62.2 | 59.6 | 71.9 |   735,535 |
+| cs/patrik-nguyen-used-vehicle-sale.txt       |  8,391 |      15.5 | 14.6 | 18.5 |   541,236 |
+| cs/probo-frame-purchase-contract.txt         |  3,204 |       7.1 |  6.1 |  7.7 |   449,708 |
+| cs/sanofi-bonus-agreement.txt                |  1,740 |       3.6 |  3.4 |  4.7 |   478,430 |
+| cs/vinci-donation-agreement.txt              |  4,607 |       6.3 |  5.6 |  8.2 |   728,219 |
+| de/geschaeftsfuehrer-dienstvertrag.txt       |  1,912 |       4.7 |  4.2 |  9.9 |   405,789 |
+| en/gt-biopharma-employment-amendment.txt     |  4,806 |       5.1 |  4.8 |  8.7 |   942,646 |
+| en/healthcare-trust-employment-amendment.txt |  8,627 |      10.6 | 10.0 | 18.7 |   810,201 |
+| en/pra-group-employment-agreement.txt        | 48,324 |      33.1 | 30.9 | 46.3 | 1,461,483 |
+| en/software-license-agreement.txt            |  2,317 |       5.8 |  5.3 | 14.7 |   402,334 |
+
+## Quality vs. reference annotations
+
+The reference annotations derive from reviewed pipeline output, so the anonymize score against them is close to perfect by construction; it is a regression signal, not an accuracy claim. Cross-tool rows on the same corpus are the meaningful comparison.
+
+### anonymize
+
+13 documents, 332 reference entities.
+
+#### exact match
+
+| Label                     | Gold | Precision | Recall |     F1 |
+| ------------------------- | ---: | --------: | -----: | -----: |
+| address                   |   55 |    100.0% | 100.0% | 100.0% |
+| bank account number       |    4 |    100.0% | 100.0% | 100.0% |
+| country                   |   11 |    100.0% | 100.0% | 100.0% |
+| date                      |   52 |    100.0% | 100.0% | 100.0% |
+| date of birth             |    2 |    100.0% | 100.0% | 100.0% |
+| email address             |    4 |    100.0% | 100.0% | 100.0% |
+| iban                      |    1 |    100.0% | 100.0% | 100.0% |
+| monetary amount           |   54 |    100.0% | 100.0% | 100.0% |
+| organization              |   56 |    100.0% | 100.0% | 100.0% |
+| person                    |   48 |    100.0% | 100.0% | 100.0% |
+| phone number              |    3 |    100.0% | 100.0% | 100.0% |
+| registration number       |   27 |    100.0% | 100.0% | 100.0% |
+| tax identification number |   15 |    100.0% | 100.0% | 100.0% |
+| **all (micro)**           |  332 |    100.0% | 100.0% | 100.0% |
+
+| Language | Gold | Precision | Recall |     F1 |
+| -------- | ---: | --------: | -----: | -----: |
+| cs       |  207 |    100.0% | 100.0% | 100.0% |
+| de       |   24 |    100.0% | 100.0% | 100.0% |
+| en       |  101 |    100.0% | 100.0% | 100.0% |
+
+#### overlap match
+
+| Label                     | Gold | Precision | Recall |     F1 |
+| ------------------------- | ---: | --------: | -----: | -----: |
+| address                   |   55 |    100.0% | 100.0% | 100.0% |
+| bank account number       |    4 |    100.0% | 100.0% | 100.0% |
+| country                   |   11 |    100.0% | 100.0% | 100.0% |
+| date                      |   52 |    100.0% | 100.0% | 100.0% |
+| date of birth             |    2 |    100.0% | 100.0% | 100.0% |
+| email address             |    4 |    100.0% | 100.0% | 100.0% |
+| iban                      |    1 |    100.0% | 100.0% | 100.0% |
+| monetary amount           |   54 |    100.0% | 100.0% | 100.0% |
+| organization              |   56 |    100.0% | 100.0% | 100.0% |
+| person                    |   48 |    100.0% | 100.0% | 100.0% |
+| phone number              |    3 |    100.0% | 100.0% | 100.0% |
+| registration number       |   27 |    100.0% | 100.0% | 100.0% |
+| tax identification number |   15 |    100.0% | 100.0% | 100.0% |
+| **all (micro)**           |  332 |    100.0% | 100.0% | 100.0% |
+
+| Language | Gold | Precision | Recall |     F1 |
+| -------- | ---: | --------: | -----: | -----: |
+| cs       |  207 |    100.0% | 100.0% | 100.0% |
+| de       |   24 |    100.0% | 100.0% | 100.0% |
+| en       |  101 |    100.0% | 100.0% | 100.0% |
diff --git a/packages/bench/results/quality.anonymize.json b/packages/bench/results/quality.anonymize.json
new file mode 100644
index 00000000..0821b6ed
--- /dev/null
+++ b/packages/bench/results/quality.anonymize.json
@@ -0,0 +1,334 @@
+{
+  "tool": "anonymize",
+  "generatedAt": "2026-06-12T11:26:03.458Z",
+  "corpus": {
+    "docs": 13,
+    "docsPerLanguage": {
+      "cs": 8,
+      "de": 1,
+      "en": 4
+    },
+    "goldEntities": 332
+  },
+  "labelsFilter": null,
+  "modes": {
+    "exact": {
+      "micro": {
+        "truePositives": 332,
+        "falsePositives": 0,
+        "falseNegatives": 0,
+        "goldCount": 332,
+        "precision": 1,
+        "recall": 1,
+        "f1": 1
+      },
+      "perLabel": {
+        "address": {
+          "truePositives": 55,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 55,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "bank account number": {
+          "truePositives": 4,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 4,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "country": {
+          "truePositives": 11,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 11,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "date": {
+          "truePositives": 52,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 52,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "date of birth": {
+          "truePositives": 2,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 2,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "email address": {
+          "truePositives": 4,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 4,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "iban": {
+          "truePositives": 1,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 1,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "monetary amount": {
+          "truePositives": 54,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 54,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "organization": {
+          "truePositives": 56,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 56,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "person": {
+          "truePositives": 48,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 48,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "phone number": {
+          "truePositives": 3,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 3,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "registration number": {
+          "truePositives": 27,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 27,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "tax identification number": {
+          "truePositives": 15,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 15,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        }
+      },
+      "perLanguage": {
+        "cs": {
+          "truePositives": 207,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 207,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "de": {
+          "truePositives": 24,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 24,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "en": {
+          "truePositives": 101,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 101,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        }
+      }
+    },
+    "overlap": {
+      "micro": {
+        "truePositives": 332,
+        "falsePositives": 0,
+        "falseNegatives": 0,
+        "goldCount": 332,
+        "precision": 1,
+        "recall": 1,
+        "f1": 1
+      },
+      "perLabel": {
+        "address": {
+          "truePositives": 55,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 55,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "bank account number": {
+          "truePositives": 4,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 4,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "country": {
+          "truePositives": 11,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 11,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "date": {
+          "truePositives": 52,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 52,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "date of birth": {
+          "truePositives": 2,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 2,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "email address": {
+          "truePositives": 4,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 4,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "iban": {
+          "truePositives": 1,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 1,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "monetary amount": {
+          "truePositives": 54,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 54,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "organization": {
+          "truePositives": 56,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 56,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "person": {
+          "truePositives": 48,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 48,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "phone number": {
+          "truePositives": 3,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 3,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "registration number": {
+          "truePositives": 27,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 27,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "tax identification number": {
+          "truePositives": 15,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 15,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        }
+      },
+      "perLanguage": {
+        "cs": {
+          "truePositives": 207,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 207,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "de": {
+          "truePositives": 24,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 24,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        },
+        "en": {
+          "truePositives": 101,
+          "falsePositives": 0,
+          "falseNegatives": 0,
+          "goldCount": 101,
+          "precision": 1,
+          "recall": 1,
+          "f1": 1
+        }
+      }
+    }
+  }
+}
diff --git a/packages/bench/results/throughput.json b/packages/bench/results/throughput.json
new file mode 100644
index 00000000..54dd1af3
--- /dev/null
+++ b/packages/bench/results/throughput.json
@@ -0,0 +1,142 @@
+{
+  "generatedAt": "2026-06-12T11:27:49.606Z",
+  "environment": {
+    "bun": "1.3.14",
+    "platform": "darwin",
+    "arch": "arm64",
+    "cpu": "Apple M3"
+  },
+  "settings": {
+    "iterations": 10,
+    "warmup": 2
+  },
+  "oneTime": {
+    "dictionaryLoadMs": 202.265,
+    "prepareMs": 777.44
+  },
+  "corpus": {
+    "docs": 13,
+    "totalChars": 146106,
+    "medianPassMs": 185.536,
+    "charsPerSecond": 787480
+  },
+  "documents": [
+    {
+      "id": "cs/asset-transfer-court-declensions.txt",
+      "language": "cs",
+      "chars": 1517,
+      "medianMs": 4.081,
+      "minMs": 3.478,
+      "maxMs": 6.955,
+      "charsPerSecond": 371745
+    },
+    {
+      "id": "cs/database-cz-service-contract.txt",
+      "language": "cs",
+      "chars": 7924,
+      "medianMs": 9.883,
+      "minMs": 8.588,
+      "maxMs": 13.129,
+      "charsPerSecond": 801766
+    },
+    {
+      "id": "cs/eagles-rental-agreement.txt",
+      "language": "cs",
+      "chars": 6970,
+      "medianMs": 8.975,
+      "minMs": 8.094,
+      "maxMs": 11.058,
+      "charsPerSecond": 776587
+    },
+    {
+      "id": "cs/nakit-legal-services-framework.txt",
+      "language": "cs",
+      "chars": 45767,
+      "medianMs": 62.223,
+      "minMs": 59.606,
+      "maxMs": 71.866,
+      "charsPerSecond": 735535
+    },
+    {
+      "id": "cs/patrik-nguyen-used-vehicle-sale.txt",
+      "language": "cs",
+      "chars": 8391,
+      "medianMs": 15.503,
+      "minMs": 14.645,
+      "maxMs": 18.458,
+      "charsPerSecond": 541236
+    },
+    {
+      "id": "cs/probo-frame-purchase-contract.txt",
+      "language": "cs",
+      "chars": 3204,
+      "medianMs": 7.125,
+      "minMs": 6.098,
+      "maxMs": 7.661,
+      "charsPerSecond": 449708
+    },
+    {
+      "id": "cs/sanofi-bonus-agreement.txt",
+      "language": "cs",
+      "chars": 1740,
+      "medianMs": 3.637,
+      "minMs": 3.358,
+      "maxMs": 4.733,
+      "charsPerSecond": 478430
+    },
+    {
+      "id": "cs/vinci-donation-agreement.txt",
+      "language": "cs",
+      "chars": 4607,
+      "medianMs": 6.326,
+      "minMs": 5.614,
+      "maxMs": 8.226,
+      "charsPerSecond": 728219
+    },
+    {
+      "id": "de/geschaeftsfuehrer-dienstvertrag.txt",
+      "language": "de",
+      "chars": 1912,
+      "medianMs": 4.712,
+      "minMs": 4.231,
+      "maxMs": 9.883,
+      "charsPerSecond": 405789
+    },
+    {
+      "id": "en/gt-biopharma-employment-amendment.txt",
+      "language": "en",
+      "chars": 4806,
+      "medianMs": 5.098,
+      "minMs": 4.768,
+      "maxMs": 8.735,
+      "charsPerSecond": 942646
+    },
+    {
+      "id": "en/healthcare-trust-employment-amendment.txt",
+      "language": "en",
+      "chars": 8627,
+      "medianMs": 10.648,
+      "minMs": 9.966,
+      "maxMs": 18.707,
+      "charsPerSecond": 810201
+    },
+    {
+      "id": "en/pra-group-employment-agreement.txt",
+      "language": "en",
+      "chars": 48324,
+      "medianMs": 33.065,
+      "minMs": 30.881,
+      "maxMs": 46.263,
+      "charsPerSecond": 1461483
+    },
+    {
+      "id": "en/software-license-agreement.txt",
+      "language": "en",
+      "chars": 2317,
+      "medianMs": 5.759,
+      "minMs": 5.263,
+      "maxMs": 14.676,
+      "charsPerSecond": 402334
+    }
+  ]
+}
diff --git a/packages/bench/src/__test__/scorer.test.ts b/packages/bench/src/__test__/scorer.test.ts
new file mode 100644
index 00000000..e22a0826
--- /dev/null
+++ b/packages/bench/src/__test__/scorer.test.ts
@@ -0,0 +1,144 @@
+import { describe, expect, test } from "bun:test";
+
+import {
+  type LabelCounts,
+  mergeCounts,
+  microCounts,
+  scoreDocument,
+  toMetrics,
+} from "../scorer";
+import type { BenchSpan } from "../types";
+
+const span = (start: number, end: number, label: string): BenchSpan => ({
+  start,
+  end,
+  label,
+});
+
+const counts = (
+  result: Map<string, LabelCounts>,
+  label: string,
+): LabelCounts => {
+  const labelCounts = result.get(label);
+  if (!labelCounts) throw new Error(`no counts for label ${label}`);
+  return labelCounts;
+};
+
+describe("scoreDocument", () => {
+  test("exact mode requires identical bounds", () => {
+    const gold = [span(0, 10, "person")];
+    const shifted = [span(1, 10, "person")];
+    const exact = scoreDocument({ gold, predicted: shifted, mode: "exact" });
+    expect(counts(exact, "person")).toEqual({
+      truePositives: 0,
+      falsePositives: 1,
+      falseNegatives: 1,
+    });
+    const overlap = scoreDocument({
+      gold,
+      predicted: shifted,
+      mode: "overlap",
+    });
+    expect(counts(overlap, "person")).toEqual({
+      truePositives: 1,
+      falsePositives: 0,
+      falseNegatives: 0,
+    });
+  });
+
+  test("label mismatch never matches even with identical bounds", () => {
+    const result = scoreDocument({
+      gold: [span(0, 5, "person")],
+      predicted: [span(0, 5, "organization")],
+      mode: "overlap",
+    });
+    expect(counts(result, "person").falseNegatives).toBe(1);
+    expect(counts(result, "organization").falsePositives).toBe(1);
+  });
+
+  test("adjacent spans do not overlap (end is exclusive)", () => {
+    const result = scoreDocument({
+      gold: [span(0, 5, "person")],
+      predicted: [span(5, 9, "person")],
+      mode: "overlap",
+    });
+    expect(counts(result, "person").truePositives).toBe(0);
+  });
+
+  test("one gold span absorbs at most one of several predictions", () => {
+    const result = scoreDocument({
+      gold: [span(0, 10, "person")],
+      predicted: [span(0, 4, "person"), span(2, 10, "person")],
+      mode: "overlap",
+    });
+    expect(counts(result, "person")).toEqual({
+      truePositives: 1,
+      falsePositives: 1,
+      falseNegatives: 0,
+    });
+  });
+
+  test("largest overlap wins when several predictions compete", () => {
+    const gold = [span(0, 10, "person"), span(20, 30, "person")];
+    const predicted = [span(8, 25, "person"), span(0, 9, "person")];
+    const result = scoreDocument({ gold, predicted, mode: "overlap" });
+    // First gold takes the 9-char overlap (0..9); second takes 8..25.
+    expect(counts(result, "person")).toEqual({
+      truePositives: 2,
+      falsePositives: 0,
+      falseNegatives: 0,
+    });
+  });
+
+  test("labels filter drops both gold and predictions", () => {
+    const result = scoreDocument({
+      gold: [span(0, 5, "person"), span(10, 15, "date")],
+      predicted: [span(10, 15, "date"), span(20, 25, "organization")],
+      mode: "exact",
+      labels: ["date"],
+    });
+    expect([...result.keys()]).toEqual(["date"]);
+    expect(counts(result, "date").truePositives).toBe(1);
+  });
+});
+
+describe("aggregation", () => {
+  test("mergeCounts accumulates and microCounts sums labels", () => {
+    const into = scoreDocument({
+      gold: [span(0, 5, "person")],
+      predicted: [span(0, 5, "person")],
+      mode: "exact",
+    });
+    const from = scoreDocument({
+      gold: [span(0, 5, "person"), span(8, 12, "date")],
+      predicted: [span(1, 5, "person")],
+      mode: "exact",
+    });
+    mergeCounts(into, from);
+    expect(counts(into, "person")).toEqual({
+      truePositives: 1,
+      falsePositives: 1,
+      falseNegatives: 1,
+    });
+    expect(microCounts(into)).toEqual({
+      truePositives: 1,
+      falsePositives: 1,
+      falseNegatives: 2,
+    });
+  });
+
+  test("toMetrics handles empty sides without dividing by zero", () => {
+    expect(
+      toMetrics({ truePositives: 0, falsePositives: 0, falseNegatives: 0 }),
+    ).toMatchObject({ precision: 0, recall: 0, f1: 0 });
+    const metrics = toMetrics({
+      truePositives: 3,
+      falsePositives: 1,
+      falseNegatives: 1,
+    });
+    expect(metrics.precision).toBeCloseTo(0.75);
+    expect(metrics.recall).toBeCloseTo(0.75);
+    expect(metrics.f1).toBeCloseTo(0.75);
+    expect(metrics.goldCount).toBe(4);
+  });
+});
diff --git a/packages/bench/src/adapters/anonymize.ts b/packages/bench/src/adapters/anonymize.ts
new file mode 100644
index 00000000..ef38118c
--- /dev/null
+++ b/packages/bench/src/adapters/anonymize.ts
@@ -0,0 +1,57 @@
+import {
+  createPipelineContext,
+  DEFAULT_ENTITY_LABELS,
+  runPipeline,
+  type PipelineConfig,
+} from "@stll/anonymize";
+
+import { loadBenchDictionaries } from "../dictionaries";
+import type { GoldDocument, PredictionsFile } from "../types";
+
+/**
+ * Deterministic layers only (NER off): identical to the config the
+ * regression snapshots are generated with, so quality numbers and
+ * throughput numbers describe the same pipeline.
+ *
+ * Used as the base config by both runAnonymizeAdapter and
+ * run-throughput.ts, which each add `dictionaries` before running.
+ */
+export const BENCH_PIPELINE_CONFIG: PipelineConfig = {
+  threshold: 0.3,
+  enableTriggerPhrases: true,
+  enableRegex: true,
+  enableLegalForms: true,
+  enableNameCorpus: true,
+  enableDenyList: true,
+  // Gazetteer is off as well; callers pass an empty gazetteerEntries list.
+  enableGazetteer: false,
+  enableNer: false,
+  enableConfidenceBoost: true,
+  enableCoreference: true,
+  enableHotwordRules: true,
+  enableZoneClassification: true,
+  labels: [...DEFAULT_ENTITY_LABELS],
+  workspaceId: "bench",
+};
+
+/**
+ * Runs the bench pipeline over every gold document, in corpus order, and
+ * returns the detected spans in PredictionsFile shape (tool "anonymize").
+ */
+export const runAnonymizeAdapter = async (
+  docs: GoldDocument[],
+): Promise<PredictionsFile> => {
+  const config: PipelineConfig = {
+    ...BENCH_PIPELINE_CONFIG,
+    dictionaries: await loadBenchDictionaries(),
+  };
+  const context = createPipelineContext();
+  const scored: PredictionsFile["docs"] = [];
+  // Documents are awaited one at a time; every run shares `context`.
+  for (const { id, text } of docs) {
+    const detected = await runPipeline({
+      fullText: text,
+      config,
+      gazetteerEntries: [],
+      context,
+    });
+    // Keep only the three fields a BenchSpan carries.
+    const entities = detected.map(({ start, end, label }) => {
+      return { start, end, label };
+    });
+    scored.push({ id, entities });
+  }
+  return { tool: "anonymize", docs: scored };
+};
diff --git a/packages/bench/src/dictionaries.ts b/packages/bench/src/dictionaries.ts
new file mode 100644
index 00000000..0ae23425
--- /dev/null
+++ b/packages/bench/src/dictionaries.ts
@@ -0,0 +1,136 @@
+/**
+ * Loads the full published dictionary set from @stll/anonymize-data
+ * the way a production consumer would. Mirrors the corpus used by
+ * the anonymize regression suite (see
+ * packages/anonymize/src/__test__/load-dictionaries.ts) so bench
+ * results stay comparable with the committed snapshots; keep the
+ * language and country lists in sync.
+ */
+import type { Dictionaries, DictionaryMeta } from "@stll/anonymize";
+
+/** Memoised result of loadBenchDictionaries; null until a load has completed. */
+let cached: Dictionaries | null = null;
+
+/**
+ * Languages whose first-name and surname lists are imported. A language
+ * whose list cannot be imported for a given kind is simply left out.
+ */
+const NAME_LANGUAGES = [
+  "cs",
+  "sk",
+  "de",
+  "pl",
+  "hu",
+  "ro",
+  "fr",
+  "es",
+  "it",
+  "en",
+  "sv",
+] as const;
+
+/**
+ * Country codes passed to loadCityDictionary. Each code gets its own
+ * citiesByCountry entry and contributes to the merged `cities` list.
+ */
+const CITY_COUNTRIES = [
+  "AT",
+  "AU",
+  "BE",
+  "BG",
+  "BR",
+  "CA",
+  "CH",
+  "CZ",
+  "DE",
+  "DK",
+  "ES",
+  "FI",
+  "FR",
+  "GB",
+  "GR",
+  "HR",
+  "HU",
+  "IE",
+  "IT",
+  "LU",
+  "NL",
+  "NO",
+  "NZ",
+  "PL",
+  "PT",
+  "RO",
+  "SE",
+  "SI",
+  "SK",
+  "US",
+] as const;
+
+/** Shape of a dynamically imported name-list JSON module: the list is its default export. */
+type NameDictionaryModule = {
+  default: readonly string[];
+};
+
+/**
+ * Imports one name list from @stll/anonymize-data. Any failure (for
+ * example no file for `language`) resolves to null instead of rejecting.
+ */
+const loadNameDictionary = (
+  kind: "first" | "surnames",
+  language: string,
+): Promise<readonly string[] | null> =>
+  import(
+    `@stll/anonymize-data/dictionaries/names/${kind}/${language}.json`
+  )
+    .then((mod: NameDictionaryModule) => mod.default)
+    .catch(() => null);
+
+/**
+ * Loads every deny list, name list and city list from
+ * @stll/anonymize-data and memoises the combined Dictionaries object,
+ * so later calls in the same process return the same instance.
+ *
+ * Each group is fetched concurrently but inserted in list order, so the
+ * key order of the result does not depend on which import finishes first.
+ *
+ * @returns The populated dictionaries (cached after the first call).
+ */
+export const loadBenchDictionaries = async (): Promise<Dictionaries> => {
+  if (cached) return cached;
+
+  const dataModule = await import("@stll/anonymize-data");
+
+  const denyList: Record<string, readonly string[]> = {};
+  const denyListMeta: Record<string, DictionaryMeta> = {};
+  const denyListResults = await Promise.all(
+    [...dataModule.ALL_DICTIONARY_IDS].map(async (id) => ({
+      id,
+      entries: await dataModule.loadDictionary(id),
+    })),
+  );
+  for (const { id, entries } of denyListResults) {
+    const meta = dataModule.DICTIONARY_META[id];
+    // Ids without metadata are left out of both maps.
+    if (!meta) continue;
+    denyList[id] = entries;
+    // SAFETY: anonymize-data categories match DenyListCategory at runtime
+    denyListMeta[id] = meta as DictionaryMeta;
+  }
+
+  const nameResults = await Promise.all(
+    NAME_LANGUAGES.map(async (language) => {
+      const [first, last] = await Promise.all([
+        loadNameDictionary("first", language),
+        loadNameDictionary("surnames", language),
+      ]);
+      return { language, first, last };
+    }),
+  );
+  // Fill the maps only after every import has settled. Assigning from
+  // inside the concurrent callbacks would make the key order follow
+  // import completion order rather than NAME_LANGUAGES order.
+  const firstNames: Record<string, readonly string[]> = {};
+  const surnames: Record<string, readonly string[]> = {};
+  for (const { language, first, last } of nameResults) {
+    if (first) firstNames[language] = first;
+    if (last) surnames[language] = last;
+  }
+
+  const cityResults = await Promise.all(
+    CITY_COUNTRIES.map(async (country) => ({
+      country,
+      entries: await dataModule.loadCityDictionary(country),
+    })),
+  );
+  const citiesByCountry: Record<string, readonly string[]> = {};
+  // Flat list of every country's cities, in CITY_COUNTRIES order.
+  const mergedCities: string[] = [];
+  for (const { country, entries } of cityResults) {
+    citiesByCountry[country] = entries;
+    for (const entry of entries) {
+      mergedCities.push(entry);
+    }
+  }
+
+  cached = {
+    firstNames,
+    surnames,
+    denyList,
+    denyListMeta,
+    cities: mergedCities,
+    citiesByCountry,
+  };
+  return cached;
+};
diff --git a/packages/bench/src/fixtures.ts b/packages/bench/src/fixtures.ts
new file mode 100644
index 00000000..016ad730
--- /dev/null
+++ b/packages/bench/src/fixtures.ts
@@ -0,0 +1,58 @@
+import { readdirSync, readFileSync } from "node:fs";
+import { join } from "node:path";
+
+import type { BenchSpan, GoldDocument } from "./types";
+
+/**
+ * The bench corpus lives with the anonymize regression suite so the
+ * same fixtures gate releases and feed the benchmarks. Reference
+ * annotations come from the human-reviewed `.snapshot.json` sidecars
+ * maintained by contract-snapshots.test.ts.
+ *
+ * Resolved relative to this file's directory, i.e.
+ * packages/anonymize/src/__test__/fixtures/contracts.
+ */
+const CONTRACTS_DIR = join(
+  import.meta.dir,
+  "..",
+  "..",
+  "anonymize",
+  "src",
+  "__test__",
+  "fixtures",
+  "contracts",
+);
+
+/** The part of a `.snapshot.json` sidecar that the loader reads. */
+type SnapshotFile = {
+  entities: BenchSpan[];
+};
+
+/**
+ * Loads every contract fixture as a gold document.
+ *
+ * Walks `<CONTRACTS_DIR>/<language>/*.txt` in sorted order, pairs each
+ * text with its `.snapshot.json` sidecar, and keeps only start/end/label
+ * from the sidecar's entities. CRLF line endings in the text are
+ * normalised to LF before use.
+ *
+ * @returns One GoldDocument per `.txt` file, with id `<language>/<file>`.
+ * @throws If a sidecar is missing or is not valid JSON.
+ */
+export const loadGoldDocuments = (): GoldDocument[] => {
+  // Only sub-directories are language folders. Treating a stray file at
+  // the fixture root (README, .DS_Store, ...) as one would make the
+  // nested readdirSync below throw ENOTDIR.
+  const languages = readdirSync(CONTRACTS_DIR, { withFileTypes: true })
+    .filter((entry) => entry.isDirectory())
+    .map((entry) => entry.name)
+    .toSorted();
+
+  const docs: GoldDocument[] = [];
+  for (const language of languages) {
+    const languageDir = join(CONTRACTS_DIR, language);
+    for (const file of readdirSync(languageDir).toSorted()) {
+      if (!file.endsWith(".txt")) continue;
+      const text = readFileSync(join(languageDir, file), "utf8").replaceAll(
+        "\r\n",
+        "\n",
+      );
+      const snapshotPath = join(
+        languageDir,
+        file.replace(/\.txt$/u, ".snapshot.json"),
+      );
+      // SAFETY: sidecars are generated by contract-snapshots.test.ts with this shape
+      const snapshot = JSON.parse(
+        readFileSync(snapshotPath, "utf8"),
+      ) as SnapshotFile;
+      docs.push({
+        id: `${language}/${file}`,
+        language,
+        text,
+        // Drop any extra sidecar fields; the scorer compares these three.
+        gold: snapshot.entities.map(({ start, end, label }) => ({
+          start,
+          end,
+          label,
+        })),
+      });
+    }
+  }
+  return docs;
+};
diff --git a/packages/bench/src/render-results.ts b/packages/bench/src/render-results.ts
new file mode 100644
index 00000000..6d6a270c
--- /dev/null
+++ b/packages/bench/src/render-results.ts
@@ -0,0 +1,168 @@
+/**
+ * Renders results/*.json into results/RESULTS.md. Quality reports
+ * are discovered by the quality.<tool>.json naming convention so
+ * external tools added later show up without changes here.
+ */
+import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+
+/** The `results/` directory beside `src/`: report JSON is read from it and RESULTS.md is written to it. */
+const RESULTS_DIR = join(import.meta.dir, "..", "results");
+
+/**
+ * One metrics row as stored in a quality report: raw counts plus the
+ * derived precision/recall/F1. Same fields as LabelMetrics in scorer.ts.
+ */
+type LabelMetricsJson = {
+  goldCount: number;
+  truePositives: number;
+  falsePositives: number;
+  falseNegatives: number;
+  precision: number;
+  recall: number;
+  f1: number;
+};
+
+/** Metrics for one match mode: the micro total, then one row per label and per language. */
+type ModeReportJson = {
+  micro: LabelMetricsJson;
+  perLabel: Record<string, LabelMetricsJson>;
+  perLanguage: Record<string, LabelMetricsJson>;
+};
+
+/** Shape of a `quality.<tool>.json` file as written by run-quality.ts. */
+type QualityReportJson = {
+  tool: string;
+  generatedAt: string;
+  corpus: {
+    docs: number;
+    docsPerLanguage: Record<string, number>;
+    goldEntities: number;
+  };
+  // null when the run scored every label.
+  labelsFilter: string[] | null;
+  modes: Record<"exact" | "overlap", ModeReportJson>;
+};
+
+/** Shape of `throughput.json` as written by run-throughput.ts. */
+type ThroughputReportJson = {
+  generatedAt: string;
+  environment: { bun: string; platform: string; arch: string; cpu: string };
+  settings: { iterations: number; warmup: number };
+  oneTime: { dictionaryLoadMs: number; prepareMs: number };
+  corpus: {
+    docs: number;
+    totalChars: number;
+    medianPassMs: number;
+    charsPerSecond: number;
+  };
+  // One entry per corpus document, in the order the producer emitted them.
+  documents: {
+    id: string;
+    language: string;
+    chars: number;
+    medianMs: number;
+    minMs: number;
+    maxMs: number;
+    charsPerSecond: number;
+  }[];
+};
+
+/** Formats a 0..1 ratio as a percentage with one decimal, e.g. 0.756 -> "75.6%". */
+const percent = (value: number): string => {
+  const scaled = value * 100;
+  return `${scaled.toFixed(1)}%`;
+};
+
+/** Formats a number with en-US grouping, e.g. 1234567 -> "1,234,567". */
+const integer = (value: number): string => {
+  return value.toLocaleString("en-US");
+};
+
+/** One markdown table row: name, gold count, precision, recall, F1. */
+const metricsRow = (name: string, metrics: LabelMetricsJson): string => {
+  const cells = [
+    name,
+    integer(metrics.goldCount),
+    percent(metrics.precision),
+    percent(metrics.recall),
+    percent(metrics.f1),
+  ];
+  return `| ${cells.join(" | ")} |`;
+};
+
+/**
+ * Renders one tool's quality report as markdown lines: a heading, a
+ * corpus summary, then for each match mode a per-label table (ending
+ * with the micro row) and a per-language table sorted by language.
+ */
+const renderQuality = (report: QualityReportJson): string[] => {
+  const { tool, corpus, labelsFilter, modes } = report;
+  const filterNote = labelsFilter
+    ? ` Scored labels: ${labelsFilter.join(", ")}.`
+    : "";
+  const divider = "| --- | ---: | ---: | ---: | ---: |";
+
+  const sections = (["exact", "overlap"] as const).flatMap((mode) => {
+    const { perLabel, perLanguage, micro } = modes[mode];
+    // Labels keep the order stored in the report; languages are sorted here.
+    const labelRows = Object.entries(perLabel).map(([label, metrics]) =>
+      metricsRow(label, metrics),
+    );
+    const languageRows = Object.entries(perLanguage)
+      .toSorted(([a], [b]) => a.localeCompare(b))
+      .map(([language, metrics]) => metricsRow(language, metrics));
+    return [
+      "",
+      `#### ${mode} match`,
+      "",
+      "| Label | Gold | Precision | Recall | F1 |",
+      divider,
+      ...labelRows,
+      metricsRow("**all (micro)**", micro),
+      "",
+      "| Language | Gold | Precision | Recall | F1 |",
+      divider,
+      ...languageRows,
+    ];
+  });
+
+  return [
+    `### ${tool}`,
+    "",
+    `${corpus.docs} documents, ${integer(corpus.goldEntities)} reference entities.${filterNote}`,
+    ...sections,
+    "",
+  ];
+};
+
+/**
+ * Renders the throughput report as markdown lines: environment and
+ * settings, one-time costs, the corpus summary, then one table row per
+ * document.
+ */
+const renderThroughput = (report: ThroughputReportJson): string[] => {
+  const { environment, settings, oneTime, corpus, documents } = report;
+
+  const documentRows = documents.map((doc) => {
+    const cells = [
+      doc.id,
+      integer(doc.chars),
+      doc.medianMs.toFixed(1),
+      doc.minMs.toFixed(1),
+      doc.maxMs.toFixed(1),
+      integer(doc.charsPerSecond),
+    ];
+    return `| ${cells.join(" | ")} |`;
+  });
+
+  return [
+    "## Throughput",
+    "",
+    `Environment: Bun ${environment.bun}, ${environment.cpu} (${environment.platform}/${environment.arch}). ` +
+      `${settings.warmup} warmup + ${settings.iterations} measured passes; medians reported.`,
+    "",
+    `One-time costs: dictionary load ${oneTime.dictionaryLoadMs.toFixed(0)} ms, search preparation ${oneTime.prepareMs.toFixed(0)} ms.`,
+    "",
+    `Corpus: ${corpus.docs} documents, ${integer(corpus.totalChars)} chars; ` +
+      `median full pass ${corpus.medianPassMs.toFixed(1)} ms (${integer(corpus.charsPerSecond)} chars/s).`,
+    "",
+    "| Document | Chars | Median ms | Min | Max | Chars/s |",
+    "| --- | ---: | ---: | ---: | ---: | ---: |",
+    ...documentRows,
+    "",
+  ];
+};
+
+// Script body: build RESULTS.md from whichever report files exist.
+const lines: string[] = [
+  "# Benchmark results",
+  "",
+  "Generated by `bun run bench` in `packages/bench`; see README.md for methodology, including how the reference annotations are produced and what they can and cannot tell you.",
+  "",
+];
+
+// Throughput section, only when throughput.json is present.
+const throughputPath = join(RESULTS_DIR, "throughput.json");
+if (existsSync(throughputPath)) {
+  const rawThroughput = readFileSync(throughputPath, "utf8");
+  // SAFETY: written by run-throughput.ts with this shape
+  const throughput = JSON.parse(rawThroughput) as ThroughputReportJson;
+  lines.push(...renderThroughput(throughput));
+}
+
+// One quality subsection per quality.<tool>.json, in file-name order.
+const isQualityFile = (file: string): boolean =>
+  file.startsWith("quality.") && file.endsWith(".json");
+const qualityFiles = readdirSync(RESULTS_DIR).filter(isQualityFile).toSorted();
+if (qualityFiles.length > 0) {
+  lines.push(
+    "## Quality vs. reference annotations",
+    "",
+    "The reference annotations derive from reviewed pipeline output, so the anonymize score against them is close to perfect by construction; it is a regression signal, not an accuracy claim. Cross-tool rows on the same corpus are the meaningful comparison.",
+    "",
+  );
+  for (const file of qualityFiles) {
+    const rawReport = readFileSync(join(RESULTS_DIR, file), "utf8");
+    // SAFETY: written by run-quality.ts with this shape
+    const report = JSON.parse(rawReport) as QualityReportJson;
+    lines.push(...renderQuality(report));
+  }
+}
+
+const outPath = join(RESULTS_DIR, "RESULTS.md");
+writeFileSync(outPath, lines.join("\n") + "\n");
+console.log(JSON.stringify({ event: "written", path: outPath }));
diff --git a/packages/bench/src/run-quality.ts b/packages/bench/src/run-quality.ts
new file mode 100644
index 00000000..1f93aef7
--- /dev/null
+++ b/packages/bench/src/run-quality.ts
@@ -0,0 +1,139 @@
+/**
+ * Scores tool predictions against the reference annotations.
+ *
+ * Default run executes the anonymize pipeline in-process. Pass
+ * --predictions <file.json> (PredictionsFile shape) to score an
+ * external tool's output on the same corpus instead.
+ */
+import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
+import { dirname, join } from "node:path";
+import { parseArgs } from "node:util";
+
+import { runAnonymizeAdapter } from "./adapters/anonymize";
+import { loadGoldDocuments } from "./fixtures";
+import {
+  type LabelCounts,
+  type LabelMetrics,
+  type MatchMode,
+  mergeCounts,
+  microCounts,
+  scoreDocument,
+  toMetrics,
+} from "./scorer";
+import type { PredictionsFile } from "./types";
+
+/** Match modes in the order their summary lines are logged at the end of the run. */
+const MATCH_MODES: readonly MatchMode[] = ["exact", "overlap"];
+
+/** Metrics for one match mode: the micro total, then one entry per label and per language. */
+type ModeReport = {
+  micro: LabelMetrics;
+  perLabel: Record<string, LabelMetrics>;
+  perLanguage: Record<string, LabelMetrics>;
+};
+
+/** The JSON document this script writes (default file name: quality.<tool>.json). */
+type QualityReport = {
+  tool: string;
+  generatedAt: string;
+  corpus: {
+    docs: number;
+    docsPerLanguage: Record<string, number>;
+    // Counts every gold span in the corpus, regardless of labelsFilter.
+    goldEntities: number;
+  };
+  // null when --labels was not given.
+  labelsFilter: string[] | null;
+  modes: Record<MatchMode, ModeReport>;
+};
+
+// CLI flags: --predictions <file>, --labels <a,b,...>, --out <file>.
+const { values: args } = parseArgs({
+  options: {
+    predictions: { type: "string" },
+    labels: { type: "string" },
+    out: { type: "string" },
+  },
+});
+
+// --labels is a comma-separated list; undefined means "score every label".
+// Blank items (a trailing comma, "a,,b") are dropped so they cannot act
+// as an empty-string label, and a flag that names no label at all is
+// rejected instead of silently filtering out every span.
+const labelsFilter = args.labels
+  ?.split(",")
+  .map((label) => label.trim())
+  .filter((label) => label.length > 0);
+if (labelsFilter?.length === 0) {
+  throw new Error(`--labels must list at least one label`);
+}
+
+const docs = loadGoldDocuments();
+
+/** Reads an external tool's predictions from disk. */
+const readPredictions = (path: string): PredictionsFile =>
+  // SAFETY: --predictions files are produced by bench adapters with this shape
+  JSON.parse(readFileSync(path, "utf8")) as PredictionsFile;
+
+// With --predictions the file is scored as-is; otherwise the anonymize
+// pipeline runs in-process over the corpus.
+const predictions: PredictionsFile = args.predictions
+  ? readPredictions(args.predictions)
+  : await runAnonymizeAdapter(docs);
+
+// Entities keyed by document id; a repeated id keeps its last entry.
+const predictionsById = new Map(
+  predictions.docs.map(({ id, entities }) => [id, entities]),
+);
+
+// Every corpus document needs predictions; any gap aborts the run.
+const missingIds = docs
+  .map((doc) => doc.id)
+  .filter((id) => !predictionsById.has(id));
+if (missingIds.length > 0) {
+  throw new Error(`predictions missing for: ${missingIds.join(", ")}`);
+}
+
+/**
+ * Scores the whole corpus in one match mode.
+ *
+ * Each document is scored once; its per-label counts are merged into a
+ * corpus-wide tally and into the tally of the document's language.
+ * Per-language metrics are micro totals over that language's labels.
+ */
+const buildModeReport = (mode: MatchMode): ModeReport => {
+  const corpusTally = new Map<string, LabelCounts>();
+  const tallyByLanguage = new Map<string, Map<string, LabelCounts>>();
+
+  for (const { id, language, gold } of docs) {
+    const scored = scoreDocument({
+      gold,
+      predicted: predictionsById.get(id) ?? [],
+      mode,
+      labels: labelsFilter,
+    });
+    mergeCounts(corpusTally, scored);
+
+    let languageTally = tallyByLanguage.get(language);
+    if (languageTally === undefined) {
+      languageTally = new Map<string, LabelCounts>();
+      tallyByLanguage.set(language, languageTally);
+    }
+    mergeCounts(languageTally, scored);
+  }
+
+  // Labels are emitted in sorted order; languages in first-seen order.
+  const perLabel = Object.fromEntries(
+    [...corpusTally.keys()]
+      .toSorted()
+      .flatMap((label): [string, LabelMetrics][] => {
+        const counts = corpusTally.get(label);
+        return counts ? [[label, toMetrics(counts)]] : [];
+      }),
+  );
+  const perLanguage = Object.fromEntries(
+    [...tallyByLanguage].map(([language, tally]): [string, LabelMetrics] => [
+      language,
+      toMetrics(microCounts(tally)),
+    ]),
+  );
+  return { micro: toMetrics(microCounts(corpusTally)), perLabel, perLanguage };
+};
+
+// Number of corpus documents per language, for the report header.
+const docsPerLanguage = docs.reduce<Record<string, number>>(
+  (tally, { language }) => {
+    tally[language] = (tally[language] ?? 0) + 1;
+    return tally;
+  },
+  {},
+);
+
+let goldEntities = 0;
+for (const { gold } of docs) {
+  goldEntities += gold.length;
+}
+
+const report: QualityReport = {
+  tool: predictions.tool,
+  generatedAt: new Date().toISOString(),
+  corpus: { docs: docs.length, docsPerLanguage, goldEntities },
+  labelsFilter: labelsFilter ?? null,
+  modes: {
+    exact: buildModeReport("exact"),
+    overlap: buildModeReport("overlap"),
+  },
+};
+
+// Default output sits in results/ and is named after the scored tool.
+const defaultOutPath = join(
+  import.meta.dir,
+  "..",
+  "results",
+  `quality.${predictions.tool}.json`,
+);
+const outPath = args.out ?? defaultOutPath;
+mkdirSync(dirname(outPath), { recursive: true });
+writeFileSync(outPath, JSON.stringify(report, null, 2) + "\n");
+
+// One JSON log line per match mode with the micro metrics.
+MATCH_MODES.forEach((mode) => {
+  const { precision, recall, f1 } = report.modes[mode].micro;
+  const summary = {
+    event: "quality",
+    tool: predictions.tool,
+    mode,
+    precision,
+    recall,
+    f1,
+  };
+  console.log(JSON.stringify(summary));
+});
+console.log(JSON.stringify({ event: "written", path: outPath }));
diff --git a/packages/bench/src/run-throughput.ts b/packages/bench/src/run-throughput.ts
new file mode 100644
index 00000000..98ac58f1
--- /dev/null
+++ b/packages/bench/src/run-throughput.ts
@@ -0,0 +1,181 @@
+/**
+ * Throughput benchmark for the deterministic pipeline (NER off).
+ *
+ * Measures one-time costs (dictionary load, search preparation) and
+ * steady-state per-document latency over the contract corpus:
+ * --warmup full passes (default 2), then --iterations measured
+ * passes (default 10); medians are reported.
+ */
+import { mkdirSync, writeFileSync } from "node:fs";
+import { dirname, join } from "node:path";
+import { arch, cpus, platform } from "node:os";
+import { parseArgs } from "node:util";
+
+import {
+  createPipelineContext,
+  preparePipelineSearch,
+  runPipeline,
+  type PipelineConfig,
+} from "@stll/anonymize";
+
+import { BENCH_PIPELINE_CONFIG } from "./adapters/anonymize";
+import { loadBenchDictionaries } from "./dictionaries";
+import { loadGoldDocuments } from "./fixtures";
+
+// Used when --iterations / --warmup are not given on the command line.
+const DEFAULT_ITERATIONS = 10;
+const DEFAULT_WARMUP = 2;
+
+/** Timing summary for one corpus document across the measured passes. */
+type DocumentStats = {
+  id: string;
+  language: string;
+  // Length of the document text (UTF-16 code units, i.e. String.length).
+  chars: number;
+  medianMs: number;
+  minMs: number;
+  maxMs: number;
+  // Derived from chars and the unrounded median.
+  charsPerSecond: number;
+};
+
+/** The JSON document this script writes (default file name: throughput.json). */
+type ThroughputReport = {
+  generatedAt: string;
+  environment: {
+    bun: string;
+    platform: string;
+    arch: string;
+    cpu: string;
+  };
+  settings: { iterations: number; warmup: number };
+  // Costs measured once, before any document is processed.
+  oneTime: { dictionaryLoadMs: number; prepareMs: number };
+  corpus: {
+    docs: number;
+    totalChars: number;
+    medianPassMs: number;
+    charsPerSecond: number;
+  };
+  documents: DocumentStats[];
+};
+
+// CLI flags: --iterations <n>, --warmup <n>, --out <file>.
+const { values: args } = parseArgs({
+  options: {
+    iterations: { type: "string" },
+    warmup: { type: "string" },
+    out: { type: "string" },
+  },
+});
+
+// Flags arrive as strings; non-numeric text becomes NaN under Number()
+// and is rejected by the integer checks below.
+const iterations = Number(args.iterations ?? DEFAULT_ITERATIONS);
+const warmup = Number(args.warmup ?? DEFAULT_WARMUP);
+
+const iterationsOk = Number.isInteger(iterations) && iterations >= 1;
+if (!iterationsOk) {
+  throw new Error(`--iterations must be a positive integer`);
+}
+const warmupOk = Number.isInteger(warmup) && warmup >= 0;
+if (!warmupOk) {
+  throw new Error(`--warmup must be a non-negative integer`);
+}
+
+/** Milliseconds elapsed since `startNs`, a value taken from Bun.nanoseconds(). */
+const elapsedMs = (startNs: number): number => {
+  const elapsedNs = Bun.nanoseconds() - startNs;
+  return elapsedNs / 1_000_000;
+};
+
+/**
+ * Median of `samples` without mutating the input. An even-length list
+ * averages its two middle values; an empty list yields 0.
+ */
+const median = (samples: number[]): number => {
+  const ordered = [...samples].sort((x, y) => x - y);
+  const size = ordered.length;
+  const upperIndex = Math.floor(size / 2);
+  // Odd length: both indices point at the single middle element.
+  const lowerIndex = size % 2 === 0 ? upperIndex - 1 : upperIndex;
+  const lower = ordered.at(lowerIndex) ?? 0;
+  const upper = ordered.at(upperIndex) ?? 0;
+  return (lower + upper) / 2;
+};
+
+/** Rounds a millisecond value to three decimals (microsecond resolution). */
+const roundMs = (ms: number): number => {
+  const microseconds = Math.round(ms * 1_000);
+  return microseconds / 1_000;
+};
+
+// The corpus is read before any timer starts, so fixture I/O is not measured.
+const docs = loadGoldDocuments();
+
+// One-time cost 1: loading every dictionary.
+const dictionaryStart = Bun.nanoseconds();
+const dictionaries = await loadBenchDictionaries();
+const dictionaryLoadMs = elapsedMs(dictionaryStart);
+
+const config: PipelineConfig = { ...BENCH_PIPELINE_CONFIG, dictionaries };
+const context = createPipelineContext();
+// One-time cost 2: preparePipelineSearch on the fresh context, timed on
+// its own. NOTE(review): presumably this builds the search structures
+// that later runPipeline calls reuse through `context` — confirm.
+const prepareStart = Bun.nanoseconds();
+await preparePipelineSearch({ config, context });
+const prepareMs = elapsedMs(prepareStart);
+
+/** Runs the pipeline on one text and discards the result; only the elapsed time is of interest. */
+const runDocument = async (text: string): Promise<void> => {
+  await runPipeline({
+    fullText: text,
+    config,
+    gazetteerEntries: [],
+    context,
+  });
+};
+
+// Warm-up: full corpus passes whose timings are not recorded.
+for (let pass = 0; pass < warmup; pass += 1) {
+  for (const doc of docs) {
+    await runDocument(doc.text);
+  }
+}
+
+// Per-document samples (one per measured pass) and per-pass totals.
+const samplesByDoc = new Map<string, number[]>(docs.map((doc) => [doc.id, []]));
+const passSamples: number[] = [];
+for (let pass = 0; pass < iterations; pass += 1) {
+  // A pass total is the sum of the per-document timings, so work done
+  // between documents (loop overhead, bookkeeping) is not included.
+  let passMs = 0;
+  for (const doc of docs) {
+    const start = Bun.nanoseconds();
+    await runDocument(doc.text);
+    const ms = elapsedMs(start);
+    passMs += ms;
+    samplesByDoc.get(doc.id)?.push(ms);
+  }
+  passSamples.push(passMs);
+}
+
+// Per-document statistics, in corpus order. Milliseconds are rounded for
+// the report; chars/s is derived from the unrounded median.
+const documents: DocumentStats[] = docs.map((doc) => {
+  const samples = samplesByDoc.get(doc.id) ?? [];
+  const medianMs = median(samples);
+  const chars = doc.text.length;
+  return {
+    id: doc.id,
+    language: doc.language,
+    chars,
+    medianMs: roundMs(medianMs),
+    minMs: roundMs(Math.min(...samples)),
+    maxMs: roundMs(Math.max(...samples)),
+    // A zero median would divide to Infinity (or NaN for an empty text),
+    // which JSON.stringify writes as null; report 0 chars/s instead so
+    // the field stays a number for consumers of the report.
+    charsPerSecond: medianMs > 0 ? Math.round(chars / (medianMs / 1_000)) : 0,
+  };
+});
+
+// Corpus-level totals: characters across all documents and the median
+// of the per-pass totals.
+const totalChars = docs.reduce((sum, doc) => sum + doc.text.length, 0);
+const medianPassMs = median(passSamples);
+
+const report: ThroughputReport = {
+  generatedAt: new Date().toISOString(),
+  environment: {
+    bun: Bun.version,
+    platform: platform(),
+    arch: arch(),
+    // Model name of the first reported CPU; "unknown" if none is listed.
+    cpu: cpus().at(0)?.model ?? "unknown",
+  },
+  settings: { iterations, warmup },
+  oneTime: {
+    dictionaryLoadMs: roundMs(dictionaryLoadMs),
+    prepareMs: roundMs(prepareMs),
+  },
+  corpus: {
+    docs: docs.length,
+    totalChars,
+    medianPassMs: roundMs(medianPassMs),
+    // An empty corpus makes every pass total exactly 0, and 0 / 0 is NaN,
+    // which JSON.stringify writes as null; report 0 chars/s instead so
+    // the field stays a number for consumers of the report.
+    charsPerSecond:
+      medianPassMs > 0 ? Math.round(totalChars / (medianPassMs / 1_000)) : 0,
+  },
+  documents,
+};
+
+// Persist the report; the default target is results/throughput.json.
+const defaultOutPath = join(import.meta.dir, "..", "results", "throughput.json");
+const outPath = args.out ?? defaultOutPath;
+mkdirSync(dirname(outPath), { recursive: true });
+writeFileSync(outPath, JSON.stringify(report, null, 2) + "\n");
+
+// Headline numbers as a single JSON log line, then the output location.
+const summary = {
+  event: "throughput",
+  medianPassMs: report.corpus.medianPassMs,
+  charsPerSecond: report.corpus.charsPerSecond,
+  dictionaryLoadMs: report.oneTime.dictionaryLoadMs,
+  prepareMs: report.oneTime.prepareMs,
+};
+console.log(JSON.stringify(summary));
+console.log(JSON.stringify({ event: "written", path: outPath }));
diff --git a/packages/bench/src/scorer.ts b/packages/bench/src/scorer.ts
new file mode 100644
index 00000000..e3ac08fe
--- /dev/null
+++ b/packages/bench/src/scorer.ts
@@ -0,0 +1,169 @@
+import type { BenchSpan } from "./types";
+
+/**
+ * exact: a prediction counts only when label, start, and end all match.
+ * overlap: a prediction counts when the label matches and the spans
+ * share at least one character; for anonymization a partial hit still
+ * redacts part of the value, but exact mode is the honest headline.
+ *
+ * In both modes a prediction can be matched to at most one gold span.
+ */
+export type MatchMode = "exact" | "overlap";
+
+/** Raw match counts for one label (or a sum over several labels). */
+export type LabelCounts = {
+  truePositives: number;
+  falsePositives: number;
+  falseNegatives: number;
+};
+
+/**
+ * Counts plus the values toMetrics derives from them. goldCount is
+ * truePositives + falseNegatives; a ratio with a zero denominator is 0.
+ */
+export type LabelMetrics = LabelCounts & {
+  goldCount: number;
+  precision: number;
+  recall: number;
+  f1: number;
+};
+
+/** Input to scoreDocument: one document's gold and predicted spans plus the match mode. */
+type ScoreDocumentOptions = {
+  gold: BenchSpan[];
+  predicted: BenchSpan[];
+  mode: MatchMode;
+  /** Restrict scoring to these labels; both sides are filtered. */
+  labels?: readonly string[] | undefined;
+};
+
+/**
+ * Length of the intersection of two spans. Zero when they only touch,
+ * negative when there is a gap between them.
+ */
+const overlapLength = (a: BenchSpan, b: BenchSpan): number => {
+  const sharedStart = Math.max(a.start, b.start);
+  const sharedEnd = Math.min(a.end, b.end);
+  return sharedEnd - sharedStart;
+};
+
+/**
+ * Buckets spans by label. Labels appear in first-seen order and each
+ * bucket keeps its spans in input order.
+ */
+const groupByLabel = (spans: BenchSpan[]): Map<string, BenchSpan[]> =>
+  spans.reduce((groups, span) => {
+    const bucket = groups.get(span.label) ?? [];
+    // A bucket that is still empty was just created and is not yet stored.
+    if (bucket.length === 0) groups.set(span.label, bucket);
+    bucket.push(span);
+    return groups;
+  }, new Map<string, BenchSpan[]>());
+
+/**
+ * One-to-one matching within a label: gold spans are visited in
+ * document order; each claims the unmatched prediction with the
+ * largest overlap (exact mode requires identical bounds).
+ *
+ * The matching is greedy: once a prediction is claimed it is never
+ * handed to a later gold span. Returns the number of matched pairs.
+ */
+const countLabelMatches = (
+  gold: BenchSpan[],
+  predicted: BenchSpan[],
+  mode: MatchMode,
+): number => {
+  // Indices into `predicted` that a gold span has already claimed.
+  const claimed = new Set<number>();
+
+  // Exact mode: first unclaimed prediction with identical bounds.
+  const pickExact = (target: BenchSpan): number =>
+    predicted.findIndex(
+      (candidate, index) =>
+        !claimed.has(index) &&
+        candidate.start === target.start &&
+        candidate.end === target.end,
+    );
+
+  // Overlap mode: unclaimed prediction sharing the most characters; the
+  // earliest one wins a tie, and a shared length of zero never matches.
+  const pickOverlap = (target: BenchSpan): number => {
+    let best = -1;
+    let widest = 0;
+    predicted.forEach((candidate, index) => {
+      if (claimed.has(index)) return;
+      const shared =
+        Math.min(target.end, candidate.end) -
+        Math.max(target.start, candidate.start);
+      if (shared > widest) {
+        widest = shared;
+        best = index;
+      }
+    });
+    return best;
+  };
+
+  const inDocumentOrder = [...gold].sort((a, b) => a.start - b.start);
+  for (const target of inDocumentOrder) {
+    const index = mode === "exact" ? pickExact(target) : pickOverlap(target);
+    if (index >= 0) claimed.add(index);
+  }
+  // Every claim is one true positive.
+  return claimed.size;
+};
+
+/**
+ * Per-label true/false positive and false negative counts for one document.
+ *
+ * When `labels` is given, spans with any other label are dropped from
+ * both sides before matching. The result has one entry per label that
+ * occurs in the gold or the predicted spans, gold labels first.
+ */
+export const scoreDocument = ({
+  gold,
+  predicted,
+  mode,
+  labels,
+}: ScoreDocumentOptions): Map<string, LabelCounts> => {
+  const allowed = labels ? new Set(labels) : null;
+  const isScored = ({ label }: BenchSpan): boolean =>
+    allowed?.has(label) ?? true;
+
+  const goldByLabel = groupByLabel(gold.filter(isScored));
+  const predictedByLabel = groupByLabel(predicted.filter(isScored));
+  const everyLabel = new Set([
+    ...goldByLabel.keys(),
+    ...predictedByLabel.keys(),
+  ]);
+
+  const result = new Map<string, LabelCounts>();
+  everyLabel.forEach((label) => {
+    const goldSpans = goldByLabel.get(label) ?? [];
+    const predictedSpans = predictedByLabel.get(label) ?? [];
+    const hits = countLabelMatches(goldSpans, predictedSpans, mode);
+    // Unmatched predictions are false positives, unmatched gold spans
+    // false negatives.
+    result.set(label, {
+      truePositives: hits,
+      falsePositives: predictedSpans.length - hits,
+      falseNegatives: goldSpans.length - hits,
+    });
+  });
+  return result;
+};
+
+/**
+ * Adds the counts in `from` to `into`, label by label, mutating `into`.
+ * A label new to `into` gets a copy, so `from`'s objects are never shared.
+ */
+export const mergeCounts = (
+  into: Map<string, LabelCounts>,
+  from: Map<string, LabelCounts>,
+): void => {
+  from.forEach((counts, label) => {
+    const target = into.get(label);
+    if (target) {
+      target.truePositives += counts.truePositives;
+      target.falsePositives += counts.falsePositives;
+      target.falseNegatives += counts.falseNegatives;
+    } else {
+      into.set(label, { ...counts });
+    }
+  });
+};
+
+/**
+ * Derives goldCount, precision, recall and F1 from raw counts. Any
+ * ratio whose denominator is zero is reported as 0 rather than NaN.
+ */
+export const toMetrics = ({
+  truePositives,
+  falsePositives,
+  falseNegatives,
+}: LabelCounts): LabelMetrics => {
+  const ratio = (hits: number, total: number): number =>
+    total === 0 ? 0 : hits / total;
+
+  const goldCount = truePositives + falseNegatives;
+  const precision = ratio(truePositives, truePositives + falsePositives);
+  const recall = ratio(truePositives, goldCount);
+  // F1 is the harmonic mean of precision and recall.
+  const sum = precision + recall;
+  const f1 = sum === 0 ? 0 : (2 * precision * recall) / sum;
+
+  return {
+    truePositives,
+    falsePositives,
+    falseNegatives,
+    goldCount,
+    precision,
+    recall,
+    f1,
+  };
+};
+
+/** Sums the counts of every label into one total; an empty map gives all zeros. */
+export const microCounts = (counts: Map<string, LabelCounts>): LabelCounts =>
+  [...counts.values()].reduce<LabelCounts>(
+    (total, labelCounts) => ({
+      truePositives: total.truePositives + labelCounts.truePositives,
+      falsePositives: total.falsePositives + labelCounts.falsePositives,
+      falseNegatives: total.falseNegatives + labelCounts.falseNegatives,
+    }),
+    { truePositives: 0, falsePositives: 0, falseNegatives: 0 },
+  );
diff --git a/packages/bench/src/types.ts b/packages/bench/src/types.ts
new file mode 100644
index 00000000..d1ff30af
--- /dev/null
+++ b/packages/bench/src/types.ts
@@ -0,0 +1,29 @@
+/**
+ * A labeled character span; offsets are UTF-16 code units into the document text.
+ * NOTE(review): `end` looks exclusive (the scorer measures overlap as
+ * min(end) - max(start)) — confirm against the snapshot generator.
+ */
+export type BenchSpan = {
+  start: number;
+  end: number;
+  label: string;
+};
+
+/** One corpus document together with its reference annotations. */
+export type GoldDocument = {
+  /** Path relative to the contracts fixture root, e.g. "cs/sanofi-bonus-agreement.txt". */
+  id: string;
+  language: string;
+  text: string;
+  gold: BenchSpan[];
+};
+
+/** A tool's predicted spans for one document; `id` is matched against GoldDocument.id. */
+export type PredictionsDocument = {
+  id: string;
+  entities: BenchSpan[];
+};
+
+/**
+ * Interchange format for tool outputs. External tools (Presidio,
+ * redact-pii, ...) produce this shape so every tool is scored by
+ * the same scorer against the same reference annotations.
+ *
+ * `tool` also names the default quality report file, quality.<tool>.json.
+ */
+export type PredictionsFile = {
+  tool: string;
+  docs: PredictionsDocument[];
+};
diff --git a/packages/bench/tsconfig.json b/packages/bench/tsconfig.json
new file mode 100644
index 00000000..c9bf468b
--- /dev/null
+++ b/packages/bench/tsconfig.json
@@ -0,0 +1,13 @@
+{
+  "$schema": "https://json.schemastore.org/tsconfig",
+  "extends": "@stll/typescript-config/library.json",
+  "compilerOptions": {
+    "lib": ["ESNext"],
+    "noEmit": true,
+    "resolveJsonModule": true,
+    "target": "ES2023",
+    "types": ["node", "bun-types"]
+  },
+  "include": ["src/**/*.ts"],
+  "exclude": ["node_modules"]
+}