From 728c5fd118fc4691af946aad2479e75e16be3607 Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 12 Jun 2026 13:27:39 +0200 Subject: [PATCH 1/2] fix: bundle non-Western name corpora into dist The per-locale name corpus files were loaded with a template-literal dynamic import, which the bundler cannot resolve statically. The import survived into dist as a runtime-relative path that does not exist in the published package, so name detection was silently disabled for consumers of the built output (the regression suite imports from src and never hit the path). Replace the template literal with a map of literal import specifiers keyed by locale so each corpus file becomes a build chunk, and pin one chunk in check-packlist so the regression cannot ship again. --- .github/tools/check-packlist.mjs | 4 ++++ packages/anonymize/src/detectors/names.ts | 29 ++++++++++++++++++----- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/.github/tools/check-packlist.mjs b/.github/tools/check-packlist.mjs index 573f6d18..3e039419 100644 --- a/.github/tools/check-packlist.mjs +++ b/.github/tools/check-packlist.mjs @@ -7,6 +7,9 @@ const PACKAGES = [ expected: [ "dist/index.d.mts", "dist/index.mjs", + // Dynamically imported corpus chunk; missing means the + // bundler stopped resolving the non-Western name imports. 
+ "dist/names-nw-in.mjs", "README.md", "LICENSE", "package.json", @@ -41,6 +44,7 @@ const PACKAGES = [ expected: [ "dist/wasm.d.mts", "dist/wasm.mjs", + "dist/names-nw-in.mjs", "dist/vite.d.mts", "dist/vite.mjs", "README.md", diff --git a/packages/anonymize/src/detectors/names.ts b/packages/anonymize/src/detectors/names.ts index 146912de..5732b711 100644 --- a/packages/anonymize/src/detectors/names.ts +++ b/packages/anonymize/src/detectors/names.ts @@ -49,6 +49,28 @@ const NONWESTERN_LOCALE_KEYS = [ "id", ] as const; +type NonWesternNamesModule = Promise<{ default: { names: string[] } }>; + +// Literal import specifiers so the bundler resolves each corpus +// file into the build output; a template-literal specifier survives +// bundling as a runtime-relative path that does not exist in dist. +const NONWESTERN_NAME_IMPORTS: Record< + (typeof NONWESTERN_LOCALE_KEYS)[number], + () => NonWesternNamesModule +> = { + in: () => import("../data/names-nw-in.json") as NonWesternNamesModule, + ar: () => import("../data/names-nw-ar.json") as NonWesternNamesModule, + "ja-latn": () => + import("../data/names-nw-ja-latn.json") as NonWesternNamesModule, + ko: () => import("../data/names-nw-ko.json") as NonWesternNamesModule, + "zh-latn": () => + import("../data/names-nw-zh-latn.json") as NonWesternNamesModule, + th: () => import("../data/names-nw-th.json") as NonWesternNamesModule, + vi: () => import("../data/names-nw-vi.json") as NonWesternNamesModule, + fil: () => import("../data/names-nw-fil.json") as NonWesternNamesModule, + id: () => import("../data/names-nw-id.json") as NonWesternNamesModule, +}; + const normalizeCorpusLanguage = (language: string): string => language.toLowerCase(); @@ -219,12 +241,7 @@ export const initNameCorpus = ( const nwLocaleKeys = getScopedNonWesternLocaleKeys(languages); const [nwNameMods, nwExcludedMod] = await Promise.all([ Promise.all( - nwLocaleKeys.map( - (locale) => - import(`../data/names-nw-${locale}.json`) as Promise<{ - default: { names: 
string[] }; - }>, - ), + nwLocaleKeys.map((locale) => NONWESTERN_NAME_IMPORTS[locale]()), ), import("../data/names-nw-excluded-allcaps.json") as Promise<{ default: { words: string[] }; From 1cf681420546eb0023a6292404d16897ff96ca7e Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Fri, 12 Jun 2026 13:29:09 +0200 Subject: [PATCH 2/2] feat(bench): add quality and throughput benchmark workspace New private packages/bench workspace measuring the deterministic pipeline (NER off) over the contract fixture corpus: - span-level scorer (per-label precision/recall/F1, exact and overlap matching, one-to-one within label) with unit tests - quality runner scoring the pipeline against the reviewed .snapshot.json reference annotations; accepts external tool predictions via a documented JSON interchange format so other anonymizers can be scored by the same scorer on the same corpus - throughput runner (warmup + measured passes, per-document medians, corpus chars/s, one-time dictionary and prepare costs) - methodology README covering what the reference annotations can and cannot support, plus rendered results The bench imports the built dist like a production consumer, which is how it caught the non-Western corpus bundling regression fixed in the previous commit. 
--- bun.lock | 15 + packages/bench/README.md | 108 ++++++ packages/bench/package.json | 26 ++ packages/bench/results/RESULTS.md | 85 +++++ packages/bench/results/quality.anonymize.json | 334 ++++++++++++++++++ packages/bench/results/throughput.json | 142 ++++++++ packages/bench/src/__test__/scorer.test.ts | 144 ++++++++ packages/bench/src/adapters/anonymize.ts | 57 +++ packages/bench/src/dictionaries.ts | 136 +++++++ packages/bench/src/fixtures.ts | 58 +++ packages/bench/src/render-results.ts | 168 +++++++++ packages/bench/src/run-quality.ts | 139 ++++++++ packages/bench/src/run-throughput.ts | 181 ++++++++++ packages/bench/src/scorer.ts | 169 +++++++++ packages/bench/src/types.ts | 29 ++ packages/bench/tsconfig.json | 13 + 16 files changed, 1804 insertions(+) create mode 100644 packages/bench/README.md create mode 100644 packages/bench/package.json create mode 100644 packages/bench/results/RESULTS.md create mode 100644 packages/bench/results/quality.anonymize.json create mode 100644 packages/bench/results/throughput.json create mode 100644 packages/bench/src/__test__/scorer.test.ts create mode 100644 packages/bench/src/adapters/anonymize.ts create mode 100644 packages/bench/src/dictionaries.ts create mode 100644 packages/bench/src/fixtures.ts create mode 100644 packages/bench/src/render-results.ts create mode 100644 packages/bench/src/run-quality.ts create mode 100644 packages/bench/src/run-throughput.ts create mode 100644 packages/bench/src/scorer.ts create mode 100644 packages/bench/src/types.ts create mode 100644 packages/bench/tsconfig.json diff --git a/bun.lock b/bun.lock index af9f858e..c4d022c5 100644 --- a/bun.lock +++ b/bun.lock @@ -54,6 +54,19 @@ "vite", ], }, + "packages/bench": { + "name": "@stll/anonymize-bench", + "version": "0.0.0", + "dependencies": { + "@stll/anonymize": "workspace:*", + "@stll/anonymize-data": "workspace:*", + }, + "devDependencies": { + "@types/node": "^25.9.2", + "bun-types": "^1.3.14", + "typescript": "^6.0.3", + }, + }, 
"packages/cli": { "name": "@stll/anonymize-cli", "version": "1.4.10", @@ -253,6 +266,8 @@ "@stll/anonymize": ["@stll/anonymize@workspace:packages/anonymize"], + "@stll/anonymize-bench": ["@stll/anonymize-bench@workspace:packages/bench"], + "@stll/anonymize-cli": ["@stll/anonymize-cli@workspace:packages/cli"], "@stll/anonymize-data": ["@stll/anonymize-data@workspace:packages/data"], diff --git a/packages/bench/README.md b/packages/bench/README.md new file mode 100644 index 00000000..9adaf483 --- /dev/null +++ b/packages/bench/README.md @@ -0,0 +1,108 @@ +# @stll/anonymize-bench + +Reproducible quality and throughput benchmarks for `@stll/anonymize`. +Private workspace package; nothing here is published to npm. + +## Running + +```sh +bun install +bun run build # bench imports the built @stll/anonymize dist +cd packages/bench +bun run bench # quality + throughput + render results/RESULTS.md +``` + +Individual steps: `bun run bench:quality`, `bun run bench:throughput` +(`--iterations N --warmup N`), `bun run bench:render`. Results land in +`results/` as JSON plus a rendered `results/RESULTS.md`. + +## Corpus + +The corpus is the contract fixture set in +`packages/anonymize/src/__test__/fixtures/contracts/` (Czech, German, +and English legal contracts; public or synthetic documents, several +sourced from SEC EDGAR filings). The same fixtures gate releases via +the regression suite, so the benchmark always describes the pipeline +that actually ships. + +All measurements run the deterministic layers only (`enableNer: +false`): regex, trigger phrases, legal forms, name corpus, deny +lists, coreference, hotword rules, and zone classification, with the +full published dictionary set from `@stll/anonymize-data` loaded the +way a production consumer loads it. + +## Reference annotations, and what they can tell you + +Quality is scored against the `.snapshot.json` sidecars next to each +fixture. 
These are produced by the pipeline itself and then human +reviewed: every change to them is diffed in PRs, and +`contract-snapshots.test.ts` plus `contract-quality.test.ts` pin +specific true positives and false positives that reviewers have +verified by hand. + +Because the reference derives from reviewed pipeline output, the +pipeline's own score against it is close to perfect **by +construction**. That number is a drift detector, not proof of +accuracy. The honest uses of this harness are: + +- **Cross-tool comparison.** Other tools' outputs (see interchange + format below) are scored against the same reference with the same + scorer; relative differences on identical documents are meaningful + even when the reference has our bias. Comparisons should be read + per label, restricted to labels both tools claim to detect + (`--labels person,organization,...`). +- **Per-label and per-language coverage tracking** across releases. +- **Throughput**, which does not depend on the reference at all. + +Independent third-party corpora are a planned extension; numbers on +this corpus alone should not be quoted as absolute accuracy claims. + +## Scoring + +Span-level, per label, one-to-one matching: + +- **exact**: label, start, and end must all match. +- **overlap**: label must match and spans must share at least one + character; gold spans claim the unmatched prediction with the + largest overlap. For anonymization a partial hit still redacts part + of the value, but exact mode is the honest headline metric. + +Precision, recall, and F1 are reported per label, per language, and +micro-averaged. Offsets are UTF-16 code units; fixture text is +CRLF-normalized to match the regression suite. 
+ +## Comparing another tool + +Run the tool over the same fixture files and write a predictions file: + +```json +{ + "tool": "some-tool", + "docs": [ + { + "id": "en/software-license-agreement.txt", + "entities": [{ "start": 100, "end": 117, "label": "date" }] + } + ] +} +``` + +Labels must be mapped to the canonical `@stll/anonymize` labels +(`person`, `organization`, `address`, `date`, ...) by the adapter +producing the file. Then: + +```sh +bun run bench:quality -- --predictions path/to/predictions.json \ + --labels person,organization,email address,phone number,date +bun run bench:render +``` + +## Throughput methodology + +One-time costs (dictionary load, search automaton preparation) are +measured separately from steady-state latency. The corpus is run +`--warmup` full passes (default 2), then `--iterations` measured +passes (default 10); per-document medians and corpus chars/second are +reported together with the Bun version and CPU model. Numbers in +committed results come from a developer laptop; treat them as +order-of-magnitude, and re-run locally for decisions. diff --git a/packages/bench/package.json b/packages/bench/package.json new file mode 100644 index 00000000..809f8904 --- /dev/null +++ b/packages/bench/package.json @@ -0,0 +1,26 @@ +{ + "name": "@stll/anonymize-bench", + "version": "0.0.0", + "private": true, + "description": "Reproducible quality and throughput benchmarks for @stll/anonymize", + "type": "module", + "license": "MIT", + "scripts": { + "bench": "bun run bench:quality && bun run bench:throughput && bun run bench:render", + "bench:quality": "bun src/run-quality.ts", + "bench:throughput": "bun src/run-throughput.ts", + "bench:render": "bun src/render-results.ts", + "typecheck": "tsc --noEmit -p tsconfig.json", + "test": "bun test", + "format": "oxfmt ." 
+ }, + "dependencies": { + "@stll/anonymize": "workspace:*", + "@stll/anonymize-data": "workspace:*" + }, + "devDependencies": { + "@types/node": "^25.9.2", + "bun-types": "^1.3.14", + "typescript": "^6.0.3" + } +} diff --git a/packages/bench/results/RESULTS.md b/packages/bench/results/RESULTS.md new file mode 100644 index 00000000..96dcc5ec --- /dev/null +++ b/packages/bench/results/RESULTS.md @@ -0,0 +1,85 @@ +# Benchmark results + +Generated by `bun run bench` in `packages/bench`; see README.md for methodology, including how the reference annotations are produced and what they can and cannot tell you. + +## Throughput + +Environment: Bun 1.3.14, Apple M3 (darwin/arm64). 2 warmup + 10 measured passes; medians reported. + +One-time costs: dictionary load 202 ms, search preparation 777 ms. + +Corpus: 13 documents, 146,106 chars; median full pass 185.5 ms (787,480 chars/s). + +| Document | Chars | Median ms | Min | Max | Chars/s | +| -------------------------------------------- | -----: | --------: | ---: | ---: | --------: | +| cs/asset-transfer-court-declensions.txt | 1,517 | 4.1 | 3.5 | 7.0 | 371,745 | +| cs/database-cz-service-contract.txt | 7,924 | 9.9 | 8.6 | 13.1 | 801,766 | +| cs/eagles-rental-agreement.txt | 6,970 | 9.0 | 8.1 | 11.1 | 776,587 | +| cs/nakit-legal-services-framework.txt | 45,767 | 62.2 | 59.6 | 71.9 | 735,535 | +| cs/patrik-nguyen-used-vehicle-sale.txt | 8,391 | 15.5 | 14.6 | 18.5 | 541,236 | +| cs/probo-frame-purchase-contract.txt | 3,204 | 7.1 | 6.1 | 7.7 | 449,708 | +| cs/sanofi-bonus-agreement.txt | 1,740 | 3.6 | 3.4 | 4.7 | 478,430 | +| cs/vinci-donation-agreement.txt | 4,607 | 6.3 | 5.6 | 8.2 | 728,219 | +| de/geschaeftsfuehrer-dienstvertrag.txt | 1,912 | 4.7 | 4.2 | 9.9 | 405,789 | +| en/gt-biopharma-employment-amendment.txt | 4,806 | 5.1 | 4.8 | 8.7 | 942,646 | +| en/healthcare-trust-employment-amendment.txt | 8,627 | 10.6 | 10.0 | 18.7 | 810,201 | +| en/pra-group-employment-agreement.txt | 48,324 | 33.1 | 30.9 | 46.3 | 1,461,483 | +| 
en/software-license-agreement.txt | 2,317 | 5.8 | 5.3 | 14.7 | 402,334 | + +## Quality vs. reference annotations + +The reference annotations derive from reviewed pipeline output, so the anonymize score against them is close to perfect by construction; it is a regression signal, not an accuracy claim. Cross-tool rows on the same corpus are the meaningful comparison. + +### anonymize + +13 documents, 332 reference entities. + +#### exact match + +| Label | Gold | Precision | Recall | F1 | +| ------------------------- | ---: | --------: | -----: | -----: | +| address | 55 | 100.0% | 100.0% | 100.0% | +| bank account number | 4 | 100.0% | 100.0% | 100.0% | +| country | 11 | 100.0% | 100.0% | 100.0% | +| date | 52 | 100.0% | 100.0% | 100.0% | +| date of birth | 2 | 100.0% | 100.0% | 100.0% | +| email address | 4 | 100.0% | 100.0% | 100.0% | +| iban | 1 | 100.0% | 100.0% | 100.0% | +| monetary amount | 54 | 100.0% | 100.0% | 100.0% | +| organization | 56 | 100.0% | 100.0% | 100.0% | +| person | 48 | 100.0% | 100.0% | 100.0% | +| phone number | 3 | 100.0% | 100.0% | 100.0% | +| registration number | 27 | 100.0% | 100.0% | 100.0% | +| tax identification number | 15 | 100.0% | 100.0% | 100.0% | +| **all (micro)** | 332 | 100.0% | 100.0% | 100.0% | + +| Language | Gold | Precision | Recall | F1 | +| -------- | ---: | --------: | -----: | -----: | +| cs | 207 | 100.0% | 100.0% | 100.0% | +| de | 24 | 100.0% | 100.0% | 100.0% | +| en | 101 | 100.0% | 100.0% | 100.0% | + +#### overlap match + +| Label | Gold | Precision | Recall | F1 | +| ------------------------- | ---: | --------: | -----: | -----: | +| address | 55 | 100.0% | 100.0% | 100.0% | +| bank account number | 4 | 100.0% | 100.0% | 100.0% | +| country | 11 | 100.0% | 100.0% | 100.0% | +| date | 52 | 100.0% | 100.0% | 100.0% | +| date of birth | 2 | 100.0% | 100.0% | 100.0% | +| email address | 4 | 100.0% | 100.0% | 100.0% | +| iban | 1 | 100.0% | 100.0% | 100.0% | +| monetary amount | 54 | 100.0% | 100.0% | 100.0% | 
+| organization | 56 | 100.0% | 100.0% | 100.0% | +| person | 48 | 100.0% | 100.0% | 100.0% | +| phone number | 3 | 100.0% | 100.0% | 100.0% | +| registration number | 27 | 100.0% | 100.0% | 100.0% | +| tax identification number | 15 | 100.0% | 100.0% | 100.0% | +| **all (micro)** | 332 | 100.0% | 100.0% | 100.0% | + +| Language | Gold | Precision | Recall | F1 | +| -------- | ---: | --------: | -----: | -----: | +| cs | 207 | 100.0% | 100.0% | 100.0% | +| de | 24 | 100.0% | 100.0% | 100.0% | +| en | 101 | 100.0% | 100.0% | 100.0% | diff --git a/packages/bench/results/quality.anonymize.json b/packages/bench/results/quality.anonymize.json new file mode 100644 index 00000000..0821b6ed --- /dev/null +++ b/packages/bench/results/quality.anonymize.json @@ -0,0 +1,334 @@ +{ + "tool": "anonymize", + "generatedAt": "2026-06-12T11:26:03.458Z", + "corpus": { + "docs": 13, + "docsPerLanguage": { + "cs": 8, + "de": 1, + "en": 4 + }, + "goldEntities": 332 + }, + "labelsFilter": null, + "modes": { + "exact": { + "micro": { + "truePositives": 332, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 332, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "perLabel": { + "address": { + "truePositives": 55, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 55, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "bank account number": { + "truePositives": 4, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 4, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "country": { + "truePositives": 11, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 11, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "date": { + "truePositives": 52, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 52, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "date of birth": { + "truePositives": 2, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 2, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "email address": { + "truePositives": 4, + 
"falsePositives": 0, + "falseNegatives": 0, + "goldCount": 4, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "iban": { + "truePositives": 1, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 1, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "monetary amount": { + "truePositives": 54, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 54, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "organization": { + "truePositives": 56, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 56, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "person": { + "truePositives": 48, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 48, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "phone number": { + "truePositives": 3, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 3, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "registration number": { + "truePositives": 27, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 27, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "tax identification number": { + "truePositives": 15, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 15, + "precision": 1, + "recall": 1, + "f1": 1 + } + }, + "perLanguage": { + "cs": { + "truePositives": 207, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 207, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "de": { + "truePositives": 24, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 24, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "en": { + "truePositives": 101, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 101, + "precision": 1, + "recall": 1, + "f1": 1 + } + } + }, + "overlap": { + "micro": { + "truePositives": 332, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 332, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "perLabel": { + "address": { + "truePositives": 55, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 55, + "precision": 1, + "recall": 1, + 
"f1": 1 + }, + "bank account number": { + "truePositives": 4, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 4, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "country": { + "truePositives": 11, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 11, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "date": { + "truePositives": 52, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 52, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "date of birth": { + "truePositives": 2, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 2, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "email address": { + "truePositives": 4, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 4, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "iban": { + "truePositives": 1, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 1, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "monetary amount": { + "truePositives": 54, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 54, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "organization": { + "truePositives": 56, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 56, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "person": { + "truePositives": 48, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 48, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "phone number": { + "truePositives": 3, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 3, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "registration number": { + "truePositives": 27, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 27, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "tax identification number": { + "truePositives": 15, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 15, + "precision": 1, + "recall": 1, + "f1": 1 + } + }, + "perLanguage": { + "cs": { + "truePositives": 207, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 
207, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "de": { + "truePositives": 24, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 24, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "en": { + "truePositives": 101, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 101, + "precision": 1, + "recall": 1, + "f1": 1 + } + } + } + } +} diff --git a/packages/bench/results/throughput.json b/packages/bench/results/throughput.json new file mode 100644 index 00000000..54dd1af3 --- /dev/null +++ b/packages/bench/results/throughput.json @@ -0,0 +1,142 @@ +{ + "generatedAt": "2026-06-12T11:27:49.606Z", + "environment": { + "bun": "1.3.14", + "platform": "darwin", + "arch": "arm64", + "cpu": "Apple M3" + }, + "settings": { + "iterations": 10, + "warmup": 2 + }, + "oneTime": { + "dictionaryLoadMs": 202.265, + "prepareMs": 777.44 + }, + "corpus": { + "docs": 13, + "totalChars": 146106, + "medianPassMs": 185.536, + "charsPerSecond": 787480 + }, + "documents": [ + { + "id": "cs/asset-transfer-court-declensions.txt", + "language": "cs", + "chars": 1517, + "medianMs": 4.081, + "minMs": 3.478, + "maxMs": 6.955, + "charsPerSecond": 371745 + }, + { + "id": "cs/database-cz-service-contract.txt", + "language": "cs", + "chars": 7924, + "medianMs": 9.883, + "minMs": 8.588, + "maxMs": 13.129, + "charsPerSecond": 801766 + }, + { + "id": "cs/eagles-rental-agreement.txt", + "language": "cs", + "chars": 6970, + "medianMs": 8.975, + "minMs": 8.094, + "maxMs": 11.058, + "charsPerSecond": 776587 + }, + { + "id": "cs/nakit-legal-services-framework.txt", + "language": "cs", + "chars": 45767, + "medianMs": 62.223, + "minMs": 59.606, + "maxMs": 71.866, + "charsPerSecond": 735535 + }, + { + "id": "cs/patrik-nguyen-used-vehicle-sale.txt", + "language": "cs", + "chars": 8391, + "medianMs": 15.503, + "minMs": 14.645, + "maxMs": 18.458, + "charsPerSecond": 541236 + }, + { + "id": "cs/probo-frame-purchase-contract.txt", + "language": "cs", + "chars": 3204, + "medianMs": 7.125, + 
"minMs": 6.098, + "maxMs": 7.661, + "charsPerSecond": 449708 + }, + { + "id": "cs/sanofi-bonus-agreement.txt", + "language": "cs", + "chars": 1740, + "medianMs": 3.637, + "minMs": 3.358, + "maxMs": 4.733, + "charsPerSecond": 478430 + }, + { + "id": "cs/vinci-donation-agreement.txt", + "language": "cs", + "chars": 4607, + "medianMs": 6.326, + "minMs": 5.614, + "maxMs": 8.226, + "charsPerSecond": 728219 + }, + { + "id": "de/geschaeftsfuehrer-dienstvertrag.txt", + "language": "de", + "chars": 1912, + "medianMs": 4.712, + "minMs": 4.231, + "maxMs": 9.883, + "charsPerSecond": 405789 + }, + { + "id": "en/gt-biopharma-employment-amendment.txt", + "language": "en", + "chars": 4806, + "medianMs": 5.098, + "minMs": 4.768, + "maxMs": 8.735, + "charsPerSecond": 942646 + }, + { + "id": "en/healthcare-trust-employment-amendment.txt", + "language": "en", + "chars": 8627, + "medianMs": 10.648, + "minMs": 9.966, + "maxMs": 18.707, + "charsPerSecond": 810201 + }, + { + "id": "en/pra-group-employment-agreement.txt", + "language": "en", + "chars": 48324, + "medianMs": 33.065, + "minMs": 30.881, + "maxMs": 46.263, + "charsPerSecond": 1461483 + }, + { + "id": "en/software-license-agreement.txt", + "language": "en", + "chars": 2317, + "medianMs": 5.759, + "minMs": 5.263, + "maxMs": 14.676, + "charsPerSecond": 402334 + } + ] +} diff --git a/packages/bench/src/__test__/scorer.test.ts b/packages/bench/src/__test__/scorer.test.ts new file mode 100644 index 00000000..e22a0826 --- /dev/null +++ b/packages/bench/src/__test__/scorer.test.ts @@ -0,0 +1,144 @@ +import { describe, expect, test } from "bun:test"; + +import { + type LabelCounts, + mergeCounts, + microCounts, + scoreDocument, + toMetrics, +} from "../scorer"; +import type { BenchSpan } from "../types"; + +const span = (start: number, end: number, label: string): BenchSpan => ({ + start, + end, + label, +}); + +const counts = ( + result: Map, + label: string, +): LabelCounts => { + const labelCounts = result.get(label); + if 
(!labelCounts) throw new Error(`no counts for label ${label}`); + return labelCounts; +}; + +describe("scoreDocument", () => { + test("exact mode requires identical bounds", () => { + const gold = [span(0, 10, "person")]; + const shifted = [span(1, 10, "person")]; + const exact = scoreDocument({ gold, predicted: shifted, mode: "exact" }); + expect(counts(exact, "person")).toEqual({ + truePositives: 0, + falsePositives: 1, + falseNegatives: 1, + }); + const overlap = scoreDocument({ + gold, + predicted: shifted, + mode: "overlap", + }); + expect(counts(overlap, "person")).toEqual({ + truePositives: 1, + falsePositives: 0, + falseNegatives: 0, + }); + }); + + test("label mismatch never matches even with identical bounds", () => { + const result = scoreDocument({ + gold: [span(0, 5, "person")], + predicted: [span(0, 5, "organization")], + mode: "overlap", + }); + expect(counts(result, "person").falseNegatives).toBe(1); + expect(counts(result, "organization").falsePositives).toBe(1); + }); + + test("adjacent spans do not overlap (end is exclusive)", () => { + const result = scoreDocument({ + gold: [span(0, 5, "person")], + predicted: [span(5, 9, "person")], + mode: "overlap", + }); + expect(counts(result, "person").truePositives).toBe(0); + }); + + test("one gold span absorbs at most one of several predictions", () => { + const result = scoreDocument({ + gold: [span(0, 10, "person")], + predicted: [span(0, 4, "person"), span(2, 10, "person")], + mode: "overlap", + }); + expect(counts(result, "person")).toEqual({ + truePositives: 1, + falsePositives: 1, + falseNegatives: 0, + }); + }); + + test("largest overlap wins when several predictions compete", () => { + const gold = [span(0, 10, "person"), span(20, 30, "person")]; + const predicted = [span(8, 25, "person"), span(0, 9, "person")]; + const result = scoreDocument({ gold, predicted, mode: "overlap" }); + // First gold takes the 9-char overlap (0..9); second takes 8..25. 
+ expect(counts(result, "person")).toEqual({ + truePositives: 2, + falsePositives: 0, + falseNegatives: 0, + }); + }); + + test("labels filter drops both gold and predictions", () => { + const result = scoreDocument({ + gold: [span(0, 5, "person"), span(10, 15, "date")], + predicted: [span(10, 15, "date"), span(20, 25, "organization")], + mode: "exact", + labels: ["date"], + }); + expect([...result.keys()]).toEqual(["date"]); + expect(counts(result, "date").truePositives).toBe(1); + }); +}); + +describe("aggregation", () => { + test("mergeCounts accumulates and microCounts sums labels", () => { + const into = scoreDocument({ + gold: [span(0, 5, "person")], + predicted: [span(0, 5, "person")], + mode: "exact", + }); + const from = scoreDocument({ + gold: [span(0, 5, "person"), span(8, 12, "date")], + predicted: [span(1, 5, "person")], + mode: "exact", + }); + mergeCounts(into, from); + expect(counts(into, "person")).toEqual({ + truePositives: 1, + falsePositives: 1, + falseNegatives: 1, + }); + expect(microCounts(into)).toEqual({ + truePositives: 1, + falsePositives: 1, + falseNegatives: 2, + }); + }); + + test("toMetrics handles empty sides without dividing by zero", () => { + expect( + toMetrics({ truePositives: 0, falsePositives: 0, falseNegatives: 0 }), + ).toMatchObject({ precision: 0, recall: 0, f1: 0 }); + const metrics = toMetrics({ + truePositives: 3, + falsePositives: 1, + falseNegatives: 1, + }); + expect(metrics.precision).toBeCloseTo(0.75); + expect(metrics.recall).toBeCloseTo(0.75); + expect(metrics.f1).toBeCloseTo(0.75); + expect(metrics.goldCount).toBe(4); + }); +}); diff --git a/packages/bench/src/adapters/anonymize.ts b/packages/bench/src/adapters/anonymize.ts new file mode 100644 index 00000000..ef38118c --- /dev/null +++ b/packages/bench/src/adapters/anonymize.ts @@ -0,0 +1,57 @@ +import { + createPipelineContext, + DEFAULT_ENTITY_LABELS, + runPipeline, + type PipelineConfig, +} from "@stll/anonymize"; + +import { loadBenchDictionaries } from 
"../dictionaries"; +import type { GoldDocument, PredictionsFile } from "../types"; + +/** + * Deterministic layers only (NER off): identical to the config the + * regression snapshots are generated with, so quality numbers and + * throughput numbers describe the same pipeline. + */ +export const BENCH_PIPELINE_CONFIG: PipelineConfig = { + threshold: 0.3, + enableTriggerPhrases: true, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: true, + enableDenyList: true, + enableGazetteer: false, + enableNer: false, + enableConfidenceBoost: true, + enableCoreference: true, + enableHotwordRules: true, + enableZoneClassification: true, + labels: [...DEFAULT_ENTITY_LABELS], + workspaceId: "bench", +}; + +export const runAnonymizeAdapter = async ( + docs: GoldDocument[], +): Promise => { + const dictionaries = await loadBenchDictionaries(); + const config: PipelineConfig = { ...BENCH_PIPELINE_CONFIG, dictionaries }; + const context = createPipelineContext(); + const predictions: PredictionsFile = { tool: "anonymize", docs: [] }; + for (const doc of docs) { + const entities = await runPipeline({ + fullText: doc.text, + config, + gazetteerEntries: [], + context, + }); + predictions.docs.push({ + id: doc.id, + entities: entities.map(({ start, end, label }) => ({ + start, + end, + label, + })), + }); + } + return predictions; +}; diff --git a/packages/bench/src/dictionaries.ts b/packages/bench/src/dictionaries.ts new file mode 100644 index 00000000..0ae23425 --- /dev/null +++ b/packages/bench/src/dictionaries.ts @@ -0,0 +1,136 @@ +/** + * Loads the full published dictionary set from @stll/anonymize-data + * the way a production consumer would. Mirrors the corpus used by + * the anonymize regression suite (see + * packages/anonymize/src/__test__/load-dictionaries.ts) so bench + * results stay comparable with the committed snapshots; keep the + * language and country lists in sync. 
/** Memoized result of the first successful `loadBenchDictionaries` call. */
let cached: Dictionaries | null = null;

/** Languages for which first-name and surname dictionaries are requested. */
const NAME_LANGUAGES = [
  "cs",
  "sk",
  "de",
  "pl",
  "hu",
  "ro",
  "fr",
  "es",
  "it",
  "en",
  "sv",
] as const;

/** Country codes for which city dictionaries are requested. */
const CITY_COUNTRIES = [
  "AT",
  "AU",
  "BE",
  "BG",
  "BR",
  "CA",
  "CH",
  "CZ",
  "DE",
  "DK",
  "ES",
  "FI",
  "FR",
  "GB",
  "GR",
  "HR",
  "HU",
  "IE",
  "IT",
  "LU",
  "NL",
  "NO",
  "NZ",
  "PL",
  "PT",
  "RO",
  "SE",
  "SI",
  "SK",
  "US",
] as const;

/** Shape of a JSON name dictionary once imported as a module. */
type NameDictionaryModule = {
  default: readonly string[];
};

/**
 * Imports one per-language name dictionary from `@stll/anonymize-data`.
 *
 * @param kind Which dictionary family to load.
 * @param language Language key used in the dictionary file name.
 * @returns The dictionary entries, or `null` when the import fails for any
 *   reason (the error is deliberately swallowed so a missing language does
 *   not abort the whole load).
 */
const loadNameDictionary = async (
  kind: "first" | "surnames",
  language: string,
): Promise<readonly string[] | null> => {
  try {
    const mod: NameDictionaryModule = await import(
      `@stll/anonymize-data/dictionaries/names/${kind}/${language}.json`
    );
    return mod.default;
  } catch {
    return null;
  }
};

/**
 * Loads deny lists, name dictionaries and city dictionaries from
 * `@stll/anonymize-data` and assembles them into one `Dictionaries` object.
 *
 * The result is cached in module state, so later calls return the same
 * object without re-importing anything.
 *
 * NOTE(review): the element types of the intermediate records below were
 * reconstructed from how the values are used here; confirm them against
 * the `Dictionaries` type exported by `@stll/anonymize`.
 */
export const loadBenchDictionaries = async (): Promise<Dictionaries> => {
  if (cached) return cached;

  const dataModule = await import("@stll/anonymize-data");

  const denyList: Record<string, readonly string[]> = {};
  const denyListMeta: Record<string, DictionaryMeta> = {};
  const denyListResults = await Promise.all(
    [...dataModule.ALL_DICTIONARY_IDS].map(async (id) => ({
      id,
      entries: await dataModule.loadDictionary(id),
    })),
  );
  for (const { id, entries } of denyListResults) {
    const meta = dataModule.DICTIONARY_META[id];
    // Dictionaries without metadata are skipped entirely.
    if (!meta) continue;
    denyList[id] = entries;
    // SAFETY: anonymize-data categories match DenyListCategory at runtime
    denyListMeta[id] = meta as DictionaryMeta;
  }

  const firstNames: Record<string, readonly string[]> = {};
  const surnames: Record<string, readonly string[]> = {};
  await Promise.all(
    NAME_LANGUAGES.map(async (language) => {
      const [first, last] = await Promise.all([
        loadNameDictionary("first", language),
        loadNameDictionary("surnames", language),
      ]);
      // A language may have only one of the two dictionaries.
      if (first) firstNames[language] = first;
      if (last) surnames[language] = last;
    }),
  );

  const cityResults = await Promise.all(
    CITY_COUNTRIES.map(async (country) => ({
      country,
      entries: await dataModule.loadCityDictionary(country),
    })),
  );
  const citiesByCountry: Record<string, readonly string[]> = {};
  const mergedCities: string[] = [];
  for (const { country, entries } of cityResults) {
    citiesByCountry[country] = entries;
    // Pushed one by one rather than spread, so a very large dictionary
    // cannot exceed the engine's argument-count limit.
    for (const entry of entries) {
      mergedCities.push(entry);
    }
  }

  cached = {
    firstNames,
    surnames,
    denyList,
    denyListMeta,
    cities: mergedCities,
    citiesByCountry,
  };
  return cached;
};
/** Root of the contract fixtures, one sub-directory per language. */
const CONTRACTS_DIR = join(
  import.meta.dir,
  "..",
  "..",
  "anonymize",
  "src",
  "__test__",
  "fixtures",
  "contracts",
);

/** The part of a `.snapshot.json` sidecar this loader reads. */
type SnapshotFile = {
  entities: BenchSpan[];
};

/**
 * Reads every `<language>/<name>.txt` fixture together with its
 * `<name>.snapshot.json` sidecar and returns them as gold documents.
 *
 * Languages and files are visited in sorted order so document order is
 * stable across runs. CRLF line endings in the text are normalised to LF
 * before it is returned. Regular files sitting directly in the fixtures
 * root (e.g. `.DS_Store`, a README) are skipped instead of being opened as
 * language directories.
 *
 * @returns One gold document per `.txt` fixture; `id` is
 *   `<language>/<file>`.
 * @throws If a `.txt` fixture has no readable, parseable sidecar.
 */
export const loadGoldDocuments = (): GoldDocument[] => {
  const docs: GoldDocument[] = [];
  const languages = readdirSync(CONTRACTS_DIR, { withFileTypes: true })
    // Keep directories (and symlinks to them); drop plain files.
    .filter((entry) => !entry.isFile())
    .map((entry) => entry.name)
    .toSorted();
  for (const language of languages) {
    const languageDir = join(CONTRACTS_DIR, language);
    for (const file of readdirSync(languageDir).toSorted()) {
      if (!file.endsWith(".txt")) continue;
      const text = readFileSync(join(languageDir, file), "utf8").replaceAll(
        "\r\n",
        "\n",
      );
      const snapshotPath = join(
        languageDir,
        file.replace(/\.txt$/u, ".snapshot.json"),
      );
      // SAFETY: sidecars are generated by contract-snapshots.test.ts with this shape
      const snapshot = JSON.parse(
        readFileSync(snapshotPath, "utf8"),
      ) as SnapshotFile;
      docs.push({
        id: `${language}/${file}`,
        language,
        text,
        // Copy only the span fields; any extra sidecar fields are dropped.
        gold: snapshot.entities.map(({ start, end, label }) => ({
          start,
          end,
          label,
        })),
      });
    }
  }
  return docs;
};
/** Counts and derived scores for one label, language, or the whole corpus. */
type LabelMetricsJson = {
  goldCount: number;
  truePositives: number;
  falsePositives: number;
  falseNegatives: number;
  precision: number;
  recall: number;
  f1: number;
};

/** Scores for one match mode, broken down by label and by language. */
type ModeReportJson = {
  micro: LabelMetricsJson;
  perLabel: Record<string, LabelMetricsJson>;
  perLanguage: Record<string, LabelMetricsJson>;
};

/** Shape of a `quality.*.json` report as read back from disk. */
type QualityReportJson = {
  tool: string;
  generatedAt: string;
  corpus: {
    docs: number;
    docsPerLanguage: Record<string, number>;
    goldEntities: number;
  };
  labelsFilter: string[] | null;
  modes: Record<"exact" | "overlap", ModeReportJson>;
};

/** Shape of `throughput.json` as read back from disk. */
type ThroughputReportJson = {
  generatedAt: string;
  environment: { bun: string; platform: string; arch: string; cpu: string };
  settings: { iterations: number; warmup: number };
  oneTime: { dictionaryLoadMs: number; prepareMs: number };
  corpus: {
    docs: number;
    totalChars: number;
    medianPassMs: number;
    charsPerSecond: number;
  };
  documents: {
    id: string;
    language: string;
    chars: number;
    medianMs: number;
    minMs: number;
    maxMs: number;
    charsPerSecond: number;
  }[];
};

/** Formats a 0..1 ratio as a percentage with one decimal, e.g. `12.3%`. */
const percent = (value: number): string => `${(value * 100).toFixed(1)}%`;

/** Formats a number with en-US thousands separators, e.g. `1,234`. */
const integer = (value: number): string => value.toLocaleString("en-US");

/** Renders one Markdown table row: name, gold count, precision, recall, F1. */
const metricsRow = (name: string, metrics: LabelMetricsJson): string =>
  `| ${name} | ${integer(metrics.goldCount)} | ${percent(metrics.precision)} | ${percent(metrics.recall)} | ${percent(metrics.f1)} |`;

/**
 * Renders one tool's quality report as Markdown lines.
 *
 * Emits a heading and corpus summary, then for each match mode a per-label
 * table (in the report's own key order, closed by the micro-average row)
 * and a per-language table (sorted by language key).
 *
 * @param report Parsed quality report.
 * @returns Markdown lines without trailing newlines; the last line is empty.
 */
const renderQuality = (report: QualityReportJson): string[] => {
  const lines: string[] = [];
  lines.push(`### ${report.tool}`);
  lines.push("");
  // Leading space is intentional: the note is appended to the summary sentence.
  const filterNote = report.labelsFilter
    ? ` Scored labels: ${report.labelsFilter.join(", ")}.`
    : "";
  lines.push(
    `${report.corpus.docs} documents, ${integer(report.corpus.goldEntities)} reference entities.${filterNote}`,
  );
  for (const mode of ["exact", "overlap"] as const) {
    const modeReport = report.modes[mode];
    lines.push("");
    lines.push(`#### ${mode} match`);
    lines.push("");
    lines.push("| Label | Gold | Precision | Recall | F1 |");
    lines.push("| --- | ---: | ---: | ---: | ---: |");
    for (const [label, metrics] of Object.entries(modeReport.perLabel)) {
      lines.push(metricsRow(label, metrics));
    }
    lines.push(metricsRow("**all (micro)**", modeReport.micro));
    lines.push("");
    lines.push("| Language | Gold | Precision | Recall | F1 |");
    lines.push("| --- | ---: | ---: | ---: | ---: |");
    for (const [language, metrics] of Object.entries(
      modeReport.perLanguage,
    ).toSorted(([a], [b]) => a.localeCompare(b))) {
      lines.push(metricsRow(language, metrics));
    }
  }
  lines.push("");
  return lines;
};

/**
 * Renders the throughput report as Markdown lines: environment and
 * settings, one-time costs, corpus totals, then one table row per document
 * in the report's order.
 *
 * @param report Parsed throughput report.
 * @returns Markdown lines without trailing newlines; the last line is empty.
 */
const renderThroughput = (report: ThroughputReportJson): string[] => {
  const lines: string[] = [];
  lines.push("## Throughput");
  lines.push("");
  lines.push(
    `Environment: Bun ${report.environment.bun}, ${report.environment.cpu} (${report.environment.platform}/${report.environment.arch}). ` +
      `${report.settings.warmup} warmup + ${report.settings.iterations} measured passes; medians reported.`,
  );
  lines.push("");
  lines.push(
    `One-time costs: dictionary load ${report.oneTime.dictionaryLoadMs.toFixed(0)} ms, search preparation ${report.oneTime.prepareMs.toFixed(0)} ms.`,
  );
  lines.push("");
  lines.push(
    `Corpus: ${report.corpus.docs} documents, ${integer(report.corpus.totalChars)} chars; ` +
      `median full pass ${report.corpus.medianPassMs.toFixed(1)} ms (${integer(report.corpus.charsPerSecond)} chars/s).`,
  );
  lines.push("");
  lines.push("| Document | Chars | Median ms | Min | Max | Chars/s |");
  lines.push("| --- | ---: | ---: | ---: | ---: | ---: |");
  for (const doc of report.documents) {
    lines.push(
      `| ${doc.id} | ${integer(doc.chars)} | ${doc.medianMs.toFixed(1)} | ${doc.minMs.toFixed(1)} | ${doc.maxMs.toFixed(1)} | ${integer(doc.charsPerSecond)} |`,
    );
  }
  lines.push("");
  return lines;
};
/** Directory holding the JSON reports and the rendered RESULTS.md. */
const RESULTS_DIR = join(import.meta.dir, "..", "results");

// Document preamble; sections are appended below only when their input exists.
const output: string[] = [
  "# Benchmark results",
  "",
  "Generated by `bun run bench` in `packages/bench`; see README.md for methodology, including how the reference annotations are produced and what they can and cannot tell you.",
  "",
];

// Throughput section: optional, present only after run-throughput has run.
const throughputFile = join(RESULTS_DIR, "throughput.json");
if (existsSync(throughputFile)) {
  // SAFETY: written by run-throughput.ts with this shape
  const throughputReport = JSON.parse(
    readFileSync(throughputFile, "utf8"),
  ) as ThroughputReportJson;
  output.push(...renderThroughput(throughputReport));
}

// Quality section: one sub-section per quality.*.json, in sorted file order.
const isQualityReport = (name: string): boolean =>
  name.startsWith("quality.") && name.endsWith(".json");
const qualityReportFiles = readdirSync(RESULTS_DIR)
  .filter(isQualityReport)
  .toSorted();
if (qualityReportFiles.length > 0) {
  output.push(
    "## Quality vs. reference annotations",
    "",
    "The reference annotations derive from reviewed pipeline output, so the anonymize score against them is close to perfect by construction; it is a regression signal, not an accuracy claim. Cross-tool rows on the same corpus are the meaningful comparison.",
    "",
  );
  for (const name of qualityReportFiles) {
    // SAFETY: written by run-quality.ts with this shape
    const qualityReport = JSON.parse(
      readFileSync(join(RESULTS_DIR, name), "utf8"),
    ) as QualityReportJson;
    output.push(...renderQuality(qualityReport));
  }
}

// Write the Markdown file with a single trailing newline and log its path.
const outPath = join(RESULTS_DIR, "RESULTS.md");
writeFileSync(outPath, `${output.join("\n")}\n`);
console.log(JSON.stringify({ event: "written", path: outPath }));
/** Match modes reported, in the order they are logged. */
const MATCH_MODES: readonly MatchMode[] = ["exact", "overlap"];

/** Scores for one match mode, broken down by label and by language. */
type ModeReport = {
  micro: LabelMetrics;
  perLabel: Record<string, LabelMetrics>;
  perLanguage: Record<string, LabelMetrics>;
};

/** Shape of the JSON report this script writes. */
type QualityReport = {
  tool: string;
  generatedAt: string;
  corpus: {
    docs: number;
    docsPerLanguage: Record<string, number>;
    goldEntities: number;
  };
  labelsFilter: string[] | null;
  modes: Record<MatchMode, ModeReport>;
};

// CLI: --predictions <file>, --labels <a,b,c>, --out <file>; all optional.
const { values: args } = parseArgs({
  options: {
    predictions: { type: "string" },
    labels: { type: "string" },
    out: { type: "string" },
  },
});

// Comma-separated label list, whitespace-trimmed; undefined scores all labels.
const labelsFilter = args.labels?.split(",").map((label) => label.trim());

const docs = loadGoldDocuments();
const predictions: PredictionsFile = args.predictions
  ? // SAFETY: --predictions files are produced by bench adapters with this shape
    (JSON.parse(readFileSync(args.predictions, "utf8")) as PredictionsFile)
  : await runAnonymizeAdapter(docs);

const predictionsById = new Map(
  predictions.docs.map((doc) => [doc.id, doc.entities]),
);

// Fail fast: a gold document with no predictions entry would otherwise be
// scored as "tool found nothing", silently lowering recall.
const missingDocs = docs.filter((doc) => !predictionsById.has(doc.id));
if (missingDocs.length > 0) {
  const ids = missingDocs.map((doc) => doc.id).join(", ");
  throw new Error(`predictions missing for: ${ids}`);
}

/**
 * Scores every document in one match mode and aggregates the counts per
 * label (over the whole corpus) and per language (micro-averaged over that
 * language's labels).
 */
const buildModeReport = (mode: MatchMode): ModeReport => {
  const totalCounts = new Map<string, LabelCounts>();
  const languageCounts = new Map<string, Map<string, LabelCounts>>();
  for (const doc of docs) {
    const documentCounts = scoreDocument({
      gold: doc.gold,
      predicted: predictionsById.get(doc.id) ?? [],
      mode,
      labels: labelsFilter,
    });
    mergeCounts(totalCounts, documentCounts);
    const perLanguage =
      languageCounts.get(doc.language) ?? new Map<string, LabelCounts>();
    mergeCounts(perLanguage, documentCounts);
    languageCounts.set(doc.language, perLanguage);
  }

  // Labels are emitted in sorted order so the report is stable across runs.
  const perLabel: Record<string, LabelMetrics> = {};
  for (const label of [...totalCounts.keys()].toSorted()) {
    const counts = totalCounts.get(label);
    if (counts) perLabel[label] = toMetrics(counts);
  }
  const perLanguage: Record<string, LabelMetrics> = {};
  for (const [language, counts] of languageCounts) {
    perLanguage[language] = toMetrics(microCounts(counts));
  }
  return { micro: toMetrics(microCounts(totalCounts)), perLabel, perLanguage };
};

const docsPerLanguage: Record<string, number> = {};
for (const doc of docs) {
  docsPerLanguage[doc.language] = (docsPerLanguage[doc.language] ?? 0) + 1;
}

const report: QualityReport = {
  tool: predictions.tool,
  generatedAt: new Date().toISOString(),
  corpus: {
    docs: docs.length,
    docsPerLanguage,
    goldEntities: docs.reduce((sum, doc) => sum + doc.gold.length, 0),
  },
  labelsFilter: labelsFilter ?? null,
  modes: {
    exact: buildModeReport("exact"),
    overlap: buildModeReport("overlap"),
  },
};

// Default output name embeds the tool name: results/quality.<tool>.json.
const outPath =
  args.out ??
  join(import.meta.dir, "..", "results", `quality.${predictions.tool}.json`);
mkdirSync(dirname(outPath), { recursive: true });
writeFileSync(outPath, `${JSON.stringify(report, null, 2)}\n`);

// One JSON log line per match mode, then the output path.
for (const mode of MATCH_MODES) {
  const { micro } = report.modes[mode];
  console.log(
    JSON.stringify({
      event: "quality",
      tool: predictions.tool,
      mode,
      precision: micro.precision,
      recall: micro.recall,
      f1: micro.f1,
    }),
  );
}
console.log(JSON.stringify({ event: "written", path: outPath }));
const DEFAULT_ITERATIONS = 10;
const DEFAULT_WARMUP = 2;

/** Latency statistics for one document across all measured passes. */
type DocumentStats = {
  id: string;
  language: string;
  chars: number;
  medianMs: number;
  minMs: number;
  maxMs: number;
  charsPerSecond: number;
};

/** Shape of the JSON report this script writes. */
type ThroughputReport = {
  generatedAt: string;
  environment: {
    bun: string;
    platform: string;
    arch: string;
    cpu: string;
  };
  settings: { iterations: number; warmup: number };
  oneTime: { dictionaryLoadMs: number; prepareMs: number };
  corpus: {
    docs: number;
    totalChars: number;
    medianPassMs: number;
    charsPerSecond: number;
  };
  documents: DocumentStats[];
};

// CLI: --iterations <n>, --warmup <n>, --out <file>; all optional.
const { values: args } = parseArgs({
  options: {
    iterations: { type: "string" },
    warmup: { type: "string" },
    out: { type: "string" },
  },
});

const iterations = Number(args.iterations ?? DEFAULT_ITERATIONS);
const warmup = Number(args.warmup ?? DEFAULT_WARMUP);
if (!Number.isInteger(iterations) || iterations < 1) {
  throw new Error(`--iterations must be a positive integer`);
}
if (!Number.isInteger(warmup) || warmup < 0) {
  throw new Error(`--warmup must be a non-negative integer`);
}

/** Milliseconds elapsed since a `Bun.nanoseconds()` reading. */
const elapsedMs = (startNs: number): number =>
  (Bun.nanoseconds() - startNs) / 1_000_000;

/**
 * Median of the samples: the middle value for an odd count, the mean of
 * the two middle values for an even count, and 0 for an empty list.
 */
const median = (samples: number[]): number => {
  const sorted = samples.toSorted((a, b) => a - b);
  const middle = Math.floor(sorted.length / 2);
  const lower = sorted.at(middle - (sorted.length % 2 === 0 ? 1 : 0)) ?? 0;
  const upper = sorted.at(middle) ?? 0;
  return (lower + upper) / 2;
};

/** Rounds a millisecond value to three decimals (microsecond precision). */
const roundMs = (ms: number): number => Math.round(ms * 1_000) / 1_000;

/**
 * Whole characters per second for `chars` processed in `ms` milliseconds.
 * Returns 0 when `ms` is not positive: dividing would give NaN or Infinity,
 * which JSON.stringify writes as `null` into the report.
 */
const toCharsPerSecond = (chars: number, ms: number): number =>
  ms > 0 ? Math.round(chars / (ms / 1_000)) : 0;

const docs = loadGoldDocuments();

// One-time cost 1: loading the dictionaries.
const dictionaryStart = Bun.nanoseconds();
const dictionaries = await loadBenchDictionaries();
const dictionaryLoadMs = elapsedMs(dictionaryStart);

// One-time cost 2: preparing the search structures for this config.
const config: PipelineConfig = { ...BENCH_PIPELINE_CONFIG, dictionaries };
const context = createPipelineContext();
const prepareStart = Bun.nanoseconds();
await preparePipelineSearch({ config, context });
const prepareMs = elapsedMs(prepareStart);

/** Runs the pipeline on one text and discards the result. */
const runDocument = async (text: string): Promise<void> => {
  await runPipeline({
    fullText: text,
    config,
    gazetteerEntries: [],
    context,
  });
};

// Warmup passes are executed but not timed.
for (let pass = 0; pass < warmup; pass += 1) {
  for (const doc of docs) {
    await runDocument(doc.text);
  }
}

// Measured passes: per-document samples plus the summed time of each pass.
const samplesByDoc = new Map<string, number[]>(
  docs.map((doc) => [doc.id, []]),
);
const passSamples: number[] = [];
for (let pass = 0; pass < iterations; pass += 1) {
  let passMs = 0;
  for (const doc of docs) {
    const start = Bun.nanoseconds();
    await runDocument(doc.text);
    const ms = elapsedMs(start);
    passMs += ms;
    samplesByDoc.get(doc.id)?.push(ms);
  }
  passSamples.push(passMs);
}

const documents: DocumentStats[] = docs.map((doc) => {
  const samples = samplesByDoc.get(doc.id) ?? [];
  const medianMs = median(samples);
  return {
    id: doc.id,
    language: doc.language,
    chars: doc.text.length,
    medianMs: roundMs(medianMs),
    minMs: roundMs(Math.min(...samples)),
    maxMs: roundMs(Math.max(...samples)),
    // Rate uses the unrounded median so rounding does not skew it.
    charsPerSecond: toCharsPerSecond(doc.text.length, medianMs),
  };
});

const totalChars = docs.reduce((sum, doc) => sum + doc.text.length, 0);
const medianPassMs = median(passSamples);

const report: ThroughputReport = {
  generatedAt: new Date().toISOString(),
  environment: {
    bun: Bun.version,
    platform: platform(),
    arch: arch(),
    cpu: cpus().at(0)?.model ?? "unknown",
  },
  settings: { iterations, warmup },
  oneTime: {
    dictionaryLoadMs: roundMs(dictionaryLoadMs),
    prepareMs: roundMs(prepareMs),
  },
  corpus: {
    docs: docs.length,
    totalChars,
    medianPassMs: roundMs(medianPassMs),
    charsPerSecond: toCharsPerSecond(totalChars, medianPassMs),
  },
  documents,
};

const outPath =
  args.out ?? join(import.meta.dir, "..", "results", "throughput.json");
mkdirSync(dirname(outPath), { recursive: true });
writeFileSync(outPath, `${JSON.stringify(report, null, 2)}\n`);

console.log(
  JSON.stringify({
    event: "throughput",
    medianPassMs: report.corpus.medianPassMs,
    charsPerSecond: report.corpus.charsPerSecond,
    dictionaryLoadMs: report.oneTime.dictionaryLoadMs,
    prepareMs: report.oneTime.prepareMs,
  }),
);
console.log(JSON.stringify({ event: "written", path: outPath }));
/**
 * exact: a prediction counts only when label, start, and end all match.
 * overlap: a prediction counts when the label matches and the spans
 * share at least one character; for anonymization a partial hit still
 * redacts part of the value, but exact mode is the honest headline.
 */
export type MatchMode = "exact" | "overlap";

/** Raw match counts for one label (or a sum over labels). */
export type LabelCounts = {
  truePositives: number;
  falsePositives: number;
  falseNegatives: number;
};

/** Counts plus the scores derived from them by `toMetrics`. */
export type LabelMetrics = LabelCounts & {
  goldCount: number;
  precision: number;
  recall: number;
  f1: number;
};

type ScoreDocumentOptions = {
  gold: BenchSpan[];
  predicted: BenchSpan[];
  mode: MatchMode;
  /** Restrict scoring to these labels; both sides are filtered. */
  labels?: readonly string[] | undefined;
};

/**
 * Number of characters two spans share. Zero when they only touch and
 * negative when they are disjoint, so "overlaps" means a result > 0.
 */
const overlapLength = (a: BenchSpan, b: BenchSpan): number =>
  Math.min(a.end, b.end) - Math.max(a.start, b.start);

/** Buckets spans by label, keeping input order inside each bucket. */
const groupByLabel = (spans: BenchSpan[]): Map<string, BenchSpan[]> => {
  const groups = new Map<string, BenchSpan[]>();
  for (const span of spans) {
    const group = groups.get(span.label);
    if (group) {
      group.push(span);
    } else {
      groups.set(span.label, [span]);
    }
  }
  return groups;
};

/**
 * One-to-one matching within a label: gold spans are visited in
 * document order; each claims the unmatched prediction with the
 * largest overlap (exact mode requires identical bounds).
 *
 * @returns The number of gold spans that claimed a prediction.
 */
const countLabelMatches = (
  gold: BenchSpan[],
  predicted: BenchSpan[],
  mode: MatchMode,
): number => {
  // used[i] marks predictions already claimed, so each counts at most once.
  const used = predicted.map(() => false);
  let truePositives = 0;
  const sortedGold = gold.toSorted((a, b) => a.start - b.start);
  for (const goldSpan of sortedGold) {
    let bestIndex = -1;
    // Starting at 0 means a candidate needs at least one shared character.
    let bestOverlap = 0;
    for (const [index, prediction] of predicted.entries()) {
      if (used[index]) continue;
      if (mode === "exact") {
        if (
          prediction.start === goldSpan.start &&
          prediction.end === goldSpan.end
        ) {
          bestIndex = index;
          break;
        }
        continue;
      }
      const overlap = overlapLength(goldSpan, prediction);
      // Strict ">" keeps the earliest prediction on ties.
      if (overlap > bestOverlap) {
        bestOverlap = overlap;
        bestIndex = index;
      }
    }
    if (bestIndex < 0) continue;
    used[bestIndex] = true;
    truePositives += 1;
  }
  return truePositives;
};

/** Per-label true/false positive and false negative counts for one document. */
export const scoreDocument = ({
  gold,
  predicted,
  mode,
  labels,
}: ScoreDocumentOptions): Map<string, LabelCounts> => {
  const labelFilter = labels ? new Set(labels) : null;
  const keep = (span: BenchSpan) =>
    labelFilter === null || labelFilter.has(span.label);
  const goldGroups = groupByLabel(gold.filter(keep));
  const predictedGroups = groupByLabel(predicted.filter(keep));

  const counts = new Map<string, LabelCounts>();
  // Union of both sides: a label seen on only one side still gets a row.
  const allLabels = new Set([...goldGroups.keys(), ...predictedGroups.keys()]);
  for (const label of allLabels) {
    const goldSpans = goldGroups.get(label) ?? [];
    const predictedSpans = predictedGroups.get(label) ?? [];
    const truePositives = countLabelMatches(goldSpans, predictedSpans, mode);
    counts.set(label, {
      truePositives,
      falsePositives: predictedSpans.length - truePositives,
      falseNegatives: goldSpans.length - truePositives,
    });
  }
  return counts;
};

/**
 * Adds every label's counts from `from` into `into`, mutating `into`.
 * Counts for a label not yet in `into` are copied, so `from` is never
 * aliased or modified.
 */
export const mergeCounts = (
  into: Map<string, LabelCounts>,
  from: Map<string, LabelCounts>,
): void => {
  for (const [label, counts] of from) {
    const existing = into.get(label);
    if (!existing) {
      into.set(label, { ...counts });
      continue;
    }
    existing.truePositives += counts.truePositives;
    existing.falsePositives += counts.falsePositives;
    existing.falseNegatives += counts.falseNegatives;
  }
};

/**
 * Derives gold count, precision, recall and F1 from raw counts.
 * Any ratio whose denominator is zero is reported as 0.
 */
export const toMetrics = ({
  truePositives,
  falsePositives,
  falseNegatives,
}: LabelCounts): LabelMetrics => {
  const predictedCount = truePositives + falsePositives;
  const goldCount = truePositives + falseNegatives;
  const precision = predictedCount === 0 ? 0 : truePositives / predictedCount;
  const recall = goldCount === 0 ? 0 : truePositives / goldCount;
  const f1 =
    precision + recall === 0
      ? 0
      : (2 * precision * recall) / (precision + recall);
  return {
    truePositives,
    falsePositives,
    falseNegatives,
    goldCount,
    precision,
    recall,
    f1,
  };
};

/** Sums the counts of all labels into one total (micro-average input). */
export const microCounts = (counts: Map<string, LabelCounts>): LabelCounts => {
  const total: LabelCounts = {
    truePositives: 0,
    falsePositives: 0,
    falseNegatives: 0,
  };
  for (const labelCounts of counts.values()) {
    total.truePositives += labelCounts.truePositives;
    total.falsePositives += labelCounts.falsePositives;
    total.falseNegatives += labelCounts.falseNegatives;
  }
  return total;
};

/** A labeled character span; offsets are UTF-16 code units into the document text. */
export type BenchSpan = {
  start: number;
  end: number;
  label: string;
};

export type GoldDocument = {
  /** Path relative to the contracts fixture root, e.g. "cs/sanofi-bonus-agreement.txt". */
  id: string;
  language: string;
  text: string;
  gold: BenchSpan[];
};

export type PredictionsDocument = {
  id: string;
  entities: BenchSpan[];
};

/**
 * Interchange format for tool outputs. External tools (Presidio,
 * redact-pii, ...) produce this shape so every tool is scored by
 * the same scorer against the same reference annotations.
 */
export type PredictionsFile = {
  tool: string;
  docs: PredictionsDocument[];
};