diff --git a/.github/tools/check-packlist.mjs b/.github/tools/check-packlist.mjs index 573f6d18..3e039419 100644 --- a/.github/tools/check-packlist.mjs +++ b/.github/tools/check-packlist.mjs @@ -7,6 +7,9 @@ const PACKAGES = [ expected: [ "dist/index.d.mts", "dist/index.mjs", + // Dynamically imported corpus chunk; missing means the + // bundler stopped resolving the non-Western name imports. + "dist/names-nw-in.mjs", "README.md", "LICENSE", "package.json", @@ -41,6 +44,7 @@ const PACKAGES = [ expected: [ "dist/wasm.d.mts", "dist/wasm.mjs", + "dist/names-nw-in.mjs", "dist/vite.d.mts", "dist/vite.mjs", "README.md", diff --git a/bun.lock b/bun.lock index af9f858e..c4d022c5 100644 --- a/bun.lock +++ b/bun.lock @@ -54,6 +54,19 @@ "vite", ], }, + "packages/bench": { + "name": "@stll/anonymize-bench", + "version": "0.0.0", + "dependencies": { + "@stll/anonymize": "workspace:*", + "@stll/anonymize-data": "workspace:*", + }, + "devDependencies": { + "@types/node": "^25.9.2", + "bun-types": "^1.3.14", + "typescript": "^6.0.3", + }, + }, "packages/cli": { "name": "@stll/anonymize-cli", "version": "1.4.10", @@ -253,6 +266,8 @@ "@stll/anonymize": ["@stll/anonymize@workspace:packages/anonymize"], + "@stll/anonymize-bench": ["@stll/anonymize-bench@workspace:packages/bench"], + "@stll/anonymize-cli": ["@stll/anonymize-cli@workspace:packages/cli"], "@stll/anonymize-data": ["@stll/anonymize-data@workspace:packages/data"], diff --git a/packages/anonymize/src/detectors/names.ts b/packages/anonymize/src/detectors/names.ts index 146912de..5732b711 100644 --- a/packages/anonymize/src/detectors/names.ts +++ b/packages/anonymize/src/detectors/names.ts @@ -49,6 +49,28 @@ const NONWESTERN_LOCALE_KEYS = [ "id", ] as const; +type NonWesternNamesModule = Promise<{ default: { names: string[] } }>; + +// Literal import specifiers so the bundler resolves each corpus +// file into the build output; a template-literal specifier survives +// bundling as a runtime-relative path that does not exist in 
dist. +const NONWESTERN_NAME_IMPORTS: Record< + (typeof NONWESTERN_LOCALE_KEYS)[number], + () => NonWesternNamesModule +> = { + in: () => import("../data/names-nw-in.json") as NonWesternNamesModule, + ar: () => import("../data/names-nw-ar.json") as NonWesternNamesModule, + "ja-latn": () => + import("../data/names-nw-ja-latn.json") as NonWesternNamesModule, + ko: () => import("../data/names-nw-ko.json") as NonWesternNamesModule, + "zh-latn": () => + import("../data/names-nw-zh-latn.json") as NonWesternNamesModule, + th: () => import("../data/names-nw-th.json") as NonWesternNamesModule, + vi: () => import("../data/names-nw-vi.json") as NonWesternNamesModule, + fil: () => import("../data/names-nw-fil.json") as NonWesternNamesModule, + id: () => import("../data/names-nw-id.json") as NonWesternNamesModule, +}; + const normalizeCorpusLanguage = (language: string): string => language.toLowerCase(); @@ -219,12 +241,7 @@ export const initNameCorpus = ( const nwLocaleKeys = getScopedNonWesternLocaleKeys(languages); const [nwNameMods, nwExcludedMod] = await Promise.all([ Promise.all( - nwLocaleKeys.map( - (locale) => - import(`../data/names-nw-${locale}.json`) as Promise<{ - default: { names: string[] }; - }>, - ), + nwLocaleKeys.map((locale) => NONWESTERN_NAME_IMPORTS[locale]()), ), import("../data/names-nw-excluded-allcaps.json") as Promise<{ default: { words: string[] }; diff --git a/packages/bench/README.md b/packages/bench/README.md new file mode 100644 index 00000000..9adaf483 --- /dev/null +++ b/packages/bench/README.md @@ -0,0 +1,108 @@ +# @stll/anonymize-bench + +Reproducible quality and throughput benchmarks for `@stll/anonymize`. +Private workspace package; nothing here is published to npm. 
+ +## Running + +```sh +bun install +bun run build # bench imports the built @stll/anonymize dist +cd packages/bench +bun run bench # quality + throughput + render results/RESULTS.md +``` + +Individual steps: `bun run bench:quality`, `bun run bench:throughput` +(`--iterations N --warmup N`), `bun run bench:render`. Results land in +`results/` as JSON plus a rendered `results/RESULTS.md`. + +## Corpus + +The corpus is the contract fixture set in +`packages/anonymize/src/__test__/fixtures/contracts/` (Czech, German, +and English legal contracts; public or synthetic documents, several +sourced from SEC EDGAR filings). The same fixtures gate releases via +the regression suite, so the benchmark always describes the pipeline +that actually ships. + +All measurements run the deterministic layers only (`enableNer: +false`): regex, trigger phrases, legal forms, name corpus, deny +lists, coreference, hotword rules, and zone classification, with the +full published dictionary set from `@stll/anonymize-data` loaded the +way a production consumer loads it. + +## Reference annotations, and what they can tell you + +Quality is scored against the `.snapshot.json` sidecars next to each +fixture. These are produced by the pipeline itself and then human +reviewed: every change to them is diffed in PRs, and +`contract-snapshots.test.ts` plus `contract-quality.test.ts` pin +specific true positives and false positives that reviewers have +verified by hand. + +Because the reference derives from reviewed pipeline output, the +pipeline's own score against it is close to perfect **by +construction**. That number is a drift detector, not proof of +accuracy. The honest uses of this harness are: + +- **Cross-tool comparison.** Other tools' outputs (see interchange + format below) are scored against the same reference with the same + scorer; relative differences on identical documents are meaningful + even when the reference has our bias. 
Comparisons should be read + per label, restricted to labels both tools claim to detect + (`--labels person,organization,...`). +- **Per-label and per-language coverage tracking** across releases. +- **Throughput**, which does not depend on the reference at all. + +Independent third-party corpora are a planned extension; numbers on +this corpus alone should not be quoted as absolute accuracy claims. + +## Scoring + +Span-level, per label, one-to-one matching: + +- **exact**: label, start, and end must all match. +- **overlap**: label must match and spans must share at least one + character; gold spans claim the unmatched prediction with the + largest overlap. For anonymization a partial hit still redacts part + of the value, but exact mode is the honest headline metric. + +Precision, recall, and F1 are reported per label, per language, and +micro-averaged. Offsets are UTF-16 code units; fixture text has +CRLF line endings normalized to LF, matching the regression suite. + +## Comparing another tool + +Run the tool over the same fixture files and write a predictions file: + +```json +{ + "tool": "some-tool", + "docs": [ + { + "id": "en/software-license-agreement.txt", + "entities": [{ "start": 100, "end": 117, "label": "date" }] + } + ] +} +``` + +Labels must be mapped to the canonical `@stll/anonymize` labels +(`person`, `organization`, `address`, `date`, ...) by the adapter +producing the file. Then: + +```sh +bun run bench:quality -- --predictions path/to/predictions.json \ + --labels "person,organization,email address,phone number,date" +bun run bench:render +``` + +## Throughput methodology + +One-time costs (dictionary load, search automaton preparation) are +measured separately from steady-state latency. The corpus is run +`--warmup` full passes (default 2), then `--iterations` measured +passes (default 10); per-document medians and corpus chars/second are +reported together with the Bun version and CPU model.
Numbers in +committed results come from a developer laptop; treat them as +order-of-magnitude, and re-run locally for decisions. diff --git a/packages/bench/package.json b/packages/bench/package.json new file mode 100644 index 00000000..809f8904 --- /dev/null +++ b/packages/bench/package.json @@ -0,0 +1,26 @@ +{ + "name": "@stll/anonymize-bench", + "version": "0.0.0", + "private": true, + "description": "Reproducible quality and throughput benchmarks for @stll/anonymize", + "type": "module", + "license": "MIT", + "scripts": { + "bench": "bun run bench:quality && bun run bench:throughput && bun run bench:render", + "bench:quality": "bun src/run-quality.ts", + "bench:throughput": "bun src/run-throughput.ts", + "bench:render": "bun src/render-results.ts", + "typecheck": "tsc --noEmit -p tsconfig.json", + "test": "bun test", + "format": "oxfmt ." + }, + "dependencies": { + "@stll/anonymize": "workspace:*", + "@stll/anonymize-data": "workspace:*" + }, + "devDependencies": { + "@types/node": "^25.9.2", + "bun-types": "^1.3.14", + "typescript": "^6.0.3" + } +} diff --git a/packages/bench/results/RESULTS.md b/packages/bench/results/RESULTS.md new file mode 100644 index 00000000..96dcc5ec --- /dev/null +++ b/packages/bench/results/RESULTS.md @@ -0,0 +1,85 @@ +# Benchmark results + +Generated by `bun run bench` in `packages/bench`; see README.md for methodology, including how the reference annotations are produced and what they can and cannot tell you. + +## Throughput + +Environment: Bun 1.3.14, Apple M3 (darwin/arm64). 2 warmup + 10 measured passes; medians reported. + +One-time costs: dictionary load 202 ms, search preparation 777 ms. + +Corpus: 13 documents, 146,106 chars; median full pass 185.5 ms (787,480 chars/s). 
+ +| Document | Chars | Median ms | Min | Max | Chars/s | +| -------------------------------------------- | -----: | --------: | ---: | ---: | --------: | +| cs/asset-transfer-court-declensions.txt | 1,517 | 4.1 | 3.5 | 7.0 | 371,745 | +| cs/database-cz-service-contract.txt | 7,924 | 9.9 | 8.6 | 13.1 | 801,766 | +| cs/eagles-rental-agreement.txt | 6,970 | 9.0 | 8.1 | 11.1 | 776,587 | +| cs/nakit-legal-services-framework.txt | 45,767 | 62.2 | 59.6 | 71.9 | 735,535 | +| cs/patrik-nguyen-used-vehicle-sale.txt | 8,391 | 15.5 | 14.6 | 18.5 | 541,236 | +| cs/probo-frame-purchase-contract.txt | 3,204 | 7.1 | 6.1 | 7.7 | 449,708 | +| cs/sanofi-bonus-agreement.txt | 1,740 | 3.6 | 3.4 | 4.7 | 478,430 | +| cs/vinci-donation-agreement.txt | 4,607 | 6.3 | 5.6 | 8.2 | 728,219 | +| de/geschaeftsfuehrer-dienstvertrag.txt | 1,912 | 4.7 | 4.2 | 9.9 | 405,789 | +| en/gt-biopharma-employment-amendment.txt | 4,806 | 5.1 | 4.8 | 8.7 | 942,646 | +| en/healthcare-trust-employment-amendment.txt | 8,627 | 10.6 | 10.0 | 18.7 | 810,201 | +| en/pra-group-employment-agreement.txt | 48,324 | 33.1 | 30.9 | 46.3 | 1,461,483 | +| en/software-license-agreement.txt | 2,317 | 5.8 | 5.3 | 14.7 | 402,334 | + +## Quality vs. reference annotations + +The reference annotations derive from reviewed pipeline output, so the anonymize score against them is close to perfect by construction; it is a regression signal, not an accuracy claim. Cross-tool rows on the same corpus are the meaningful comparison. + +### anonymize + +13 documents, 332 reference entities. 
+ +#### exact match + +| Label | Gold | Precision | Recall | F1 | +| ------------------------- | ---: | --------: | -----: | -----: | +| address | 55 | 100.0% | 100.0% | 100.0% | +| bank account number | 4 | 100.0% | 100.0% | 100.0% | +| country | 11 | 100.0% | 100.0% | 100.0% | +| date | 52 | 100.0% | 100.0% | 100.0% | +| date of birth | 2 | 100.0% | 100.0% | 100.0% | +| email address | 4 | 100.0% | 100.0% | 100.0% | +| iban | 1 | 100.0% | 100.0% | 100.0% | +| monetary amount | 54 | 100.0% | 100.0% | 100.0% | +| organization | 56 | 100.0% | 100.0% | 100.0% | +| person | 48 | 100.0% | 100.0% | 100.0% | +| phone number | 3 | 100.0% | 100.0% | 100.0% | +| registration number | 27 | 100.0% | 100.0% | 100.0% | +| tax identification number | 15 | 100.0% | 100.0% | 100.0% | +| **all (micro)** | 332 | 100.0% | 100.0% | 100.0% | + +| Language | Gold | Precision | Recall | F1 | +| -------- | ---: | --------: | -----: | -----: | +| cs | 207 | 100.0% | 100.0% | 100.0% | +| de | 24 | 100.0% | 100.0% | 100.0% | +| en | 101 | 100.0% | 100.0% | 100.0% | + +#### overlap match + +| Label | Gold | Precision | Recall | F1 | +| ------------------------- | ---: | --------: | -----: | -----: | +| address | 55 | 100.0% | 100.0% | 100.0% | +| bank account number | 4 | 100.0% | 100.0% | 100.0% | +| country | 11 | 100.0% | 100.0% | 100.0% | +| date | 52 | 100.0% | 100.0% | 100.0% | +| date of birth | 2 | 100.0% | 100.0% | 100.0% | +| email address | 4 | 100.0% | 100.0% | 100.0% | +| iban | 1 | 100.0% | 100.0% | 100.0% | +| monetary amount | 54 | 100.0% | 100.0% | 100.0% | +| organization | 56 | 100.0% | 100.0% | 100.0% | +| person | 48 | 100.0% | 100.0% | 100.0% | +| phone number | 3 | 100.0% | 100.0% | 100.0% | +| registration number | 27 | 100.0% | 100.0% | 100.0% | +| tax identification number | 15 | 100.0% | 100.0% | 100.0% | +| **all (micro)** | 332 | 100.0% | 100.0% | 100.0% | + +| Language | Gold | Precision | Recall | F1 | +| -------- | ---: | --------: | -----: | -----: | +| cs | 
207 | 100.0% | 100.0% | 100.0% | +| de | 24 | 100.0% | 100.0% | 100.0% | +| en | 101 | 100.0% | 100.0% | 100.0% | diff --git a/packages/bench/results/quality.anonymize.json b/packages/bench/results/quality.anonymize.json new file mode 100644 index 00000000..0821b6ed --- /dev/null +++ b/packages/bench/results/quality.anonymize.json @@ -0,0 +1,334 @@ +{ + "tool": "anonymize", + "generatedAt": "2026-06-12T11:26:03.458Z", + "corpus": { + "docs": 13, + "docsPerLanguage": { + "cs": 8, + "de": 1, + "en": 4 + }, + "goldEntities": 332 + }, + "labelsFilter": null, + "modes": { + "exact": { + "micro": { + "truePositives": 332, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 332, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "perLabel": { + "address": { + "truePositives": 55, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 55, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "bank account number": { + "truePositives": 4, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 4, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "country": { + "truePositives": 11, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 11, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "date": { + "truePositives": 52, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 52, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "date of birth": { + "truePositives": 2, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 2, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "email address": { + "truePositives": 4, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 4, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "iban": { + "truePositives": 1, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 1, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "monetary amount": { + "truePositives": 54, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 54, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "organization": { 
+ "truePositives": 56, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 56, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "person": { + "truePositives": 48, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 48, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "phone number": { + "truePositives": 3, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 3, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "registration number": { + "truePositives": 27, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 27, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "tax identification number": { + "truePositives": 15, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 15, + "precision": 1, + "recall": 1, + "f1": 1 + } + }, + "perLanguage": { + "cs": { + "truePositives": 207, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 207, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "de": { + "truePositives": 24, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 24, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "en": { + "truePositives": 101, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 101, + "precision": 1, + "recall": 1, + "f1": 1 + } + } + }, + "overlap": { + "micro": { + "truePositives": 332, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 332, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "perLabel": { + "address": { + "truePositives": 55, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 55, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "bank account number": { + "truePositives": 4, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 4, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "country": { + "truePositives": 11, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 11, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "date": { + "truePositives": 52, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 52, + 
"precision": 1, + "recall": 1, + "f1": 1 + }, + "date of birth": { + "truePositives": 2, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 2, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "email address": { + "truePositives": 4, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 4, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "iban": { + "truePositives": 1, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 1, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "monetary amount": { + "truePositives": 54, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 54, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "organization": { + "truePositives": 56, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 56, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "person": { + "truePositives": 48, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 48, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "phone number": { + "truePositives": 3, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 3, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "registration number": { + "truePositives": 27, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 27, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "tax identification number": { + "truePositives": 15, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 15, + "precision": 1, + "recall": 1, + "f1": 1 + } + }, + "perLanguage": { + "cs": { + "truePositives": 207, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 207, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "de": { + "truePositives": 24, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 24, + "precision": 1, + "recall": 1, + "f1": 1 + }, + "en": { + "truePositives": 101, + "falsePositives": 0, + "falseNegatives": 0, + "goldCount": 101, + "precision": 1, + "recall": 1, + "f1": 1 + } + } + } + } +} diff --git a/packages/bench/results/throughput.json 
b/packages/bench/results/throughput.json new file mode 100644 index 00000000..54dd1af3 --- /dev/null +++ b/packages/bench/results/throughput.json @@ -0,0 +1,142 @@ +{ + "generatedAt": "2026-06-12T11:27:49.606Z", + "environment": { + "bun": "1.3.14", + "platform": "darwin", + "arch": "arm64", + "cpu": "Apple M3" + }, + "settings": { + "iterations": 10, + "warmup": 2 + }, + "oneTime": { + "dictionaryLoadMs": 202.265, + "prepareMs": 777.44 + }, + "corpus": { + "docs": 13, + "totalChars": 146106, + "medianPassMs": 185.536, + "charsPerSecond": 787480 + }, + "documents": [ + { + "id": "cs/asset-transfer-court-declensions.txt", + "language": "cs", + "chars": 1517, + "medianMs": 4.081, + "minMs": 3.478, + "maxMs": 6.955, + "charsPerSecond": 371745 + }, + { + "id": "cs/database-cz-service-contract.txt", + "language": "cs", + "chars": 7924, + "medianMs": 9.883, + "minMs": 8.588, + "maxMs": 13.129, + "charsPerSecond": 801766 + }, + { + "id": "cs/eagles-rental-agreement.txt", + "language": "cs", + "chars": 6970, + "medianMs": 8.975, + "minMs": 8.094, + "maxMs": 11.058, + "charsPerSecond": 776587 + }, + { + "id": "cs/nakit-legal-services-framework.txt", + "language": "cs", + "chars": 45767, + "medianMs": 62.223, + "minMs": 59.606, + "maxMs": 71.866, + "charsPerSecond": 735535 + }, + { + "id": "cs/patrik-nguyen-used-vehicle-sale.txt", + "language": "cs", + "chars": 8391, + "medianMs": 15.503, + "minMs": 14.645, + "maxMs": 18.458, + "charsPerSecond": 541236 + }, + { + "id": "cs/probo-frame-purchase-contract.txt", + "language": "cs", + "chars": 3204, + "medianMs": 7.125, + "minMs": 6.098, + "maxMs": 7.661, + "charsPerSecond": 449708 + }, + { + "id": "cs/sanofi-bonus-agreement.txt", + "language": "cs", + "chars": 1740, + "medianMs": 3.637, + "minMs": 3.358, + "maxMs": 4.733, + "charsPerSecond": 478430 + }, + { + "id": "cs/vinci-donation-agreement.txt", + "language": "cs", + "chars": 4607, + "medianMs": 6.326, + "minMs": 5.614, + "maxMs": 8.226, + "charsPerSecond": 728219 + }, + { + 
"id": "de/geschaeftsfuehrer-dienstvertrag.txt", + "language": "de", + "chars": 1912, + "medianMs": 4.712, + "minMs": 4.231, + "maxMs": 9.883, + "charsPerSecond": 405789 + }, + { + "id": "en/gt-biopharma-employment-amendment.txt", + "language": "en", + "chars": 4806, + "medianMs": 5.098, + "minMs": 4.768, + "maxMs": 8.735, + "charsPerSecond": 942646 + }, + { + "id": "en/healthcare-trust-employment-amendment.txt", + "language": "en", + "chars": 8627, + "medianMs": 10.648, + "minMs": 9.966, + "maxMs": 18.707, + "charsPerSecond": 810201 + }, + { + "id": "en/pra-group-employment-agreement.txt", + "language": "en", + "chars": 48324, + "medianMs": 33.065, + "minMs": 30.881, + "maxMs": 46.263, + "charsPerSecond": 1461483 + }, + { + "id": "en/software-license-agreement.txt", + "language": "en", + "chars": 2317, + "medianMs": 5.759, + "minMs": 5.263, + "maxMs": 14.676, + "charsPerSecond": 402334 + } + ] +} diff --git a/packages/bench/src/__test__/scorer.test.ts b/packages/bench/src/__test__/scorer.test.ts new file mode 100644 index 00000000..e22a0826 --- /dev/null +++ b/packages/bench/src/__test__/scorer.test.ts @@ -0,0 +1,144 @@ +import { describe, expect, test } from "bun:test"; + +import { + type LabelCounts, + mergeCounts, + microCounts, + scoreDocument, + toMetrics, +} from "../scorer"; +import type { BenchSpan } from "../types"; + +const span = (start: number, end: number, label: string): BenchSpan => ({ + start, + end, + label, +}); + +const counts = ( + result: Map, + label: string, +): LabelCounts => { + const labelCounts = result.get(label); + if (!labelCounts) throw new Error(`no counts for label ${label}`); + return labelCounts; +}; + +describe("scoreDocument", () => { + test("exact mode requires identical bounds", () => { + const gold = [span(0, 10, "person")]; + const shifted = [span(1, 10, "person")]; + const exact = scoreDocument({ gold, predicted: shifted, mode: "exact" }); + expect(counts(exact, "person")).toEqual({ + truePositives: 0, + falsePositives: 1, 
+ falseNegatives: 1, + }); + const overlap = scoreDocument({ + gold, + predicted: shifted, + mode: "overlap", + }); + expect(counts(overlap, "person")).toEqual({ + truePositives: 1, + falsePositives: 0, + falseNegatives: 0, + }); + }); + + test("label mismatch never matches even with identical bounds", () => { + const result = scoreDocument({ + gold: [span(0, 5, "person")], + predicted: [span(0, 5, "organization")], + mode: "overlap", + }); + expect(counts(result, "person").falseNegatives).toBe(1); + expect(counts(result, "organization").falsePositives).toBe(1); + }); + + test("adjacent spans do not overlap (end is exclusive)", () => { + const result = scoreDocument({ + gold: [span(0, 5, "person")], + predicted: [span(5, 9, "person")], + mode: "overlap", + }); + expect(counts(result, "person").truePositives).toBe(0); + }); + + test("one gold span absorbs at most one of several predictions", () => { + const result = scoreDocument({ + gold: [span(0, 10, "person")], + predicted: [span(0, 4, "person"), span(2, 10, "person")], + mode: "overlap", + }); + expect(counts(result, "person")).toEqual({ + truePositives: 1, + falsePositives: 1, + falseNegatives: 0, + }); + }); + + test("largest overlap wins when several predictions compete", () => { + const gold = [span(0, 10, "person"), span(20, 30, "person")]; + const predicted = [span(8, 25, "person"), span(0, 9, "person")]; + const result = scoreDocument({ gold, predicted, mode: "overlap" }); + // First gold takes the 9-char overlap (0..9); second takes 8..25. 
+ expect(counts(result, "person")).toEqual({ + truePositives: 2, + falsePositives: 0, + falseNegatives: 0, + }); + }); + + test("labels filter drops both gold and predictions", () => { + const result = scoreDocument({ + gold: [span(0, 5, "person"), span(10, 15, "date")], + predicted: [span(10, 15, "date"), span(20, 25, "organization")], + mode: "exact", + labels: ["date"], + }); + expect([...result.keys()]).toEqual(["date"]); + expect(counts(result, "date").truePositives).toBe(1); + }); +}); + +describe("aggregation", () => { + test("mergeCounts accumulates and microCounts sums labels", () => { + const into = scoreDocument({ + gold: [span(0, 5, "person")], + predicted: [span(0, 5, "person")], + mode: "exact", + }); + const from = scoreDocument({ + gold: [span(0, 5, "person"), span(8, 12, "date")], + predicted: [span(1, 5, "person")], + mode: "exact", + }); + mergeCounts(into, from); + expect(counts(into, "person")).toEqual({ + truePositives: 1, + falsePositives: 1, + falseNegatives: 1, + }); + expect(microCounts(into)).toEqual({ + truePositives: 1, + falsePositives: 1, + falseNegatives: 2, + }); + }); + + test("toMetrics handles empty sides without dividing by zero", () => { + expect( + toMetrics({ truePositives: 0, falsePositives: 0, falseNegatives: 0 }), + ).toMatchObject({ precision: 0, recall: 0, f1: 0 }); + const metrics = toMetrics({ + truePositives: 3, + falsePositives: 1, + falseNegatives: 1, + }); + expect(metrics.precision).toBeCloseTo(0.75); + expect(metrics.recall).toBeCloseTo(0.75); + expect(metrics.f1).toBeCloseTo(0.75); + expect(metrics.goldCount).toBe(4); + }); +}); diff --git a/packages/bench/src/adapters/anonymize.ts b/packages/bench/src/adapters/anonymize.ts new file mode 100644 index 00000000..ef38118c --- /dev/null +++ b/packages/bench/src/adapters/anonymize.ts @@ -0,0 +1,57 @@ +import { + createPipelineContext, + DEFAULT_ENTITY_LABELS, + runPipeline, + type PipelineConfig, +} from "@stll/anonymize"; + +import { loadBenchDictionaries } from 
"../dictionaries"; +import type { GoldDocument, PredictionsFile } from "../types"; + +/** + * Deterministic layers only (NER off): identical to the config the + * regression snapshots are generated with, so quality numbers and + * throughput numbers describe the same pipeline. + */ +export const BENCH_PIPELINE_CONFIG: PipelineConfig = { + threshold: 0.3, + enableTriggerPhrases: true, + enableRegex: true, + enableLegalForms: true, + enableNameCorpus: true, + enableDenyList: true, + enableGazetteer: false, + enableNer: false, + enableConfidenceBoost: true, + enableCoreference: true, + enableHotwordRules: true, + enableZoneClassification: true, + labels: [...DEFAULT_ENTITY_LABELS], + workspaceId: "bench", +}; + +export const runAnonymizeAdapter = async ( + docs: GoldDocument[], +): Promise => { + const dictionaries = await loadBenchDictionaries(); + const config: PipelineConfig = { ...BENCH_PIPELINE_CONFIG, dictionaries }; + const context = createPipelineContext(); + const predictions: PredictionsFile = { tool: "anonymize", docs: [] }; + for (const doc of docs) { + const entities = await runPipeline({ + fullText: doc.text, + config, + gazetteerEntries: [], + context, + }); + predictions.docs.push({ + id: doc.id, + entities: entities.map(({ start, end, label }) => ({ + start, + end, + label, + })), + }); + } + return predictions; +}; diff --git a/packages/bench/src/dictionaries.ts b/packages/bench/src/dictionaries.ts new file mode 100644 index 00000000..0ae23425 --- /dev/null +++ b/packages/bench/src/dictionaries.ts @@ -0,0 +1,136 @@ +/** + * Loads the full published dictionary set from @stll/anonymize-data + * the way a production consumer would. Mirrors the corpus used by + * the anonymize regression suite (see + * packages/anonymize/src/__test__/load-dictionaries.ts) so bench + * results stay comparable with the committed snapshots; keep the + * language and country lists in sync. 
+ */ +import type { Dictionaries, DictionaryMeta } from "@stll/anonymize"; + +let cached: Dictionaries | null = null; + +const NAME_LANGUAGES = [ + "cs", + "sk", + "de", + "pl", + "hu", + "ro", + "fr", + "es", + "it", + "en", + "sv", +] as const; + +const CITY_COUNTRIES = [ + "AT", + "AU", + "BE", + "BG", + "BR", + "CA", + "CH", + "CZ", + "DE", + "DK", + "ES", + "FI", + "FR", + "GB", + "GR", + "HR", + "HU", + "IE", + "IT", + "LU", + "NL", + "NO", + "NZ", + "PL", + "PT", + "RO", + "SE", + "SI", + "SK", + "US", +] as const; + +type NameDictionaryModule = { + default: readonly string[]; +}; + +const loadNameDictionary = async ( + kind: "first" | "surnames", + language: string, +): Promise => { + try { + const mod: NameDictionaryModule = await import( + `@stll/anonymize-data/dictionaries/names/${kind}/${language}.json` + ); + return mod.default; + } catch { + return null; + } +}; + +export const loadBenchDictionaries = async (): Promise => { + if (cached) return cached; + + const dataModule = await import("@stll/anonymize-data"); + + const denyList: Record = {}; + const denyListMeta: Record = {}; + const denyListResults = await Promise.all( + [...dataModule.ALL_DICTIONARY_IDS].map(async (id) => ({ + id, + entries: await dataModule.loadDictionary(id), + })), + ); + for (const { id, entries } of denyListResults) { + const meta = dataModule.DICTIONARY_META[id]; + if (!meta) continue; + denyList[id] = entries; + // SAFETY: anonymize-data categories match DenyListCategory at runtime + denyListMeta[id] = meta as DictionaryMeta; + } + + const firstNames: Record = {}; + const surnames: Record = {}; + await Promise.all( + NAME_LANGUAGES.map(async (language) => { + const [first, last] = await Promise.all([ + loadNameDictionary("first", language), + loadNameDictionary("surnames", language), + ]); + if (first) firstNames[language] = first; + if (last) surnames[language] = last; + }), + ); + + const cityResults = await Promise.all( + CITY_COUNTRIES.map(async (country) => ({ + 
country, + entries: await dataModule.loadCityDictionary(country), + })), + ); + const citiesByCountry: Record = {}; + const mergedCities: string[] = []; + for (const { country, entries } of cityResults) { + citiesByCountry[country] = entries; + for (const entry of entries) { + mergedCities.push(entry); + } + } + + cached = { + firstNames, + surnames, + denyList, + denyListMeta, + cities: mergedCities, + citiesByCountry, + }; + return cached; +}; diff --git a/packages/bench/src/fixtures.ts b/packages/bench/src/fixtures.ts new file mode 100644 index 00000000..016ad730 --- /dev/null +++ b/packages/bench/src/fixtures.ts @@ -0,0 +1,58 @@ +import { readdirSync, readFileSync } from "node:fs"; +import { join } from "node:path"; + +import type { BenchSpan, GoldDocument } from "./types"; + +/** + * The bench corpus lives with the anonymize regression suite so the + * same fixtures gate releases and feed the benchmarks. Reference + * annotations come from the human-reviewed `.snapshot.json` sidecars + * maintained by contract-snapshots.test.ts. 
+ */ +const CONTRACTS_DIR = join( + import.meta.dir, + "..", + "..", + "anonymize", + "src", + "__test__", + "fixtures", + "contracts", +); + +type SnapshotFile = { + entities: BenchSpan[]; +}; + +export const loadGoldDocuments = (): GoldDocument[] => { + const docs: GoldDocument[] = []; + for (const language of readdirSync(CONTRACTS_DIR).toSorted()) { + const languageDir = join(CONTRACTS_DIR, language); + for (const file of readdirSync(languageDir).toSorted()) { + if (!file.endsWith(".txt")) continue; + const text = readFileSync(join(languageDir, file), "utf8").replaceAll( + "\r\n", + "\n", + ); + const snapshotPath = join( + languageDir, + file.replace(/\.txt$/u, ".snapshot.json"), + ); + // SAFETY: sidecars are generated by contract-snapshots.test.ts with this shape + const snapshot = JSON.parse( + readFileSync(snapshotPath, "utf8"), + ) as SnapshotFile; + docs.push({ + id: `${language}/${file}`, + language, + text, + gold: snapshot.entities.map(({ start, end, label }) => ({ + start, + end, + label, + })), + }); + } + } + return docs; +}; diff --git a/packages/bench/src/render-results.ts b/packages/bench/src/render-results.ts new file mode 100644 index 00000000..6d6a270c --- /dev/null +++ b/packages/bench/src/render-results.ts @@ -0,0 +1,168 @@ +/** + * Renders results/*.json into results/RESULTS.md. Quality reports + * are discovered by the quality.<tool>.json naming convention so + * external tools added later show up without changes here.
/**
 * Renders results/*.json into results/RESULTS.md. Quality reports
 * are discovered by the quality.<tool>.json naming convention so
 * external tools added later show up without changes here.
 */
import {
  existsSync,
  mkdirSync,
  readdirSync,
  readFileSync,
  writeFileSync,
} from "node:fs";
import { join } from "node:path";

const RESULTS_DIR = join(import.meta.dir, "..", "results");

/** Per-label (or aggregated) scores as serialized by run-quality.ts. */
type LabelMetricsJson = {
  goldCount: number;
  truePositives: number;
  falsePositives: number;
  falseNegatives: number;
  precision: number;
  recall: number;
  f1: number;
};

type ModeReportJson = {
  micro: LabelMetricsJson;
  perLabel: Record<string, LabelMetricsJson>;
  perLanguage: Record<string, LabelMetricsJson>;
};

type QualityReportJson = {
  tool: string;
  generatedAt: string;
  corpus: {
    docs: number;
    docsPerLanguage: Record<string, number>;
    goldEntities: number;
  };
  labelsFilter: string[] | null;
  modes: Record<"exact" | "overlap", ModeReportJson>;
};

type ThroughputReportJson = {
  generatedAt: string;
  environment: { bun: string; platform: string; arch: string; cpu: string };
  settings: { iterations: number; warmup: number };
  oneTime: { dictionaryLoadMs: number; prepareMs: number };
  corpus: {
    docs: number;
    totalChars: number;
    medianPassMs: number;
    charsPerSecond: number;
  };
  documents: {
    id: string;
    language: string;
    chars: number;
    medianMs: number;
    minMs: number;
    maxMs: number;
    charsPerSecond: number;
  }[];
};

/** Formats a 0..1 ratio as a percentage with one decimal. */
const percent = (value: number): string => `${(value * 100).toFixed(1)}%`;
/** Formats a count with en-US thousands separators. */
const integer = (value: number): string => value.toLocaleString("en-US");

/** One markdown table row: name, gold count, precision, recall, F1. */
const metricsRow = (name: string, metrics: LabelMetricsJson): string =>
  `| ${name} | ${integer(metrics.goldCount)} | ${percent(metrics.precision)} | ${percent(metrics.recall)} | ${percent(metrics.f1)} |`;

/**
 * Renders one tool's quality report: a summary line, then for each
 * match mode a per-label table (closed by the micro average) and a
 * per-language table sorted by language code.
 */
const renderQuality = (report: QualityReportJson): string[] => {
  const lines: string[] = [];
  lines.push(`### ${report.tool}`);
  lines.push("");
  const filterNote = report.labelsFilter
    ? ` Scored labels: ${report.labelsFilter.join(", ")}.`
    : "";
  lines.push(
    `${report.corpus.docs} documents, ${integer(report.corpus.goldEntities)} reference entities.${filterNote}`,
  );
  for (const mode of ["exact", "overlap"] as const) {
    const modeReport = report.modes[mode];
    lines.push("");
    lines.push(`#### ${mode} match`);
    lines.push("");
    lines.push("| Label | Gold | Precision | Recall | F1 |");
    lines.push("| --- | ---: | ---: | ---: | ---: |");
    // Labels are rendered in the order they appear in the report file.
    for (const [label, metrics] of Object.entries(modeReport.perLabel)) {
      lines.push(metricsRow(label, metrics));
    }
    lines.push(metricsRow("**all (micro)**", modeReport.micro));
    lines.push("");
    lines.push("| Language | Gold | Precision | Recall | F1 |");
    lines.push("| --- | ---: | ---: | ---: | ---: |");
    for (const [language, metrics] of Object.entries(
      modeReport.perLanguage,
    ).toSorted(([a], [b]) => a.localeCompare(b))) {
      lines.push(metricsRow(language, metrics));
    }
  }
  lines.push("");
  return lines;
};

/**
 * Renders the throughput section: environment, one-time costs, the
 * corpus-level median pass, and a per-document latency table.
 */
const renderThroughput = (report: ThroughputReportJson): string[] => {
  const lines: string[] = [];
  lines.push("## Throughput");
  lines.push("");
  lines.push(
    `Environment: Bun ${report.environment.bun}, ${report.environment.cpu} (${report.environment.platform}/${report.environment.arch}). ` +
      `${report.settings.warmup} warmup + ${report.settings.iterations} measured passes; medians reported.`,
  );
  lines.push("");
  lines.push(
    `One-time costs: dictionary load ${report.oneTime.dictionaryLoadMs.toFixed(0)} ms, search preparation ${report.oneTime.prepareMs.toFixed(0)} ms.`,
  );
  lines.push("");
  lines.push(
    `Corpus: ${report.corpus.docs} documents, ${integer(report.corpus.totalChars)} chars; ` +
      `median full pass ${report.corpus.medianPassMs.toFixed(1)} ms (${integer(report.corpus.charsPerSecond)} chars/s).`,
  );
  lines.push("");
  lines.push("| Document | Chars | Median ms | Min | Max | Chars/s |");
  lines.push("| --- | ---: | ---: | ---: | ---: | ---: |");
  for (const doc of report.documents) {
    lines.push(
      `| ${doc.id} | ${integer(doc.chars)} | ${doc.medianMs.toFixed(1)} | ${doc.minMs.toFixed(1)} | ${doc.maxMs.toFixed(1)} | ${integer(doc.charsPerSecond)} |`,
    );
  }
  lines.push("");
  return lines;
};

// The run-* scripts create results/ on demand. Do the same here so that
// rendering before any benchmark has run writes an empty report instead
// of failing with ENOENT on the directory listing or the final write.
mkdirSync(RESULTS_DIR, { recursive: true });

const lines: string[] = [];
lines.push("# Benchmark results");
lines.push("");
lines.push(
  "Generated by `bun run bench` in `packages/bench`; see README.md for methodology, including how the reference annotations are produced and what they can and cannot tell you.",
);
lines.push("");

// The throughput section is optional: it appears only once throughput.json exists.
const throughputPath = join(RESULTS_DIR, "throughput.json");
if (existsSync(throughputPath)) {
  // SAFETY: written by run-throughput.ts with this shape
  const throughput = JSON.parse(
    readFileSync(throughputPath, "utf8"),
  ) as ThroughputReportJson;
  lines.push(...renderThroughput(throughput));
}

const qualityFiles = readdirSync(RESULTS_DIR)
  .filter((file) => file.startsWith("quality.") && file.endsWith(".json"))
  .toSorted();
if (qualityFiles.length > 0) {
  lines.push("## Quality vs. reference annotations");
  lines.push("");
  lines.push(
    "The reference annotations derive from reviewed pipeline output, so the anonymize score against them is close to perfect by construction; it is a regression signal, not an accuracy claim. Cross-tool rows on the same corpus are the meaningful comparison.",
  );
  lines.push("");
  for (const file of qualityFiles) {
    // SAFETY: written by run-quality.ts with this shape
    const report = JSON.parse(
      readFileSync(join(RESULTS_DIR, file), "utf8"),
    ) as QualityReportJson;
    lines.push(...renderQuality(report));
  }
}

const outPath = join(RESULTS_DIR, "RESULTS.md");
writeFileSync(outPath, `${lines.join("\n")}\n`);
console.log(JSON.stringify({ event: "written", path: outPath }));
/**
 * Scores tool predictions against the reference annotations.
 *
 * Default run executes the anonymize pipeline in-process. Pass
 * --predictions <file> (PredictionsFile shape) to score an
 * external tool's output on the same corpus instead.
 */
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { parseArgs } from "node:util";

import { runAnonymizeAdapter } from "./adapters/anonymize";
import { loadGoldDocuments } from "./fixtures";
import {
  type LabelCounts,
  type LabelMetrics,
  type MatchMode,
  mergeCounts,
  microCounts,
  scoreDocument,
  toMetrics,
} from "./scorer";
import type { PredictionsFile } from "./types";

const MATCH_MODES: readonly MatchMode[] = ["exact", "overlap"];

type ModeReport = {
  micro: LabelMetrics;
  perLabel: Record<string, LabelMetrics>;
  perLanguage: Record<string, LabelMetrics>;
};

type QualityReport = {
  tool: string;
  generatedAt: string;
  corpus: {
    docs: number;
    docsPerLanguage: Record<string, number>;
    goldEntities: number;
  };
  labelsFilter: string[] | null;
  modes: Record<MatchMode, ModeReport>;
};

const { values: args } = parseArgs({
  options: {
    predictions: { type: "string" },
    labels: { type: "string" },
    out: { type: "string" },
  },
});

/**
 * Parses the comma-separated --labels value.
 *
 * Blank entries (from `--labels ""` or a trailing comma) are dropped:
 * an empty-string label matches no span, so keeping it would filter
 * out everything and yield an all-zero report without any warning.
 *
 * @returns The labels to score, or undefined when --labels was not passed.
 * @throws Error when --labels was passed but names no label.
 */
const parseLabelsFilter = (raw: string | undefined): string[] | undefined => {
  if (raw === undefined) return undefined;
  const labels = raw
    .split(",")
    .map((label) => label.trim())
    .filter((label) => label.length > 0);
  if (labels.length === 0) {
    throw new Error("--labels must name at least one label");
  }
  return labels;
};

const labelsFilter = parseLabelsFilter(args.labels);

const docs = loadGoldDocuments();
const predictions: PredictionsFile = args.predictions
  ? // SAFETY: --predictions files are produced by bench adapters with this shape
    (JSON.parse(readFileSync(args.predictions, "utf8")) as PredictionsFile)
  : await runAnonymizeAdapter(docs);

const predictionsById = new Map(
  predictions.docs.map((doc) => [doc.id, doc.entities]),
);

// Refuse to score a partial run: a missing document would otherwise be
// counted as "no predictions" and only show up as lower recall.
const missingDocs = docs.filter((doc) => !predictionsById.has(doc.id));
if (missingDocs.length > 0) {
  const ids = missingDocs.map((doc) => doc.id).join(", ");
  throw new Error(`predictions missing for: ${ids}`);
}

/**
 * Scores every document under one match mode and aggregates the counts
 * per label (sorted by label), per language, and as a micro average
 * over all labels.
 */
const buildModeReport = (mode: MatchMode): ModeReport => {
  const totalCounts = new Map<string, LabelCounts>();
  const languageCounts = new Map<string, Map<string, LabelCounts>>();
  for (const doc of docs) {
    const documentCounts = scoreDocument({
      gold: doc.gold,
      predicted: predictionsById.get(doc.id) ?? [],
      mode,
      labels: labelsFilter,
    });
    mergeCounts(totalCounts, documentCounts);
    const perLanguage =
      languageCounts.get(doc.language) ?? new Map<string, LabelCounts>();
    mergeCounts(perLanguage, documentCounts);
    languageCounts.set(doc.language, perLanguage);
  }

  const perLabel: Record<string, LabelMetrics> = {};
  for (const label of [...totalCounts.keys()].toSorted()) {
    const counts = totalCounts.get(label);
    if (counts) perLabel[label] = toMetrics(counts);
  }
  const perLanguage: Record<string, LabelMetrics> = {};
  for (const [language, counts] of languageCounts) {
    perLanguage[language] = toMetrics(microCounts(counts));
  }
  return { micro: toMetrics(microCounts(totalCounts)), perLabel, perLanguage };
};

const docsPerLanguage: Record<string, number> = {};
for (const doc of docs) {
  docsPerLanguage[doc.language] = (docsPerLanguage[doc.language] ?? 0) + 1;
}

const report: QualityReport = {
  tool: predictions.tool,
  generatedAt: new Date().toISOString(),
  corpus: {
    docs: docs.length,
    docsPerLanguage,
    goldEntities: docs.reduce((sum, doc) => sum + doc.gold.length, 0),
  },
  labelsFilter: labelsFilter ?? null,
  modes: {
    exact: buildModeReport("exact"),
    overlap: buildModeReport("overlap"),
  },
};

// Default output name follows the quality.<tool>.json convention that
// render-results.ts uses to discover reports.
const outPath =
  args.out ??
  join(import.meta.dir, "..", "results", `quality.${predictions.tool}.json`);
mkdirSync(dirname(outPath), { recursive: true });
writeFileSync(outPath, `${JSON.stringify(report, null, 2)}\n`);

for (const mode of MATCH_MODES) {
  const { micro } = report.modes[mode];
  console.log(
    JSON.stringify({
      event: "quality",
      tool: predictions.tool,
      mode,
      precision: micro.precision,
      recall: micro.recall,
      f1: micro.f1,
    }),
  );
}
console.log(JSON.stringify({ event: "written", path: outPath }));
/**
 * Throughput benchmark for the deterministic pipeline (NER off).
 *
 * Measures one-time costs (dictionary load, search preparation) and
 * steady-state per-document latency over the contract corpus:
 * --warmup full passes (default 2), then --iterations measured
 * passes (default 10); medians are reported.
 */
import { mkdirSync, writeFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { arch, cpus, platform } from "node:os";
import { parseArgs } from "node:util";

import {
  createPipelineContext,
  preparePipelineSearch,
  runPipeline,
  type PipelineConfig,
} from "@stll/anonymize";

import { BENCH_PIPELINE_CONFIG } from "./adapters/anonymize";
import { loadBenchDictionaries } from "./dictionaries";
import { loadGoldDocuments } from "./fixtures";

const DEFAULT_ITERATIONS = 10;
const DEFAULT_WARMUP = 2;

type DocumentStats = {
  id: string;
  language: string;
  chars: number;
  medianMs: number;
  minMs: number;
  maxMs: number;
  charsPerSecond: number;
};

type ThroughputReport = {
  generatedAt: string;
  environment: {
    bun: string;
    platform: string;
    arch: string;
    cpu: string;
  };
  settings: { iterations: number; warmup: number };
  oneTime: { dictionaryLoadMs: number; prepareMs: number };
  corpus: {
    docs: number;
    totalChars: number;
    medianPassMs: number;
    charsPerSecond: number;
  };
  documents: DocumentStats[];
};

const { values: args } = parseArgs({
  options: {
    iterations: { type: "string" },
    warmup: { type: "string" },
    out: { type: "string" },
  },
});

const iterations = Number(args.iterations ?? DEFAULT_ITERATIONS);
const warmup = Number(args.warmup ?? DEFAULT_WARMUP);
if (!Number.isInteger(iterations) || iterations < 1) {
  throw new Error(`--iterations must be a positive integer`);
}
if (!Number.isInteger(warmup) || warmup < 0) {
  throw new Error(`--warmup must be a non-negative integer`);
}

/** Milliseconds elapsed since a `Bun.nanoseconds()` reading. */
const elapsedMs = (startNs: number): number =>
  (Bun.nanoseconds() - startNs) / 1_000_000;

/**
 * Median of the samples; the mean of the two middle values for an
 * even count, and 0 for an empty list.
 */
const median = (samples: number[]): number => {
  const sorted = samples.toSorted((a, b) => a - b);
  const middle = Math.floor(sorted.length / 2);
  // For an odd count lower and upper are the same element.
  const lower = sorted.at(middle - (sorted.length % 2 === 0 ? 1 : 0)) ?? 0;
  const upper = sorted.at(middle) ?? 0;
  return (lower + upper) / 2;
};

/** Rounds to three decimals (microsecond resolution) for the report. */
const roundMs = (ms: number): number => Math.round(ms * 1_000) / 1_000;

/**
 * Characters processed per second, rounded to an integer.
 *
 * Returns 0 when the duration is not positive (for example an empty
 * corpus, where every pass sums to 0 ms). The plain division would
 * give NaN or Infinity, which JSON.stringify writes as null, and
 * render-results.ts cannot format a null value.
 */
const toCharsPerSecond = (chars: number, ms: number): number =>
  ms > 0 ? Math.round(chars / (ms / 1_000)) : 0;

const docs = loadGoldDocuments();

const dictionaryStart = Bun.nanoseconds();
const dictionaries = await loadBenchDictionaries();
const dictionaryLoadMs = elapsedMs(dictionaryStart);

const config: PipelineConfig = { ...BENCH_PIPELINE_CONFIG, dictionaries };
const context = createPipelineContext();
const prepareStart = Bun.nanoseconds();
await preparePipelineSearch({ config, context });
const prepareMs = elapsedMs(prepareStart);

/** Runs the pipeline on one document, reusing the prepared context. */
const runDocument = async (text: string): Promise<void> => {
  await runPipeline({
    fullText: text,
    config,
    gazetteerEntries: [],
    context,
  });
};

// Warmup passes are executed but not timed.
for (let pass = 0; pass < warmup; pass += 1) {
  for (const doc of docs) {
    await runDocument(doc.text);
  }
}

const samplesByDoc = new Map<string, number[]>(
  docs.map((doc) => [doc.id, []]),
);
const passSamples: number[] = [];
for (let pass = 0; pass < iterations; pass += 1) {
  // A pass is the sum of its per-document timings, so loop overhead
  // between documents is not counted.
  let passMs = 0;
  for (const doc of docs) {
    const start = Bun.nanoseconds();
    await runDocument(doc.text);
    const ms = elapsedMs(start);
    passMs += ms;
    samplesByDoc.get(doc.id)?.push(ms);
  }
  passSamples.push(passMs);
}

const documents: DocumentStats[] = docs.map((doc) => {
  const samples = samplesByDoc.get(doc.id) ?? [];
  const medianMs = median(samples);
  return {
    id: doc.id,
    language: doc.language,
    chars: doc.text.length,
    medianMs: roundMs(medianMs),
    minMs: roundMs(Math.min(...samples)),
    maxMs: roundMs(Math.max(...samples)),
    charsPerSecond: toCharsPerSecond(doc.text.length, medianMs),
  };
});

const totalChars = docs.reduce((sum, doc) => sum + doc.text.length, 0);
const medianPassMs = median(passSamples);

const report: ThroughputReport = {
  generatedAt: new Date().toISOString(),
  environment: {
    bun: Bun.version,
    platform: platform(),
    arch: arch(),
    cpu: cpus().at(0)?.model ?? "unknown",
  },
  settings: { iterations, warmup },
  oneTime: {
    dictionaryLoadMs: roundMs(dictionaryLoadMs),
    prepareMs: roundMs(prepareMs),
  },
  corpus: {
    docs: docs.length,
    totalChars,
    medianPassMs: roundMs(medianPassMs),
    charsPerSecond: toCharsPerSecond(totalChars, medianPassMs),
  },
  documents,
};

const outPath =
  args.out ?? join(import.meta.dir, "..", "results", "throughput.json");
mkdirSync(dirname(outPath), { recursive: true });
writeFileSync(outPath, `${JSON.stringify(report, null, 2)}\n`);

console.log(
  JSON.stringify({
    event: "throughput",
    medianPassMs: report.corpus.medianPassMs,
    charsPerSecond: report.corpus.charsPerSecond,
    dictionaryLoadMs: report.oneTime.dictionaryLoadMs,
    prepareMs: report.oneTime.prepareMs,
  }),
);
console.log(JSON.stringify({ event: "written", path: outPath }));
/**
 * exact: a prediction counts only when label, start, and end all match.
 * overlap: a prediction counts when the label matches and the spans
 * share at least one character; for anonymization a partial hit still
 * redacts part of the value, but exact mode is the honest headline.
 */
export type MatchMode = "exact" | "overlap";

export type LabelCounts = {
  truePositives: number;
  falsePositives: number;
  falseNegatives: number;
};

export type LabelMetrics = LabelCounts & {
  goldCount: number;
  precision: number;
  recall: number;
  f1: number;
};

type ScoreDocumentOptions = {
  gold: BenchSpan[];
  predicted: BenchSpan[];
  mode: MatchMode;
  /** Restrict scoring to these labels; both sides are filtered. */
  labels?: readonly string[] | undefined;
};

/** Shared length of two spans; zero or negative when they do not overlap. */
const overlapLength = (a: BenchSpan, b: BenchSpan): number => {
  const sharedStart = Math.max(a.start, b.start);
  const sharedEnd = Math.min(a.end, b.end);
  return sharedEnd - sharedStart;
};

/** Buckets spans by label, keeping input order inside each bucket. */
const groupByLabel = (spans: BenchSpan[]): Map<string, BenchSpan[]> => {
  const byLabel = new Map<string, BenchSpan[]>();
  for (const span of spans) {
    let bucket = byLabel.get(span.label);
    if (bucket === undefined) {
      bucket = [];
      byLabel.set(span.label, bucket);
    }
    bucket.push(span);
  }
  return byLabel;
};

/**
 * Counts one-to-one matches between same-label gold and predicted spans.
 *
 * Gold spans are processed by ascending start. In exact mode each takes
 * the first unclaimed prediction with identical bounds; otherwise it
 * takes the unclaimed prediction sharing the most characters (the
 * earliest wins ties, and a shared length of zero never matches).
 */
const countLabelMatches = (
  gold: BenchSpan[],
  predicted: BenchSpan[],
  mode: MatchMode,
): number => {
  const claimed = new Set<number>();

  const pickExact = (target: BenchSpan): number =>
    predicted.findIndex(
      (candidate, index) =>
        !claimed.has(index) &&
        candidate.start === target.start &&
        candidate.end === target.end,
    );

  const pickLargestOverlap = (target: BenchSpan): number => {
    let winner = -1;
    let widest = 0;
    predicted.forEach((candidate, index) => {
      if (claimed.has(index)) return;
      const shared = overlapLength(target, candidate);
      if (shared > widest) {
        widest = shared;
        winner = index;
      }
    });
    return winner;
  };

  const pick = mode === "exact" ? pickExact : pickLargestOverlap;
  const goldInOrder = [...gold].sort((a, b) => a.start - b.start);

  let matched = 0;
  for (const target of goldInOrder) {
    const index = pick(target);
    if (index === -1) continue;
    claimed.add(index);
    matched += 1;
  }
  return matched;
};

/**
 * Scores one document: for every label present on either side, the
 * number of matched predictions (true positives), unmatched predictions
 * (false positives), and unmatched gold spans (false negatives).
 */
export const scoreDocument = ({
  gold,
  predicted,
  mode,
  labels,
}: ScoreDocumentOptions): Map<string, LabelCounts> => {
  const allowed = labels ? new Set(labels) : null;
  const inScope = (span: BenchSpan): boolean =>
    allowed === null || allowed.has(span.label);

  const goldByLabel = groupByLabel(gold.filter(inScope));
  const predictedByLabel = groupByLabel(predicted.filter(inScope));

  const result = new Map<string, LabelCounts>();
  // Gold labels come first, then labels that only the predictions use.
  const everyLabel = new Set([
    ...goldByLabel.keys(),
    ...predictedByLabel.keys(),
  ]);
  for (const label of everyLabel) {
    const goldForLabel = goldByLabel.get(label) ?? [];
    const predictedForLabel = predictedByLabel.get(label) ?? [];
    const hits = countLabelMatches(goldForLabel, predictedForLabel, mode);
    result.set(label, {
      truePositives: hits,
      falsePositives: predictedForLabel.length - hits,
      falseNegatives: goldForLabel.length - hits,
    });
  }
  return result;
};

/**
 * Adds the counts in `from` onto `into`, label by label. Labels new to
 * `into` are copied so later merges never mutate `from`.
 */
export const mergeCounts = (
  into: Map<string, LabelCounts>,
  from: Map<string, LabelCounts>,
): void => {
  for (const [label, delta] of from) {
    const running = into.get(label);
    if (running) {
      running.truePositives += delta.truePositives;
      running.falsePositives += delta.falsePositives;
      running.falseNegatives += delta.falseNegatives;
    } else {
      into.set(label, { ...delta });
    }
  }
};

/**
 * Derives gold count, precision, recall, and F1 from raw counts. Any
 * ratio with a zero denominator is reported as 0.
 */
export const toMetrics = ({
  truePositives,
  falsePositives,
  falseNegatives,
}: LabelCounts): LabelMetrics => {
  const ratio = (numerator: number, denominator: number): number =>
    denominator === 0 ? 0 : numerator / denominator;

  const goldCount = truePositives + falseNegatives;
  const precision = ratio(truePositives, truePositives + falsePositives);
  const recall = ratio(truePositives, goldCount);
  const f1 = ratio(2 * precision * recall, precision + recall);
  return {
    truePositives,
    falsePositives,
    falseNegatives,
    goldCount,
    precision,
    recall,
    f1,
  };
};

/** Sums the counts of every label into one total for micro-averaging. */
export const microCounts = (counts: Map<string, LabelCounts>): LabelCounts => {
  let truePositives = 0;
  let falsePositives = 0;
  let falseNegatives = 0;
  for (const perLabel of counts.values()) {
    truePositives += perLabel.truePositives;
    falsePositives += perLabel.falsePositives;
    falseNegatives += perLabel.falseNegatives;
  }
  return { truePositives, falsePositives, falseNegatives };
};

/** A labeled character span; offsets are UTF-16 code units into the document text. */
export type BenchSpan = {
  start: number;
  end: number;
  label: string;
};

export type GoldDocument = {
  /** Path relative to the contracts fixture root, e.g. "cs/sanofi-bonus-agreement.txt". */
  id: string;
  language: string;
  text: string;
  gold: BenchSpan[];
};

export type PredictionsDocument = {
  id: string;
  entities: BenchSpan[];
};

/**
 * Interchange format for tool outputs. External tools (Presidio,
 * redact-pii, ...) produce this shape so every tool is scored by
 * the same scorer against the same reference annotations.
 */
export type PredictionsFile = {
  tool: string;
  docs: PredictionsDocument[];
};