diff --git a/README.md b/README.md index 6cdb1783..bf126ed8 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,17 @@ echo "Contact Jan Novák at jan.novak@example.com" | bunx @stll/anonymize-cli - NER, coreference handling, and confidence boosting - Native, browser, and Vite-compatible entrypoints +## Benchmarks + +[`packages/bench`](packages/bench) holds reproducible throughput and +quality benchmarks for the deterministic pipeline, plus comparison +runs of Microsoft Presidio and compromise on the same legal-contract +corpus scored by the same scorer. See +[`packages/bench/results/RESULTS.md`](packages/bench/results/RESULTS.md) +for current numbers and +[`packages/bench/README.md`](packages/bench/README.md) for the +methodology and its limits. + ## Development ```bash @@ -70,3 +81,4 @@ bun run hooks:install - [`packages/anonymize`](packages/anonymize) - [`packages/data`](packages/data) - [`packages/anonymize/wasm`](packages/anonymize/wasm) +- [`packages/bench`](packages/bench) diff --git a/bun.lock b/bun.lock index c4d022c5..638359b2 100644 --- a/bun.lock +++ b/bun.lock @@ -64,6 +64,7 @@ "devDependencies": { "@types/node": "^25.9.2", "bun-types": "^1.3.14", + "compromise": "^14.15.1", "typescript": "^6.0.3", }, }, @@ -346,12 +347,16 @@ "cac": ["cac@7.0.0", "", {}, "sha512-tixWYgm5ZoOD+3g6UTea91eow5z6AAHaho3g0V9CNSNb45gM8SmflpAc+GRd1InC4AqN/07Unrgp56Y94N9hJQ=="], + "compromise": ["compromise@14.15.1", "", { "dependencies": { "efrt": "2.7.0", "grad-school": "0.0.5", "suffix-thumb": "5.0.2" } }, "sha512-9F3UkUaEU1PPz2fgStkE/TI4tk++0wHxS8xfWq9PQWL/v28dy8bEcPVVSLh3dISIRD7PEhJ8YTzHRKF8y9tnLA=="], + "defu": ["defu@6.1.7", "", {}, "sha512-7z22QmUWiQ/2d0KkdYmANbRUVABpZ9SNYyH5vx6PZ+nE5bcC0l7uFvEfHlyld/HcGBFTL536ClDt3DEcSlEJAQ=="], "detect-libc": ["detect-libc@2.1.2", "", {}, "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ=="], "dts-resolver": ["dts-resolver@3.0.0", "", { "peerDependencies": { "oxc-resolver": ">=11.0.0" }, "optionalPeers": ["oxc-resolver"] }, "sha512-1T1f+z+4tl9XD+m+0HBgWoL/nm0bOIffyWaUuUSBlFg/86IWvfx+wjNaO/ybU0AJzG9/Mi5hBUgGV6zCmWEN7Q=="], + "efrt": ["efrt@2.7.0", "", {}, "sha512-/RInbCy1d4P6Zdfa+TMVsf/ufZVotat5hCw3QXmWtjU+3pFEOvOQ7ibo3aIxyCJw2leIeAMjmPj+1SLJiCpdrQ=="], + "empathic": ["empathic@2.0.1", "", {}, "sha512-YGRs8knHhKHVShLkFET/rWAU8kmHbOV5LwN938RHI0pljAJ1Gf6SzXsSmRaEzcXTtOOmVqJ5+WtQPL5uigY50Q=="], "estree-walker": ["estree-walker@3.0.3", "", { "dependencies": { "@types/estree": "^1.0.0" } }, "sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g=="], @@ -362,6 +367,8 @@ "get-tsconfig": ["get-tsconfig@5.0.0-beta.5", "", { "dependencies": { "resolve-pkg-maps": "^1.0.0" } }, "sha512-/6gFNr0N04nob252sTQxyFLi3eKFRqIg1I87YcqAMT1i6SQrSF6KujUEQrtrjMV0H/eejTCltLdDSTEMzHbnsQ=="], + "grad-school": ["grad-school@0.0.5", "", {}, "sha512-rXunEHF9M9EkMydTBux7+IryYXEZinRk6g8OBOGDBzo/qWJjhTxy86i5q7lQYpCLHN8Sqv1XX3OIOc7ka2gtvQ=="], + "hookable": ["hookable@6.1.1", "", {}, "sha512-U9LYDy1CwhMCnprUfeAZWZGByVbhd54hwepegYTK7Pi5NvqEj63ifz5z+xukznehT7i6NIZRu89Ay1AZmRsLEQ=="], "import-without-cache": ["import-without-cache@0.4.0", "", {}, "sha512-NkJQA7oZ4YHQhd2+H3BoRFKF3d/XNsiKpHZCQEMH9pDX27hQQLsTyOocyRgaIVtf8gHX3Nt3LPkR4e5EdtPAGQ=="], @@ -446,6 +453,8 @@ "stopwords-iso": ["stopwords-iso@1.1.0", "", {}, "sha512-I6GPS/E0zyieHehMRPQcqkiBMJKGgLta+1hREixhoLPqEA0AlVFiC43dl8uPpmkkeRdDMzYRWFWk5/l9x7nmNg=="], + "suffix-thumb": ["suffix-thumb@5.0.2", "", {}, "sha512-I5PWXAFKx3FYnI9a+dQMWNqTxoRt6vdBdb0O+BJ1sxXCWtSoQCusc13E58f+9p4MYx/qCnEMkD5jac6K2j3dgA=="], + "tinyexec": ["tinyexec@1.1.2", "", {}, "sha512-dAqSqE/RabpBKI8+h26GfLq6Vb3JVXs30XYQjdMjaj/c2tS8IYYMbIzP599KtRj7c57/wYApb3QjgRgXmrCukA=="], "tinyglobby": ["tinyglobby@0.2.16", "", { "dependencies": { "fdir": "^6.5.0", "picomatch": "^4.0.4" } }, "sha512-pn99VhoACYR8nFHhxqix+uvsbXineAasWm5ojXoN8xEwK5Kd3/TrhNn1wByuD52UxWRLy8pu+kRMniEi6Eq9Zg=="], diff --git a/packages/bench/README.md b/packages/bench/README.md index 9adaf483..f1c7a674 100644 --- a/packages/bench/README.md +++ b/packages/bench/README.md @@ -97,6 +97,65 @@ bun run bench:quality -- --predictions path/to/predictions.json \ bun run bench:render ``` +## Comparison runs + +Committed results include two external tools run on the same corpus +and scored by the same scorer. Both runs are restricted (via +`--labels`) to labels the tool claims to detect, so micro averages +are not comparable across tools with different filters; compare per +label. + +### Microsoft Presidio + +`comparison/presidio/run.py` (pinned deps in `requirements.txt`) +runs `presidio-analyzer` with its documented spaCy defaults +(`en_core_web_lg`, `de_core_news_lg`) and writes the interchange +format. Scored labels: person, organization, email address, phone +number, date. + +Read the numbers with these caveats: + +- **Czech is skipped entirely**: Presidio has no Czech language + support, so 8 of 13 corpus documents cannot be processed at all. +- **Organizations are enabled deliberately.** Presidio ignores + spaCy `ORG` spans by default because they are noisy; the run + enables them because organizations are unavoidable in legal + contracts. The resulting false-positive count shows why the + default exists. +- **`DATE_TIME` is broader than the reference `date` label** (it + also matches durations and relative time), which depresses + Presidio's date precision; this is a label-mapping asymmetry, not + purely a detection failure. +- Labels Presidio has no recognizers for on this corpus + (registration numbers, tax identifiers, monetary amounts, + addresses as street-level spans) are excluded rather than scored + as zero. + +Reproduce: + +```sh +python3 -m venv .venv && .venv/bin/pip install -r comparison/presidio/requirements.txt +.venv/bin/python -m spacy download en_core_web_lg +.venv/bin/python -m spacy download de_core_news_lg +.venv/bin/python comparison/presidio/run.py +bun src/run-quality.ts --predictions results/predictions.presidio.json \ + --labels "person,organization,email address,phone number,date" +bun run bench:render +``` + +### compromise + +`src/run-compromise.ts` runs the compromise NLP library (the +closest JS-ecosystem baseline that reports spans) on the English +documents only; scored labels: person, organization. + +```sh +bun src/run-compromise.ts +bun src/run-quality.ts --predictions results/predictions.compromise.json \ + --labels "person,organization" +bun run bench:render +``` + ## Throughput methodology One-time costs (dictionary load, search automaton preparation) are diff --git a/packages/bench/comparison/presidio/requirements.txt b/packages/bench/comparison/presidio/requirements.txt new file mode 100644 index 00000000..85af9f81 --- /dev/null +++ b/packages/bench/comparison/presidio/requirements.txt @@ -0,0 +1,5 @@ +presidio-analyzer==2.2.359 +spacy==3.8.13 +# Models (installed via `python -m spacy download `): +# en_core_web_lg +# de_core_news_lg diff --git a/packages/bench/comparison/presidio/run.py b/packages/bench/comparison/presidio/run.py new file mode 100644 index 00000000..727cace2 --- /dev/null +++ b/packages/bench/comparison/presidio/run.py @@ -0,0 +1,112 @@ +"""Runs Microsoft Presidio over the bench contract corpus and writes +predictions in the bench interchange format (packages/bench/README.md). + +Czech fixtures are skipped: Presidio has no Czech language support +(no spaCy model and no Czech recognizers); that absence is reported +in the results rather than scored as zero. + +Offsets are converted from Python code-point indices to UTF-16 code +units to match the reference annotations. + +Usage: + python run.py [--out ../../results/predictions.presidio.json] +""" + +import argparse +import json +from pathlib import Path + +from presidio_analyzer import AnalyzerEngine +from presidio_analyzer.nlp_engine import NlpEngineProvider + +LANGUAGE_MODELS = {"en": "en_core_web_lg", "de": "de_core_news_lg"} + +LABEL_MAP = { + "PERSON": "person", + "ORGANIZATION": "organization", + "EMAIL_ADDRESS": "email address", + "PHONE_NUMBER": "phone number", + "DATE_TIME": "date", +} + +FIXTURES_DIR = ( + Path(__file__).resolve().parents[3] + / "anonymize" + / "src" + / "__test__" + / "fixtures" + / "contracts" +) +DEFAULT_OUT = ( + Path(__file__).resolve().parents[2] / "results" / "predictions.presidio.json" +) + + +def utf16_offsets(text: str) -> list[int]: + """Cumulative UTF-16 code-unit offset for each code-point index.""" + offsets = [0] * (len(text) + 1) + for index, char in enumerate(text): + offsets[index + 1] = offsets[index] + (2 if ord(char) > 0xFFFF else 1) + return offsets + + +def build_analyzer() -> AnalyzerEngine: + configuration = { + "nlp_engine_name": "spacy", + "models": [ + {"lang_code": lang, "model_name": model} + for lang, model in LANGUAGE_MODELS.items() + ], + # Default Presidio config ignores ORG spans from spaCy; the + # comparison needs organizations, so keep only the truly + # non-PII tags ignored. + "ner_model_configuration": { + "labels_to_ignore": ["CARDINAL", "ORDINAL", "QUANTITY", "PERCENT"], + }, + } + provider = NlpEngineProvider(nlp_configuration=configuration) + return AnalyzerEngine( + nlp_engine=provider.create_engine(), + supported_languages=list(LANGUAGE_MODELS), + ) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--out", type=Path, default=DEFAULT_OUT) + args = parser.parse_args() + + analyzer = build_analyzer() + docs = [] + for language_dir in sorted(FIXTURES_DIR.iterdir()): + language = language_dir.name + if language not in LANGUAGE_MODELS: + print(f"skipping {language}: no Presidio language support") + continue + for fixture in sorted(language_dir.glob("*.txt")): + text = fixture.read_text(encoding="utf-8").replace("\r\n", "\n") + offsets = utf16_offsets(text) + results = analyzer.analyze( + text=text, language=language, entities=list(LABEL_MAP) + ) + entities = [ + { + "start": offsets[result.start], + "end": offsets[result.end], + "label": LABEL_MAP[result.entity_type], + } + for result in results + ] + docs.append({"id": f"{language}/{fixture.name}", "entities": entities}) + print(f"{language}/{fixture.name}: {len(entities)} entities") + + args.out.parent.mkdir(parents=True, exist_ok=True) + args.out.write_text( + json.dumps({"tool": "presidio", "docs": docs}, indent=2) + "\n", + encoding="utf-8", + ) + print(f"written: {args.out}") + + +if __name__ == "__main__": + main() diff --git a/packages/bench/package.json b/packages/bench/package.json index 809f8904..ea67da99 100644 --- a/packages/bench/package.json +++ b/packages/bench/package.json @@ -21,6 +21,7 @@ "devDependencies": { "@types/node": "^25.9.2", "bun-types": "^1.3.14", + "compromise": "^14.15.1", "typescript": "^6.0.3" } } diff --git a/packages/bench/results/RESULTS.md b/packages/bench/results/RESULTS.md index 96dcc5ec..1eae3202 100644 --- a/packages/bench/results/RESULTS.md +++ b/packages/bench/results/RESULTS.md @@ -83,3 +83,71 @@ The reference annotations derive from reviewed pipeline output, so the anonymize | cs | 207 | 100.0% | 100.0% | 100.0% | | de | 24 | 100.0% | 100.0% | 100.0% | | en | 101 | 100.0% | 100.0% | 100.0% | + +### compromise + +4 documents, 101 reference entities. Scored labels: person, organization. + +Skipped 9 corpus documents (no support for: cs, de). + +#### exact match + +| Label | Gold | Precision | Recall | F1 | +| --------------- | ---: | --------: | -----: | ----: | +| organization | 19 | 12.5% | 15.8% | 14.0% | +| person | 19 | 40.0% | 63.2% | 49.0% | +| **all (micro)** | 38 | 27.8% | 39.5% | 32.6% | + +| Language | Gold | Precision | Recall | F1 | +| -------- | ---: | --------: | -----: | ----: | +| en | 38 | 27.8% | 39.5% | 32.6% | + +#### overlap match + +| Label | Gold | Precision | Recall | F1 | +| --------------- | ---: | --------: | -----: | ----: | +| organization | 19 | 58.3% | 73.7% | 65.1% | +| person | 19 | 53.3% | 84.2% | 65.3% | +| **all (micro)** | 38 | 55.6% | 78.9% | 65.2% | + +| Language | Gold | Precision | Recall | F1 | +| -------- | ---: | --------: | -----: | ----: | +| en | 38 | 55.6% | 78.9% | 65.2% | + +### presidio + +5 documents, 125 reference entities. Scored labels: person, organization, email address, phone number, date. + +Skipped 8 corpus documents (no support for: cs). + +#### exact match + +| Label | Gold | Precision | Recall | F1 | +| --------------- | ---: | --------: | -----: | ----: | +| date | 27 | 14.4% | 51.9% | 22.6% | +| email address | 1 | 0.0% | 0.0% | 0.0% | +| organization | 23 | 6.9% | 60.9% | 12.4% | +| person | 24 | 59.3% | 66.7% | 62.7% | +| phone number | 1 | 0.0% | 0.0% | 0.0% | +| **all (micro)** | 76 | 13.4% | 57.9% | 21.8% | + +| Language | Gold | Precision | Recall | F1 | +| -------- | ---: | --------: | -----: | ----: | +| de | 12 | 30.0% | 25.0% | 27.3% | +| en | 64 | 12.9% | 64.1% | 21.5% | + +#### overlap match + +| Label | Gold | Precision | Recall | F1 | +| --------------- | ---: | --------: | -----: | ----: | +| date | 27 | 23.7% | 85.2% | 37.1% | +| email address | 1 | 0.0% | 0.0% | 0.0% | +| organization | 23 | 9.4% | 82.6% | 16.9% | +| person | 24 | 81.5% | 91.7% | 86.3% | +| phone number | 1 | 50.0% | 100.0% | 66.7% | +| **all (micro)** | 76 | 19.8% | 85.5% | 32.2% | + +| Language | Gold | Precision | Recall | F1 | +| -------- | ---: | --------: | -----: | ----: | +| de | 12 | 60.0% | 50.0% | 54.5% | +| en | 64 | 18.6% | 92.2% | 30.9% | diff --git a/packages/bench/results/predictions.compromise.json b/packages/bench/results/predictions.compromise.json new file mode 100644 index 00000000..a72412aa --- /dev/null +++ b/packages/bench/results/predictions.compromise.json @@ -0,0 +1,295 @@ +{ + "tool": "compromise", + "docs": [ + { + "id": "en/gt-biopharma-employment-amendment.txt", + "entities": [ + { + "start": 384, + "end": 397, + "label": "person" + }, + { + "start": 4291, + "end": 4306, + "label": "person" + }, + { + "start": 4565, + "end": 4575, + "label": "person" + }, + { + "start": 4591, + "end": 4595, + "label": "person" + }, + { + "start": 4688, + "end": 4701, + "label": "person" + }, + { + "start": 4717, + "end": 4724, + "label": "person" + }, + { + "start": 328, + "end": 333, + "label": "organization" + }, + { + "start": 1379, + "end": 1399, + "label": "organization" + }, + { + "start": 1895, + "end": 1915, + "label": "organization" + }, + { + "start": 4538, + "end": 4543, + "label": "organization" + } + ] + }, + { + "id": "en/healthcare-trust-employment-amendment.txt", + "entities": [ + { + "start": 129, + "end": 146, + "label": "person" + }, + { + "start": 398, + "end": 416, + "label": "person" + }, + { + "start": 2652, + "end": 2665, + "label": "person" + }, + { + "start": 2679, + "end": 2692, + "label": "person" + }, + { + "start": 2769, + "end": 2787, + "label": "person" + }, + { + "start": 2792, + "end": 2810, + "label": "person" + }, + { + "start": 2898, + "end": 2918, + "label": "person" + }, + { + "start": 3207, + "end": 3225, + "label": "person" + }, + { + "start": 8285, + "end": 8298, + "label": "person" + }, + { + "start": 8510, + "end": 8523, + "label": "person" + }, + { + "start": 8601, + "end": 8619, + "label": "person" + }, + { + "start": 338, + "end": 343, + "label": "organization" + }, + { + "start": 538, + "end": 554, + "label": "organization" + }, + { + "start": 2628, + "end": 2632, + "label": "organization" + }, + { + "start": 3147, + "end": 3152, + "label": "organization" + }, + { + "start": 3346, + "end": 3362, + "label": "organization" + }, + { + "start": 8476, + "end": 8480, + "label": "organization" + } + ] + }, + { + "id": "en/pra-group-employment-agreement.txt", + "entities": [ + { + "start": 2782, + "end": 2791, + "label": "person" + }, + { + "start": 36471, + "end": 36479, + "label": "person" + }, + { + "start": 36496, + "end": 36511, + "label": "person" + }, + { + "start": 39213, + "end": 39222, + "label": "person" + }, + { + "start": 39861, + "end": 39870, + "label": "person" + }, + { + "start": 191, + "end": 206, + "label": "organization" + }, + { + "start": 288, + "end": 293, + "label": "organization" + }, + { + "start": 1859, + "end": 1880, + "label": "organization" + }, + { + "start": 3614, + "end": 3636, + "label": "organization" + }, + { + "start": 20259, + "end": 20264, + "label": "organization" + }, + { + "start": 25424, + "end": 25446, + "label": "organization" + }, + { + "start": 36422, + "end": 36437, + "label": "organization" + }, + { + "start": 39935, + "end": 39968, + "label": "organization" + }, + { + "start": 48165, + "end": 48180, + "label": "organization" + } + ] + }, + { + "id": "en/software-license-agreement.txt", + "entities": [ + { + "start": 162, + "end": 180, + "label": "person" + }, + { + "start": 386, + "end": 407, + "label": "person" + }, + { + "start": 627, + "end": 652, + "label": "person" + }, + { + "start": 756, + "end": 770, + "label": "person" + }, + { + "start": 1013, + "end": 1027, + "label": "person" + }, + { + "start": 1572, + "end": 1590, + "label": "person" + }, + { + "start": 1972, + "end": 1990, + "label": "person" + }, + { + "start": 2165, + "end": 2185, + "label": "person" + }, + { + "start": 181, + "end": 186, + "label": "organization" + }, + { + "start": 969, + "end": 991, + "label": "organization" + }, + { + "start": 1388, + "end": 1415, + "label": "organization" + }, + { + "start": 1591, + "end": 1620, + "label": "organization" + }, + { + "start": 1991, + "end": 1995, + "label": "organization" + } + ] + } + ] +} diff --git a/packages/bench/results/predictions.presidio.json b/packages/bench/results/predictions.presidio.json new file mode 100644 index 00000000..daa1c05b --- /dev/null +++ b/packages/bench/results/predictions.presidio.json @@ -0,0 +1,1670 @@ +{ + "tool": "presidio", + "docs": [ + { + "id": "de/geschaeftsfuehrer-dienstvertrag.txt", + "entities": [ + { + "start": 255, + "end": 281, + "label": "person" + }, + { + "start": 417, + "end": 437, + "label": "person" + }, + { + "start": 673, + "end": 722, + "label": "organization" + }, + { + "start": 727, + "end": 739, + "label": "organization" + }, + { + "start": 778, + "end": 802, + "label": "organization" + }, + { + "start": 1142, + "end": 1162, + "label": "person" + }, + { + "start": 1206, + "end": 1217, + "label": "organization" + }, + { + "start": 1226, + "end": 1240, + "label": "organization" + }, + { + "start": 1833, + "end": 1853, + "label": "person" + }, + { + "start": 1874, + "end": 1890, + "label": "person" + } + ] + }, + { + "id": "en/gt-biopharma-employment-amendment.txt", + "entities": [ + { + "start": 0, + "end": 7, + "label": "organization" + }, + { + "start": 208, + "end": 223, + "label": "date" + }, + { + "start": 248, + "end": 262, + "label": "date" + }, + { + "start": 314, + "end": 332, + "label": "organization" + }, + { + "start": 384, + "end": 397, + "label": "person" + }, + { + "start": 527, + "end": 543, + "label": "date" + }, + { + "start": 608, + "end": 621, + "label": "date" + }, + { + "start": 626, + "end": 642, + "label": "date" + }, + { + "start": 1060, + "end": 1066, + "label": "organization" + }, + { + "start": 1301, + "end": 1307, + "label": "organization" + }, + { + "start": 1356, + "end": 1365, + "label": "organization" + }, + { + "start": 1379, + "end": 1412, + "label": "organization" + }, + { + "start": 1895, + "end": 1928, + "label": "organization" + }, + { + "start": 2380, + "end": 2386, + "label": "organization" + }, + { + "start": 2686, + "end": 2697, + "label": "date" + }, + { + "start": 2698, + "end": 2707, + "label": "date" + }, + { + "start": 2787, + "end": 2795, + "label": "date" + }, + { + "start": 2833, + "end": 2840, + "label": "date" + }, + { + "start": 3812, + "end": 3836, + "label": "organization" + }, + { + "start": 4323, + "end": 4333, + "label": "person" + }, + { + "start": 4565, + "end": 4575, + "label": "person" + }, + { + "start": 4688, + "end": 4701, + "label": "person" + } + ] + }, + { + "id": "en/healthcare-trust-employment-amendment.txt", + "entities": [ + { + "start": 129, + "end": 144, + "label": "person" + }, + { + "start": 216, + "end": 240, + "label": "organization" + }, + { + "start": 279, + "end": 287, + "label": "date" + }, + { + "start": 289, + "end": 293, + "label": "date" + }, + { + "start": 309, + "end": 342, + "label": "organization" + }, + { + "start": 368, + "end": 390, + "label": "organization" + }, + { + "start": 398, + "end": 416, + "label": "person" + }, + { + "start": 473, + "end": 480, + "label": "organization" + }, + { + "start": 538, + "end": 554, + "label": "organization" + }, + { + "start": 625, + "end": 637, + "label": "date" + }, + { + "start": 681, + "end": 694, + "label": "date" + }, + { + "start": 699, + "end": 713, + "label": "date" + }, + { + "start": 729, + "end": 749, + "label": "organization" + }, + { + "start": 787, + "end": 794, + "label": "organization" + }, + { + "start": 838, + "end": 874, + "label": "organization" + }, + { + "start": 1155, + "end": 1162, + "label": "organization" + }, + { + "start": 1271, + "end": 1282, + "label": "organization" + }, + { + "start": 1493, + "end": 1501, + "label": "date" + }, + { + "start": 1503, + "end": 1507, + "label": "date" + }, + { + "start": 1733, + "end": 1740, + "label": "organization" + }, + { + "start": 2247, + "end": 2256, + "label": "organization" + }, + { + "start": 2543, + "end": 2570, + "label": "date" + }, + { + "start": 2599, + "end": 2632, + "label": "organization" + }, + { + "start": 2652, + "end": 2665, + "label": "person" + }, + { + "start": 2679, + "end": 2692, + "label": "person" + }, + { + "start": 2769, + "end": 2787, + "label": "person" + }, + { + "start": 2792, + "end": 2810, + "label": "person" + }, + { + "start": 2898, + "end": 2916, + "label": "person" + }, + { + "start": 2987, + "end": 3011, + "label": "organization" + }, + { + "start": 3091, + "end": 3100, + "label": "organization" + }, + { + "start": 3118, + "end": 3151, + "label": "organization" + }, + { + "start": 3207, + "end": 3225, + "label": "person" + }, + { + "start": 3281, + "end": 3288, + "label": "organization" + }, + { + "start": 3346, + "end": 3362, + "label": "organization" + }, + { + "start": 3433, + "end": 3445, + "label": "date" + }, + { + "start": 3489, + "end": 3502, + "label": "date" + }, + { + "start": 3507, + "end": 3521, + "label": "date" + }, + { + "start": 3537, + "end": 3557, + "label": "organization" + }, + { + "start": 3833, + "end": 3840, + "label": "organization" + }, + { + "start": 4242, + "end": 4249, + "label": "date" + }, + { + "start": 4689, + "end": 4696, + "label": "organization" + }, + { + "start": 5056, + "end": 5075, + "label": "organization" + }, + { + "start": 5305, + "end": 5314, + "label": "organization" + }, + { + "start": 5322, + "end": 5329, + "label": "organization" + }, + { + "start": 5374, + "end": 5383, + "label": "organization" + }, + { + "start": 5643, + "end": 5666, + "label": "organization" + }, + { + "start": 5699, + "end": 5723, + "label": "organization" + }, + { + "start": 5802, + "end": 5821, + "label": "organization" + }, + { + "start": 6053, + "end": 6073, + "label": "organization" + }, + { + "start": 6309, + "end": 6316, + "label": "date" + }, + { + "start": 6333, + "end": 6367, + "label": "organization" + }, + { + "start": 6721, + "end": 6728, + "label": "date" + }, + { + "start": 6834, + "end": 6869, + "label": "organization" + }, + { + "start": 7215, + "end": 7222, + "label": "date" + }, + { + "start": 7230, + "end": 7236, + "label": "date" + }, + { + "start": 7278, + "end": 7313, + "label": "organization" + }, + { + "start": 7436, + "end": 7440, + "label": "date" + }, + { + "start": 7455, + "end": 7462, + "label": "organization" + }, + { + "start": 7516, + "end": 7528, + "label": "date" + }, + { + "start": 7534, + "end": 7538, + "label": "date" + }, + { + "start": 8054, + "end": 8063, + "label": "organization" + }, + { + "start": 8447, + "end": 8480, + "label": "organization" + }, + { + "start": 8510, + "end": 8523, + "label": "person" + }, + { + "start": 8601, + "end": 8619, + "label": "person" + } + ] + }, + { + "id": "en/pra-group-employment-agreement.txt", + "entities": [ + { + "start": 0, + "end": 7, + "label": "organization" + }, + { + "start": 123, + "end": 139, + "label": "date" + }, + { + "start": 191, + "end": 206, + "label": "organization" + }, + { + "start": 208, + "end": 232, + "label": "organization" + }, + { + "start": 239, + "end": 253, + "label": "person" + }, + { + "start": 356, + "end": 363, + "label": "organization" + }, + { + "start": 432, + "end": 446, + "label": "date" + }, + { + "start": 587, + "end": 609, + "label": "organization" + }, + { + "start": 730, + "end": 741, + "label": "date" + }, + { + "start": 743, + "end": 747, + "label": "date" + }, + { + "start": 782, + "end": 806, + "label": "organization" + }, + { + "start": 900, + "end": 924, + "label": "organization" + }, + { + "start": 1331, + "end": 1344, + "label": "organization" + }, + { + "start": 1377, + "end": 1398, + "label": "organization" + }, + { + "start": 1855, + "end": 1895, + "label": "organization" + }, + { + "start": 2154, + "end": 2161, + "label": "organization" + }, + { + "start": 2245, + "end": 2250, + "label": "organization" + }, + { + "start": 2979, + "end": 3013, + "label": "organization" + }, + { + "start": 3097, + "end": 3108, + "label": "date" + }, + { + "start": 3112, + "end": 3116, + "label": "date" + }, + { + "start": 3265, + "end": 3279, + "label": "date" + }, + { + "start": 3366, + "end": 3372, + "label": "date" + }, + { + "start": 3546, + "end": 3553, + "label": "organization" + }, + { + "start": 3595, + "end": 3602, + "label": "date" + }, + { + "start": 3610, + "end": 3636, + "label": "organization" + }, + { + "start": 3674, + "end": 3679, + "label": "organization" + }, + { + "start": 3757, + "end": 3766, + "label": "organization" + }, + { + "start": 4012, + "end": 4018, + "label": "date" + }, + { + "start": 4225, + "end": 4234, + "label": "organization" + }, + { + "start": 4269, + "end": 4277, + "label": "date" + }, + { + "start": 4416, + "end": 4425, + "label": "organization" + }, + { + "start": 4487, + "end": 4501, + "label": "date" + }, + { + "start": 4645, + "end": 4654, + "label": "organization" + }, + { + "start": 4843, + "end": 4874, + "label": "organization" + }, + { + "start": 5383, + "end": 5390, + "label": "organization" + }, + { + "start": 5410, + "end": 5414, + "label": "person" + }, + { + "start": 5741, + "end": 5748, + "label": "organization" + }, + { + "start": 5926, + "end": 5933, + "label": "organization" + }, + { + "start": 6235, + "end": 6242, + "label": "organization" + }, + { + "start": 6556, + "end": 6577, + "label": "organization" + }, + { + "start": 6711, + "end": 6721, + "label": "organization" + }, + { + "start": 7357, + "end": 7382, + "label": "organization" + }, + { + "start": 8371, + "end": 8378, + "label": "organization" + }, + { + "start": 9576, + "end": 9595, + "label": "date" + }, + { + "start": 9615, + "end": 9623, + "label": "date" + }, + { + "start": 9660, + "end": 9681, + "label": "date" + }, + { + "start": 9829, + "end": 9836, + "label": "date" + }, + { + "start": 9947, + "end": 9952, + "label": "organization" + }, + { + "start": 10418, + "end": 10439, + "label": "organization" + }, + { + "start": 10546, + "end": 10567, + "label": "organization" + }, + { + "start": 10691, + "end": 10716, + "label": "organization" + }, + { + "start": 11227, + "end": 11234, + "label": "organization" + }, + { + "start": 11507, + "end": 11514, + "label": "organization" + }, + { + "start": 11612, + "end": 11628, + "label": "organization" + }, + { + "start": 11643, + "end": 11667, + "label": "organization" + }, + { + "start": 11768, + "end": 11775, + "label": "organization" + }, + { + "start": 12043, + "end": 12048, + "label": "organization" + }, + { + "start": 12150, + "end": 12157, + "label": "organization" + }, + { + "start": 12271, + "end": 12278, + "label": "organization" + }, + { + "start": 12391, + "end": 12396, + "label": "organization" + }, + { + "start": 12563, + "end": 12570, + "label": "organization" + }, + { + "start": 12644, + "end": 12651, + "label": "organization" + }, + { + "start": 12743, + "end": 12754, + "label": "date" + }, + { + "start": 13309, + "end": 13332, + "label": "organization" + }, + { + "start": 13461, + "end": 13466, + "label": "organization" + }, + { + "start": 13592, + "end": 13632, + "label": "organization" + }, + { + "start": 13862, + "end": 13890, + "label": "organization" + }, + { + "start": 14175, + "end": 14186, + "label": "date" + }, + { + "start": 14363, + "end": 14370, + "label": "date" + }, + { + "start": 15320, + "end": 15344, + "label": "organization" + }, + { + "start": 16262, + "end": 16276, + "label": "date" + }, + { + "start": 16309, + "end": 16316, + "label": "organization" + }, + { + "start": 16565, + "end": 16598, + "label": "organization" + }, + { + "start": 17150, + "end": 17155, + "label": "organization" + }, + { + "start": 17164, + "end": 17185, + "label": "date" + }, + { + "start": 17640, + "end": 17645, + "label": "organization" + }, + { + "start": 18151, + "end": 18184, + "label": "organization" + }, + { + "start": 18233, + "end": 18240, + "label": "organization" + }, + { + "start": 18635, + "end": 18642, + "label": "organization" + }, + { + "start": 18662, + "end": 18692, + "label": "organization" + }, + { + "start": 18775, + "end": 18783, + "label": "date" + }, + { + "start": 18928, + "end": 18935, + "label": "date" + }, + { + "start": 19074, + "end": 19084, + "label": "date" + }, + { + "start": 19107, + "end": 19115, + "label": "date" + }, + { + "start": 19323, + "end": 19330, + "label": "date" + }, + { + "start": 19341, + "end": 19348, + "label": "organization" + }, + { + "start": 19476, + "end": 19488, + "label": "date" + }, + { + "start": 19543, + "end": 19560, + "label": "date" + }, + { + "start": 19571, + "end": 19579, + "label": "date" + }, + { + "start": 19683, + "end": 19691, + "label": "organization" + }, + { + "start": 19895, + "end": 19902, + "label": "date" + }, + { + "start": 19913, + "end": 19920, + "label": "organization" + }, + { + "start": 19972, + "end": 19996, + "label": "date" + }, + { + "start": 20028, + "end": 20045, + "label": "date" + }, + { + "start": 20094, + "end": 20101, + "label": "date" + }, + { + "start": 20229, + "end": 20247, + "label": "date" + }, + { + "start": 20287, + "end": 20318, + "label": "organization" + }, + { + "start": 20624, + "end": 20631, + "label": "date" + }, + { + "start": 21231, + "end": 21256, + "label": "organization" + }, + { + "start": 21436, + "end": 21446, + "label": "date" + }, + { + "start": 21458, + "end": 21466, + "label": "date" + }, + { + "start": 21635, + "end": 21639, + "label": "date" + }, + { + "start": 21663, + "end": 21693, + "label": "organization" + }, + { + "start": 21721, + "end": 21728, + "label": "date" + }, + { + "start": 21766, + "end": 21771, + "label": "organization" + }, + { + "start": 21791, + "end": 21798, + "label": "date" + }, + { + "start": 21862, + "end": 21869, + "label": "organization" + }, + { + "start": 22025, + "end": 22031, + "label": "date" + }, + { + "start": 22441, + "end": 22466, + "label": "organization" + }, + { + "start": 22470, + "end": 22474, + "label": "date" + }, + { + "start": 22518, + "end": 22526, + "label": "organization" + }, + { + "start": 22727, + "end": 22734, + "label": "organization" + }, + { + "start": 23037, + "end": 23048, + "label": "organization" + }, + { + "start": 23089, + "end": 23096, + "label": "organization" + }, + { + "start": 23330, + "end": 23351, + "label": "date" + }, + { + "start": 23462, + "end": 23469, + "label": "organization" + }, + { + "start": 23556, + "end": 23567, + "label": "organization" + }, + { + "start": 23614, + "end": 23619, + "label": "organization" + }, + { + "start": 23764, + "end": 23769, + "label": "organization" + }, + { + "start": 24069, + "end": 24090, + "label": "date" + }, + { + "start": 24343, + "end": 24350, + "label": "organization" + }, + { + "start": 24496, + "end": 24529, + "label": "organization" + }, + { + "start": 25440, + "end": 25465, + "label": "organization" + }, + { + "start": 25525, + "end": 25545, + "label": "date" + }, + { + "start": 25556, + "end": 25563, + "label": "organization" + }, + { + "start": 25883, + "end": 25890, + "label": "organization" + }, + { + "start": 25897, + "end": 25911, + "label": "date" + }, + { + "start": 25994, + "end": 26019, + "label": "organization" + }, + { + "start": 26037, + "end": 26061, + "label": "date" + }, + { + "start": 26239, + "end": 26246, + "label": "organization" + }, + { + "start": 26407, + "end": 26431, + "label": "organization" + }, + { + "start": 26485, + "end": 26492, + "label": "organization" + }, + { + "start": 26540, + "end": 26547, + "label": "organization" + }, + { + "start": 26716, + "end": 26723, + "label": "organization" + }, + { + "start": 26850, + "end": 26857, + "label": "organization" + }, + { + "start": 27484, + "end": 27491, + "label": "organization" + }, + { + "start": 27562, + "end": 27586, + "label": "organization" + }, + { + "start": 27818, + "end": 27825, + "label": "organization" + }, + { + "start": 28011, + "end": 28018, + "label": "organization" + }, + { + "start": 28175, + "end": 28182, + "label": "organization" + }, + { + "start": 28246, + "end": 28270, + "label": "organization" + }, + { + "start": 28461, + "end": 28485, + "label": "organization" + }, + { + "start": 29164, + "end": 29171, + "label": "organization" + }, + { + "start": 29799, + "end": 29806, + "label": "organization" + }, + { + "start": 29918, + "end": 29925, + "label": "organization" + }, + { + "start": 30251, + "end": 30258, + "label": "organization" + }, + { + "start": 30414, + "end": 30442, + "label": "organization" + }, + { + "start": 30446, + "end": 30450, + "label": "date" + }, + { + "start": 31585, + "end": 31592, + "label": "organization" + }, + { + "start": 31875, + "end": 31880, + "label": "organization" + }, + { + "start": 32251, + "end": 32276, + "label": "organization" + }, + { + "start": 32681, + "end": 32702, + "label": "organization" + }, + { + "start": 32721, + "end": 32753, + "label": "organization" + }, + { + "start": 32785, + "end": 32792, + "label": "organization" + }, + { + "start": 33067, + "end": 33074, + "label": "organization" + }, + { + "start": 33313, + "end": 33318, + "label": "organization" + }, + { + "start": 33537, + "end": 33544, + "label": "organization" + }, + { + "start": 33698, + "end": 33705, + "label": "organization" + }, + { + "start": 33899, + "end": 33906, + "label": "organization" + }, + { + "start": 34022, + "end": 34034, + "label": "date" + }, + { + "start": 34454, + "end": 34461, + "label": "organization" + }, + { + "start": 34537, + "end": 34544, + "label": "organization" + }, + { + "start": 35084, + "end": 35091, + "label": "organization" + }, + { + "start": 35194, + "end": 35201, + "label": "organization" + }, + { + "start": 35237, + "end": 35248, + "label": "date" + }, + { + "start": 35249, + "end": 35258, + "label": "date" + }, + { + "start": 35890, + "end": 35899, + "label": "date" + }, + { + "start": 36115, + "end": 36124, + "label": "date" + }, + { + "start": 36135, + "end": 36142, + "label": "date" + }, + { + "start": 36198, + "end": 36207, + "label": "date" + }, + { + "start": 36405, + "end": 36412, + "label": "organization" + }, + { + "start": 36422, + "end": 36437, + "label": "organization" + }, + { + "start": 36496, + "end": 36511, + "label": "person" + }, + { + "start": 36587, + "end": 36613, + "label": "person" + }, + { + "start": 36614, + "end": 36625, + "label": "organization" + }, + { + "start": 37384, + "end": 37391, + "label": "organization" + }, + { + "start": 37505, + "end": 37512, + "label": "organization" + }, + { + "start": 37557, + "end": 37564, + "label": "organization" + }, + { + "start": 37716, + "end": 37723, + "label": "organization" + }, + { + "start": 37809, + "end": 37816, + "label": "organization" + }, + { + "start": 38412, + "end": 38428, + "label": "organization" + }, + { + "start": 38558, + "end": 38563, + "label": "organization" + }, + { + "start": 39193, + "end": 39221, + "label": "organization" + }, + { + "start": 39931, + "end": 39967, + "label": "organization" + }, + { + "start": 40340, + "end": 40347, + "label": "organization" + }, + { + "start": 40769, + "end": 40776, + "label": "organization" + }, + { + "start": 41008, + "end": 41013, + "label": "organization" + }, + { + "start": 41584, + "end": 41609, + "label": "organization" + }, + { + "start": 41920, + "end": 41945, + "label": "organization" + }, + { + "start": 41949, + "end": 41953, + "label": "date" + }, + { + "start": 41967, + "end": 41988, + "label": "organization" + }, + { + "start": 43165, + "end": 43188, + "label": "organization" + }, + { + "start": 43862, + "end": 43875, + "label": "date" + }, + { + "start": 43890, + "end": 43899, + "label": "date" + }, + { + "start": 44415, + "end": 44426, + "label": "date" + }, + { + "start": 44437, + "end": 44453, + "label": "date" + }, + { + "start": 44796, + "end": 44813, + "label": "date" + }, + { + "start": 46585, + "end": 46610, + "label": "organization" + }, + { + "start": 46650, + "end": 46657, + "label": "organization" + }, + { + "start": 46922, + "end": 46933, + "label": "organization" + }, + { + "start": 47079, + "end": 47087, + "label": "organization" + }, + { + "start": 48123, + "end": 48145, + "label": "date" + }, + { + "start": 48165, + "end": 48180, + "label": "organization" + } + ] + }, + { + "id": "en/software-license-agreement.txt", + "entities": [ + { + "start": 100, + "end": 111, + "label": "date" + }, + { + "start": 113, + "end": 117, + "label": "date" + }, + { + "start": 162, + "end": 185, + "label": "organization" + }, + { + "start": 390, + "end": 395, + "label": "person" + }, + { + "start": 417, + "end": 447, + "label": "organization" + }, + { + "start": 631, + "end": 651, + "label": "person" + }, + { + "start": 874, + "end": 882, + "label": "organization" + }, + { + "start": 892, + "end": 907, + "label": "organization" + }, + { + "start": 909, + "end": 913, + "label": "organization" + }, + { + "start": 962, + "end": 985, + "label": "organization" + }, + { + "start": 1159, + "end": 1167, + "label": "organization" + }, + { + "start": 1251, + "end": 1257, + "label": "date" + }, + { + "start": 1398, + "end": 1414, + "label": "organization" + }, + { + "start": 1430, + "end": 1439, + "label": "date" + }, + { + "start": 1572, + "end": 1595, + "label": "organization" + }, + { + "start": 1597, + "end": 1619, + "label": "organization" + }, + { + "start": 1656, + "end": 1664, + "label": "organization" + }, + { + "start": 1703, + "end": 1734, + "label": "organization" + }, + { + "start": 1755, + "end": 1785, + "label": "organization" + }, + { + "start": 1787, + "end": 1808, + "label": "organization" + }, + { + "start": 1972, + "end": 1990, + "label": "organization" + }, + { + "start": 2013, + "end": 2039, + "label": "organization" + }, + { + "start": 2124, + "end": 2159, + "label": "person" + }, + { + "start": 2165, + "end": 2185, + "label": "person" + }, + { + "start": 2258, + "end": 2269, + "label": "date" + }, + { + "start": 2299, + "end": 2310, + "label": "date" + }, + { + "start": 2312, + "end": 2316, + "label": "date" + }, + { + "start": 1857, + "end": 1871, + "label": "phone number" + }, + { + "start": 1452, + "end": 1462, + "label": "phone number" + } + ] + } + ] +} diff --git a/packages/bench/results/quality.anonymize.json b/packages/bench/results/quality.anonymize.json index 0821b6ed..56d2584c 100644 --- a/packages/bench/results/quality.anonymize.json +++ b/packages/bench/results/quality.anonymize.json @@ -1,6 +1,6 @@ { "tool": "anonymize", - "generatedAt": "2026-06-12T11:26:03.458Z", + "generatedAt": "2026-06-12T11:44:41.920Z", "corpus": { "docs": 13, "docsPerLanguage": { @@ -8,7 +8,8 @@ "de": 1, "en": 4 }, - "goldEntities": 332 + "goldEntities": 332, + "skippedDocs": [] }, "labelsFilter": null, "modes": { diff --git a/packages/bench/results/quality.compromise.json b/packages/bench/results/quality.compromise.json new file mode 100644 index 00000000..e9ee2cc5 --- /dev/null +++ b/packages/bench/results/quality.compromise.json @@ -0,0 +1,109 @@ +{ + "tool": "compromise", + "generatedAt": "2026-06-12T11:44:16.453Z", + "corpus": { + "docs": 4, + "docsPerLanguage": { + "en": 4 + }, + "goldEntities": 101, + "skippedDocs": [ + "cs/asset-transfer-court-declensions.txt", + "cs/database-cz-service-contract.txt", + "cs/eagles-rental-agreement.txt", + "cs/nakit-legal-services-framework.txt", + "cs/patrik-nguyen-used-vehicle-sale.txt", + "cs/probo-frame-purchase-contract.txt", + "cs/sanofi-bonus-agreement.txt", + "cs/vinci-donation-agreement.txt", + "de/geschaeftsfuehrer-dienstvertrag.txt" + ] + }, + "labelsFilter": ["person", "organization"], + "modes": { + "exact": { + "micro": { + "truePositives": 15, + "falsePositives": 39, + "falseNegatives": 23, + "goldCount": 38, + "precision": 0.2777777777777778, + "recall": 0.39473684210526316, + "f1": 0.32608695652173914 + }, + "perLabel": { + "organization": { + "truePositives": 3, + "falsePositives": 21, + "falseNegatives": 16, + "goldCount": 19, + "precision": 0.125, + "recall": 0.15789473684210525, + "f1": 0.13953488372093023 + }, + "person": { + "truePositives": 12, + "falsePositives": 18, + "falseNegatives": 7, + "goldCount": 19, + "precision": 0.4, + "recall": 0.631578947368421, + "f1": 0.4897959183673469 + } + }, + "perLanguage": { + "en": { + "truePositives": 15, + "falsePositives": 39, + "falseNegatives": 23, + "goldCount": 38, + "precision": 0.2777777777777778, + "recall": 0.39473684210526316, + "f1": 0.32608695652173914 + } + } + }, + "overlap": { + "micro": { + "truePositives": 30, + "falsePositives": 24, + "falseNegatives": 8, + "goldCount": 38, + "precision": 0.5555555555555556, + "recall": 0.7894736842105263, + "f1": 0.6521739130434783 + }, + "perLabel": { + "organization": { + "truePositives": 14, + "falsePositives": 10, + "falseNegatives": 5, + "goldCount": 19, + "precision": 0.5833333333333334, + "recall": 0.7368421052631579, + "f1": 0.6511627906976745 + }, + "person": { + "truePositives": 16, + "falsePositives": 14, + "falseNegatives": 3, + "goldCount": 19, + "precision": 0.5333333333333333, + "recall": 0.8421052631578947, + "f1": 0.653061224489796 + } + }, + "perLanguage": { + "en": { + "truePositives": 30, + "falsePositives": 24, + "falseNegatives": 8, + "goldCount": 38, + "precision": 0.5555555555555556, + "recall": 0.7894736842105263, + "f1": 0.6521739130434783 + } + } + } + } +} diff --git a/packages/bench/results/quality.presidio.json b/packages/bench/results/quality.presidio.json new file mode 100644 index 00000000..a1283f5a --- /dev/null +++ b/packages/bench/results/quality.presidio.json @@ -0,0 +1,187 @@ +{ + "tool": "presidio", + "generatedAt": "2026-06-12T11:43:06.688Z", + "corpus": { + "docs": 5, + "docsPerLanguage": { + "de": 1, + "en": 4 + }, + "goldEntities": 125, + "skippedDocs": [ + "cs/asset-transfer-court-declensions.txt", + "cs/database-cz-service-contract.txt", + "cs/eagles-rental-agreement.txt", + "cs/nakit-legal-services-framework.txt", + "cs/patrik-nguyen-used-vehicle-sale.txt", + "cs/probo-frame-purchase-contract.txt", + "cs/sanofi-bonus-agreement.txt", + "cs/vinci-donation-agreement.txt" + ] + }, + "labelsFilter": [ + "person", + "organization", + "email address", + "phone number", + "date" + ], + "modes": { + "exact": { + "micro": { + "truePositives": 44, + "falsePositives": 284, + "falseNegatives": 32, + "goldCount": 76, + "precision": 0.13414634146341464, + "recall": 0.5789473684210527, + "f1": 0.21782178217821782 + }, + "perLabel": { + "date": { + "truePositives": 14, + "falsePositives": 83, + "falseNegatives": 13, + "goldCount": 27, + "precision": 0.14432989690721648, + "recall": 0.5185185185185185, + "f1": 0.2258064516129032 + }, + "email address": { + "truePositives": 0, + "falsePositives": 0, + "falseNegatives": 1, + "goldCount": 1, + "precision": 0, + "recall": 0, + "f1": 0 + }, + "organization": { + "truePositives": 14, + "falsePositives": 188, + "falseNegatives": 9, + "goldCount": 23, + "precision": 0.06930693069306931, + "recall": 0.6086956521739131, + "f1": 0.12444444444444444 + }, + "person": { + "truePositives": 16, + "falsePositives": 11, + "falseNegatives": 8, + "goldCount": 24, + "precision": 0.5925925925925926, + "recall": 0.6666666666666666, + "f1": 0.627450980392157 + }, + "phone number": { + "truePositives": 0, + "falsePositives": 2, + "falseNegatives": 1, + "goldCount": 1, + "precision": 0, + "recall": 0, + "f1": 0 + } + }, + "perLanguage": { + "de": { + "truePositives": 3, + "falsePositives": 7, + "falseNegatives": 9, + "goldCount": 12, + "precision": 0.3, + "recall": 0.25, + "f1": 0.2727272727272727 + }, + "en": { + "truePositives": 41, + "falsePositives": 277, + "falseNegatives": 23, + "goldCount": 64, + "precision": 0.1289308176100629, + "recall": 0.640625, + "f1": 0.21465968586387435 + } + } + }, + "overlap": { + "micro": { + "truePositives": 65, + "falsePositives": 263, + "falseNegatives": 11, + "goldCount": 76, + "precision": 0.19817073170731708, + "recall": 0.8552631578947368, + "f1": 0.3217821782178218 + }, + "perLabel": { + "date": { + "truePositives": 23, + "falsePositives": 74, + "falseNegatives": 4, + "goldCount": 27, + "precision": 0.23711340206185566, + "recall": 0.8518518518518519, + "f1": 0.3709677419354838 + }, + "email address": { + "truePositives": 0, + "falsePositives": 0, + "falseNegatives": 1, + "goldCount": 1, + "precision": 0, + "recall": 0, + "f1": 0 + }, + "organization": { + "truePositives": 19, + "falsePositives": 183, + "falseNegatives": 4, + "goldCount": 23, + "precision": 0.09405940594059406, + "recall": 0.8260869565217391, + "f1": 0.1688888888888889 + }, + "person": { + "truePositives": 22, + "falsePositives": 5, + "falseNegatives": 2, + "goldCount": 24, + "precision": 0.8148148148148148, + "recall": 0.9166666666666666, + "f1": 0.8627450980392156 + }, + "phone number": { + "truePositives": 1, + "falsePositives": 1, + "falseNegatives": 0, + "goldCount": 1, + "precision": 0.5, + "recall": 1, + "f1": 0.6666666666666666 + } + }, + "perLanguage": { + "de": { + "truePositives": 6, + "falsePositives": 4, + "falseNegatives": 6, + "goldCount": 12, + "precision": 0.6, + "recall": 0.5, + "f1": 0.5454545454545454 + }, + "en": { + "truePositives": 59, + "falsePositives": 259, + "falseNegatives": 5, + "goldCount": 64, + "precision": 0.18553459119496854, + "recall": 0.921875, + "f1": 0.30890052356020936 + } + } + } + } +} diff --git a/packages/bench/src/render-results.ts b/packages/bench/src/render-results.ts index 6d6a270c..adf039b7 100644 --- a/packages/bench/src/render-results.ts +++ b/packages/bench/src/render-results.ts @@ -31,6 +31,7 @@ type QualityReportJson = { docs: number; docsPerLanguage: Record; goldEntities: number; + skippedDocs?: string[]; }; labelsFilter: string[] | null; modes: Record<"exact" | "overlap", ModeReportJson>; @@ -74,6 +75,16 @@ const renderQuality = (report: QualityReportJson): string[] => { lines.push( `${report.corpus.docs} documents, ${integer(report.corpus.goldEntities)} reference entities.${filterNote}`, ); + const skipped = report.corpus.skippedDocs ?? []; + if (skipped.length > 0) { + const languages = [ + ...new Set(skipped.map((id) => id.split("/").at(0) ?? id)), + ].toSorted(); + lines.push(""); + lines.push( + `Skipped ${skipped.length} corpus documents (no support for: ${languages.join(", ")}).`, + ); + } for (const mode of ["exact", "overlap"] as const) { const modeReport = report.modes[mode]; lines.push(""); diff --git a/packages/bench/src/run-compromise.ts b/packages/bench/src/run-compromise.ts new file mode 100644 index 00000000..cb077509 --- /dev/null +++ b/packages/bench/src/run-compromise.ts @@ -0,0 +1,68 @@ +/** + * Runs compromise (the closest JS-ecosystem NLP baseline with span + * output) over the English corpus documents and writes predictions + * in the bench interchange format. compromise targets English, so + * other languages are omitted and reported as skipped by the + * quality runner. + */ +import { mkdirSync, writeFileSync } from "node:fs"; +import { dirname, join } from "node:path"; +import { parseArgs } from "node:util"; + +import nlp from "compromise"; + +import { loadGoldDocuments } from "./fixtures"; +import type { BenchSpan, PredictionsFile } from "./types"; + +type CompromiseMatch = { + offset?: { start: number; length: number }; +}; + +const matchesToSpans = ( + matches: CompromiseMatch[], + label: string, +): BenchSpan[] => { + const spans: BenchSpan[] = []; + for (const match of matches) { + if (!match.offset) continue; + spans.push({ + start: match.offset.start, + end: match.offset.start + match.offset.length, + label, + }); + } + return spans; +}; + +const { values: args } = parseArgs({ + options: { out: { type: "string" } }, +}); + +const predictions: PredictionsFile = { tool: "compromise", docs: [] }; +for (const doc of loadGoldDocuments()) { + if (doc.language !== "en") continue; + const parsed = nlp(doc.text); + const entities = [ + // SAFETY: compromise's .json({ offset: true }) returns match + // objects with an offset field; the library ships no types. + ...matchesToSpans( + parsed.people().json({ offset: true }) as CompromiseMatch[], + "person", + ), + ...matchesToSpans( + parsed.organizations().json({ offset: true }) as CompromiseMatch[], + "organization", + ), + ]; + predictions.docs.push({ id: doc.id, entities }); + console.log( + JSON.stringify({ event: "doc", id: doc.id, entities: entities.length }), + ); +} + +const outPath = + args.out ?? + join(import.meta.dir, "..", "results", "predictions.compromise.json"); +mkdirSync(dirname(outPath), { recursive: true }); +writeFileSync(outPath, `${JSON.stringify(predictions, null, 2)}\n`); +console.log(JSON.stringify({ event: "written", path: outPath })); diff --git a/packages/bench/src/run-quality.ts b/packages/bench/src/run-quality.ts index 1f93aef7..405b5534 100644 --- a/packages/bench/src/run-quality.ts +++ b/packages/bench/src/run-quality.ts @@ -37,6 +37,7 @@ type QualityReport = { docs: number; docsPerLanguage: Record; goldEntities: number; + skippedDocs: string[]; }; labelsFilter: string[] | null; modes: Record; @@ -52,20 +53,24 @@ const { values: args } = parseArgs({ const labelsFilter = args.labels?.split(",").map((label) => label.trim()); -const docs = loadGoldDocuments(); +const allDocs = loadGoldDocuments(); const predictions: PredictionsFile = args.predictions ? // SAFETY: --predictions files are produced by bench adapters with this shape (JSON.parse(readFileSync(args.predictions, "utf8")) as PredictionsFile) - : await runAnonymizeAdapter(docs); + : await runAnonymizeAdapter(allDocs); const predictionsById = new Map( predictions.docs.map((doc) => [doc.id, doc.entities]), ); -const missingDocs = docs.filter((doc) => !predictionsById.has(doc.id)); -if (missingDocs.length > 0) { - const ids = missingDocs.map((doc) => doc.id).join(", "); - throw new Error(`predictions missing for: ${ids}`); +// Tools without support for a corpus language omit those documents; +// they are reported as skipped instead of scored as zero recall. +const docs = allDocs.filter((doc) => predictionsById.has(doc.id)); +const skippedDocIds = allDocs + .filter((doc) => !predictionsById.has(doc.id)) + .map((doc) => doc.id); +if (docs.length === 0) { + throw new Error("predictions cover no corpus documents"); } const buildModeReport = (mode: MatchMode): ModeReport => { @@ -109,6 +114,7 @@ const report: QualityReport = { docs: docs.length, docsPerLanguage, goldEntities: docs.reduce((sum, doc) => sum + doc.gold.length, 0), + skippedDocs: skippedDocIds, }, labelsFilter: labelsFilter ?? null, modes: {