diff --git a/evaluation/v6/RESULTS.md b/evaluation/v6/RESULTS.md index 7d5e5e4..38afd27 100644 --- a/evaluation/v6/RESULTS.md +++ b/evaluation/v6/RESULTS.md @@ -26,6 +26,14 @@ Detector run over corpora it was NOT authored against — real LLM `model_respon False-positive rate on independent text: **0.0000**. This is the load-bearing, non-circular precision evidence — distinct from the hand-authored F1 below. (Two real false positives found during development — a too-loose lead-in and a missing word-boundary on number words — were fixed and locked in as regression negatives.) +## Recall probe (in-scope phrasing coverage) + +25 genuine count-drift positives authored to span phrasing variety (digit/word lead-ins, number-first headings, prose prefixes, 'all N passed', 'there/here are N', numbered lists, 'a dozen', N-of-M, fraction/percent). Reproduce: `python3 evaluation/v6/score_count_drift.py`. + +Recall on in-scope positives: **25/25 = 1.00**. + +Caveat: hand-authored, so this measures coverage across the phrasing space the author could enumerate, not wild recall. Out-of-scope forms (a count with no adjacent enumeration, flowing-prose counts, table/semantic matches) are abstained by design; extending to them needs an LLM-judge advisory tier, deliberately deferred (it never blocks, and self-consistent count errors are exactly what LLM judges miss). + ## Honesty caveat (read before citing F1) This corpus is **hand-authored** — the same author wrote the detector and the fixtures — so an F1 of 1.0 here is **not** a wild-generalization claim; it is a co-evolved-corpus number and would inflate if cited as field performance. What the number legitimately shows: the detector behaves to spec on the designed cases, **including the adversarial negatives authored to break it** (nested-colon lead-ins, section-index numbers, label words, approximation markers, ambiguous multi-list scope, nested-list depth). The load-bearing, generalizable metric is **precision / zero-false-positives on those adversarial negatives** — the property a blocking gate must hold. diff --git a/evaluation/v6/independent_eval.py b/evaluation/v6/independent_eval.py old mode 100644 new mode 100755 diff --git a/evaluation/v6/recall_probe.jsonl b/evaluation/v6/recall_probe.jsonl new file mode 100644 index 0000000..50d8b1a --- /dev/null +++ b/evaluation/v6/recall_probe.jsonl @@ -0,0 +1,25 @@ +{"id": "leadin_colon_digit", "text": "I found 3 issues:\n- thing 1\n- thing 2", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "leadin_colon_word", "text": "Three reasons:\n- thing 1\n- thing 2\n- thing 3\n- thing 4", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "leadin_following_phrase", "text": "The following 5 steps:\n- thing 1\n- thing 2\n- thing 3\n- thing 4", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "heading_numfirst", "text": "## 4 Findings\n- thing 1\n- thing 2\n- thing 3", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "prose_prefix_we_shipped", "text": "We shipped four fixes:\n- thing 1\n- thing 2\n- thing 3", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "prose_prefix_produced", "text": "This produced six cases:\n- thing 1\n- thing 2\n- thing 3\n- thing 4\n- thing 5", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "n_of_m_wrong", "text": "5 of 4 tests passed.", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "n_of_m_words", "text": "all seven of five lanes are green", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "frac_pct_wrong", "text": "Pass rate 7/10 = 60% this run.", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "frac_paren_wrong", "text": "Resolved 3/4 (50%) of items.", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "numbered_two_three", "text": "Two options:\n1. thing 1\n2. thing 2\n3. thing 3", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "dozen_vs_11", "text": "a dozen items:\n- thing 1\n- thing 2\n- thing 3\n- thing 4\n- thing 5\n- thing 6\n- thing 7\n- thing 8\n- thing 9\n- thing 10\n- thing 11", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "fifteen_vs_14", "text": "fifteen entries:\n- thing 1\n- thing 2\n- thing 3\n- thing 4\n- thing 5\n- thing 6\n- thing 7\n- thing 8\n- thing 9\n- thing 10\n- thing 11\n- thing 12\n- thing 13\n- thing 14", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "all_n_passed", "text": "All 6 checks passed:\n- thing 1\n- thing 2\n- thing 3\n- thing 4", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "there_are_n", "text": "There are three blockers:\n- thing 1\n- thing 2", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "here_are_n", "text": "Here are the four options:\n- thing 1\n- thing 2\n- thing 3\n- thing 4\n- thing 5", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "count_changes", "text": "We made five changes:\n- thing 1\n- thing 2\n- thing 3", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "findings_plural", "text": "Seven findings surfaced:\n- thing 1\n- thing 2\n- thing 3\n- thing 4\n- thing 5\n- thing 6", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "steps_numbered_wrong", "text": "Three steps:\n1. thing 1\n2. thing 2", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "issues_caps", "text": "TWO issues remain:\n- thing 1\n- thing 2\n- thing 3", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "nine_items", "text": "Nine items:\n- thing 1\n- thing 2\n- thing 3\n- thing 4\n- thing 5\n- thing 6\n- thing 7", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "ten_reasons", "text": "Ten reasons below:\n- thing 1\n- thing 2\n- thing 3\n- thing 4\n- thing 5\n- thing 6\n- thing 7\n- thing 8", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "eight_tasks", "text": "Eight tasks left:\n- thing 1\n- thing 2\n- thing 3\n- thing 4\n- thing 5\n- thing 6", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "frac_half_wrong", "text": "covered 1/2 = 70% of cases", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} +{"id": "n_of_m_the", "text": "6 of the 4 modules compiled", "expect": "block", "note": "genuine count-drift; in-scope phrasing-coverage probe"} diff --git a/evaluation/v6/score_count_drift.py b/evaluation/v6/score_count_drift.py index 5a823f5..4203339 100755 --- a/evaluation/v6/score_count_drift.py +++ b/evaluation/v6/score_count_drift.py @@ -127,6 +127,33 @@ def _count(texts): ) except Exception: pass + + # Recall probe: in-scope phrasing-coverage over genuine count-drift positives. + try: + probe = load(os.path.join(HERE, "recall_probe.jsonl")) + verdicts = [(r["id"], cd.analyze(r["text"])["decision"]) for r in probe] + pcaught = sum(1 for _, d in verdicts if d == "block") + pmiss = [pid for pid, d in verdicts if d != "block"] + if probe: + summary += ( + "\n## Recall probe (in-scope phrasing coverage)\n\n" + "%d genuine count-drift positives authored to span phrasing variety " + "(digit/word lead-ins, number-first headings, prose prefixes, 'all N " + "passed', 'there/here are N', numbered lists, 'a dozen', N-of-M, " + "fraction/percent). Reproduce: `python3 evaluation/v6/score_count_drift.py`.\n\n" + "Recall on in-scope positives: **%d/%d = %.2f**.%s\n\n" + "Caveat: hand-authored, so this measures coverage across the phrasing space " + "the author could enumerate, not wild recall. Out-of-scope forms (a count " + "with no adjacent enumeration, flowing-prose counts, table/semantic matches) " + "are abstained by design; extending to them needs an LLM-judge advisory tier, " + "deliberately deferred (it never blocks, and self-consistent count errors are " + "exactly what LLM judges miss).\n" + % (len(probe), pcaught, len(probe), pcaught / len(probe), + "" if not pmiss else " Misses: " + ", ".join(pmiss) + ".") + ) + except Exception: + pass + summary += ( "\n## Honesty caveat (read before citing F1)\n\n" "This corpus is **hand-authored** — the same author wrote the detector and the "