From 04f7ea3dfc4e7521ddd28021642cff5672fa0eea Mon Sep 17 00:00:00 2001 From: Eric Boothe Date: Wed, 10 Jun 2026 10:39:10 -0600 Subject: [PATCH] =?UTF-8?q?fix(eval):=20exit=20non-zero=20when=20any=20she?= =?UTF-8?q?et=20hard-fails=20=E2=80=94=20accuracy=20alone=20is=20a=20disho?= =?UTF-8?q?nest=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A crashed/OOMed sheet contributes ZERO tested cells, so the accuracy-only exit gate (>=85%) never saw it. Observed live on the A-1 canonical eval: the 17-sheet cluster child OOMed its 12GB heap, only the 3 standalone sheets were scored, and the harness printed an overall accuracy of 99.9% and exited 0 — a confident wrong summary from the canonical harness itself. Hard failures (status crash/oom/error) now force exit 1; the report still records the surviving sheets' accuracy and the failed sheets' status (honest and visible, not masked). Regression test kills the cluster child via a 10ms EVAL_CLUSTER_TIMEOUT_MS next to a healthy 100% standalone sheet — red pre-fix (exit 0), green post-fix. Co-Authored-By: Claude Fable 5 --- CHANGELOG.md | 6 ++ eval/per-sheet-eval.mjs | 11 ++- package.json | 2 +- .../test-per-sheet-eval-exit-honesty.mjs | 87 +++++++++++++++++++ 4 files changed, 103 insertions(+), 3 deletions(-) create mode 100644 pipelines/rust/tests/test-per-sheet-eval-exit-honesty.mjs diff --git a/CHANGELOG.md b/CHANGELOG.md index 086dd05..63be9a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,12 @@ input cells). **A cluster member is never size-skipped** (`MAX_SHEET_SIZE_MB` silently dropped the monster sheets from the cluster → partial-cluster wrong fixed point; regression red pre-fix with `clustersTotal=0`). New `EVAL_CLUSTER_TIMEOUT_MS` (default 60min). 
+- **per-sheet-eval exit gate is crash-honest** — a hard-failed sheet (crash/OOM/error) + contributes zero tested cells, so the accuracy-only exit gate exited 0 on a run where the + 17-sheet cluster child OOMed and only 3 standalone sheets were scored (observed live: + "99.94%, exit 0" with `sheetsWithErrors: 17`). Hard failures now force exit 1 (regression + red pre-fix). Same live run measured the cluster child's true heap need: >12GB (the earlier + 6GB probe reading was mid-run, not peak) — set `NODE_HEAP_MB` accordingly. - **per-sheet-eval dynamic-read scan knows the #66 helpers** — the v0.3.1 emitter lowers `ref:OFFSET(...)` through `_dynRange`/`_offsetAddr` with no bare `_offset(` call, so the GT-seed-scoping scan approved scoping on exactly the builds where ranges are runtime- diff --git a/eval/per-sheet-eval.mjs b/eval/per-sheet-eval.mjs index 2831c3d..dcb2b5c 100644 --- a/eval/per-sheet-eval.mjs +++ b/eval/per-sheet-eval.mjs @@ -812,8 +812,15 @@ process.stdout.write(JSON.stringify({ results, iters: _iters, converged: _conv, console.log('='.repeat(60)); console.log(''); - // Exit code: 0 if >85% accuracy, 1 otherwise - process.exit(overallAccuracy >= 0.85 ? 0 : 1); + // Exit code: 0 only when nothing hard-failed AND accuracy clears 85%. A + // crashed/OOMed sheet contributes ZERO tested cells, so accuracy alone is a + // dishonest gate: a run where the 17-sheet cluster child died scored 99.9% + // on the three standalone sheets and exited 0 — a confident wrong summary. + const hardFailures = sheetsError + sheetsOom; + if (hardFailures > 0) { + console.log(` ! ${hardFailures} sheet(s) crashed/errored — exiting non-zero regardless of tested-cell accuracy.`); + } + process.exit(hardFailures === 0 && overallAccuracy >= 0.85 ? 
0 : 1); } main().catch(err => { diff --git a/package.json b/package.json index abdcd95..3b87541 100644 --- a/package.json +++ b/package.json @@ -47,7 +47,7 @@ "test:onboarding": "node tests/cli/test-onboarding.mjs", "test:golden": "node tests/cli/test-golden-master.mjs", "golden": "node eval/golden-master.mjs", - "test": "node tests/lib/test-lib.mjs && node tests/lib/test-scope-plan.mjs && node tests/lib/test-cone-emit.mjs && node tests/lib/test-lite-harness.mjs && node tests/lib/test-lite-tier0.mjs && node tests/lib/test-lite-tier0-generic.mjs && node tests/lib/test-driver-scope.mjs && node tests/lib/test-tier-recommender.mjs && node tests/lib/test-lite-evaluators.mjs && node tests/lib/test-lite-surrogate.mjs && node tests/lib/test-lite-provenance.mjs && node tests/lib/test-artifact-hash.mjs && node tests/cli/test-colrow-ref.mjs && node pipelines/rust/tests/test-date-axis-sumifs.mjs && node pipelines/rust/tests/test-shared-formula-anchors.mjs && node pipelines/rust/tests/test-structure-fidelity.mjs &&node pipelines/rust/tests/test-iferror-infinity.mjs && node pipelines/rust/tests/test-circular-honesty.mjs && node pipelines/rust/tests/test-cluster-transient-div0.mjs && node pipelines/rust/tests/test-div-nan-propagation.mjs && node pipelines/rust/tests/test-cluster-divergent-cap.mjs && node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-model-type.mjs && node tests/cli/test-dedupe-inputs.mjs && node tests/cli/test-refine-label-index.mjs && node tests/cli/test-init-shared-gt.mjs && node tests/cli/test-per-sheet-eval.mjs && node pipelines/rust/tests/test-per-sheet-eval-intracycle.mjs && node pipelines/rust/tests/test-per-sheet-eval-lockstep.mjs && node pipelines/rust/tests/test-per-sheet-eval-cluster-size.mjs && node pipelines/rust/tests/test-row-chunked-modules.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs 
&& node tests/cli/use-case-suite.mjs && node tests/cli/test-onboarding.mjs && node tests/cli/test-lite-e2e.mjs && node tests/cli/test-lite-byrequest-disclosure.mjs", + "test": "node tests/lib/test-lib.mjs && node tests/lib/test-scope-plan.mjs && node tests/lib/test-cone-emit.mjs && node tests/lib/test-lite-harness.mjs && node tests/lib/test-lite-tier0.mjs && node tests/lib/test-lite-tier0-generic.mjs && node tests/lib/test-driver-scope.mjs && node tests/lib/test-tier-recommender.mjs && node tests/lib/test-lite-evaluators.mjs && node tests/lib/test-lite-surrogate.mjs && node tests/lib/test-lite-provenance.mjs && node tests/lib/test-artifact-hash.mjs && node tests/cli/test-colrow-ref.mjs && node pipelines/rust/tests/test-date-axis-sumifs.mjs && node pipelines/rust/tests/test-shared-formula-anchors.mjs && node pipelines/rust/tests/test-structure-fidelity.mjs &&node pipelines/rust/tests/test-iferror-infinity.mjs && node pipelines/rust/tests/test-circular-honesty.mjs && node pipelines/rust/tests/test-cluster-transient-div0.mjs && node pipelines/rust/tests/test-div-nan-propagation.mjs && node pipelines/rust/tests/test-cluster-divergent-cap.mjs && node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-model-type.mjs && node tests/cli/test-dedupe-inputs.mjs && node tests/cli/test-refine-label-index.mjs && node tests/cli/test-init-shared-gt.mjs && node tests/cli/test-per-sheet-eval.mjs && node pipelines/rust/tests/test-per-sheet-eval-intracycle.mjs && node pipelines/rust/tests/test-per-sheet-eval-lockstep.mjs && node pipelines/rust/tests/test-per-sheet-eval-cluster-size.mjs && node pipelines/rust/tests/test-row-chunked-modules.mjs && node pipelines/rust/tests/test-per-sheet-eval-exit-honesty.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs && node tests/cli/use-case-suite.mjs && node tests/cli/test-onboarding.mjs && 
node tests/cli/test-lite-e2e.mjs && node tests/cli/test-lite-byrequest-disclosure.mjs", "bench": "node benchmarks/bench.mjs" }, "devDependencies": {} diff --git a/pipelines/rust/tests/test-per-sheet-eval-exit-honesty.mjs b/pipelines/rust/tests/test-per-sheet-eval-exit-honesty.mjs new file mode 100644 index 0000000..df3cc43 --- /dev/null +++ b/pipelines/rust/tests/test-per-sheet-eval-exit-honesty.mjs @@ -0,0 +1,87 @@ +#!/usr/bin/env node +/** + * Regression: per-sheet-eval must exit NON-ZERO when any sheet hard-fails + * (crash/OOM/error), regardless of tested-cell accuracy. + * + * A crashed sheet contributes ZERO tested cells, so the old exit gate + * (accuracy >= 85%) never saw it: the real A-1 run where the 17-sheet cluster + * child OOMed its heap scored 99.9% on the three surviving standalone sheets + * and exited 0 — a confident wrong summary from the canonical harness. + * + * This test builds a cluster + standalone model through the REAL rust-parser + * and kills the cluster child via a 10ms EVAL_CLUSTER_TIMEOUT_MS: + * - pre-fix: standalone scores 100% -> exit 0 despite the dead cluster (RED) + * - post-fix: hard failure forces exit 1; the report still records the + * standalone accuracy and the cluster's crash/oom status (honest, visible) + * + * Needs the rust-parser binary. Skips (exit 0) if it isn't built. + * + * Usage: node pipelines/rust/tests/test-per-sheet-eval-exit-honesty.mjs + */ + +import XLSX from 'xlsx'; +import { writeFileSync, existsSync, readFileSync, mkdtempSync, rmSync } from 'fs'; +import { join, dirname } from 'path'; +import { fileURLToPath } from 'url'; +import { tmpdir } from 'os'; +import { execFileSync } from 'child_process'; + +const __dir = dirname(fileURLToPath(import.meta.url)); +const ROOT = join(__dir, '..', '..', '..'); +const exe = process.platform === 'win32' ? 
'.exe' : ''; +const PARSER = [ + join(ROOT, 'pipelines/rust/target/release', `rust-parser${exe}`), + join(ROOT, 'pipelines/rust/target/debug', `rust-parser${exe}`), +].find(existsSync); +const EVAL = join(ROOT, 'eval', 'per-sheet-eval.mjs'); + +if (!PARSER) { + console.log('SKIP: rust-parser not built (cd pipelines/rust && cargo build --release)'); + process.exit(0); +} + +let passed = 0, failed = 0; +const assert = (c, m) => { if (c) { passed++; } else { failed++; console.error(` FAIL: ${m}`); } }; +const S = (ref, cells) => { const s = { '!ref': ref }; for (const [k, v] of Object.entries(cells)) s[k] = v; return s; }; +const n = (v, f) => (f ? { t: 'n', v, f } : { t: 'n', v }); + +console.log('Testing: a hard-failed sheet forces a non-zero exit (accuracy alone is a dishonest gate)'); + +// Bal<->Debt form a cluster (its child gets killed); Calc is a healthy standalone. +const Bal = S('A1:A1', { A1: n(2, '1+0.5*Debt!A1') }); +const Debt = S('A1:A2', { A1: n(2, 'Bal!A1'), A2: n(4, 'Debt!A1+Bal!A1') }); +const Calc = S('A1:A2', { A1: n(7), A2: n(14, 'A1*2') }); + +const tmp = mkdtempSync(join(tmpdir(), 'pse-exit-')); +let exitCode = 0, report = null; +try { + writeFileSync(join(tmp, 'm.xlsx'), XLSX.write({ SheetNames: ['Bal', 'Debt', 'Calc'], Sheets: { Bal, Debt, Calc } }, { type: 'buffer', bookType: 'xlsx' })); + execFileSync(PARSER, [join(tmp, 'm.xlsx'), join(tmp, 'out'), '--chunked'], { encoding: 'utf-8', stdio: 'pipe' }); + const out = join(tmp, 'report.json'); + try { + execFileSync('node', [EVAL, join(tmp, 'out', 'chunked'), '--output', out], { + encoding: 'utf-8', stdio: 'pipe', maxBuffer: 64 * 1024 * 1024, + env: { ...process.env, EVAL_CLUSTER_TIMEOUT_MS: '10' }, // kill the cluster child mid-boot + }); + } catch (e) { exitCode = e.status ?? 1; } + report = existsSync(out) ? 
JSON.parse(readFileSync(out, 'utf-8')) : null; +} finally { + rmSync(tmp, { recursive: true, force: true }); +} + +assert(report !== null, 'report written despite the dead cluster'); +if (report) { + const calc = report.sheets.find(s => s.name === 'Calc'); + assert(calc && calc.status === 'ok' && calc.accuracy === 100, + `healthy standalone still scored (got ${calc ? `${calc.status}/${calc.accuracy}%` : 'no row'})`); + const dead = report.sheets.filter(s => ['crash', 'oom', 'error'].includes(s.status)); + assert(dead.length === 2, + `both cluster members recorded as hard-failed (got ${dead.length}: ${report.sheets.map(s => `${s.name}=${s.status}`).join(', ')})`); + assert(report.summary.clustersConverged === 0, `cluster not reported converged (got ${report.summary.clustersConverged})`); +} +assert(exitCode !== 0, + `eval exits NON-ZERO when a sheet hard-fails (got exit ${exitCode} — pre-fix the 100% standalone hid the dead cluster)`); + +console.log(''); +console.log(`Results: ${passed} passed, ${failed} failed, ${passed + failed} total`); +process.exit(failed > 0 ? 1 : 0);