Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
input cells). **A cluster member is never size-skipped** (`MAX_SHEET_SIZE_MB` silently
dropped the monster sheets from the cluster → partial-cluster wrong fixed point; regression
red pre-fix with `clustersTotal=0`). New `EVAL_CLUSTER_TIMEOUT_MS` (default 60min).
- **per-sheet-eval exit gate is crash-honest** — a hard-failed sheet (crash/OOM/error)
contributes zero tested cells, so the accuracy-only exit gate exited 0 on a run where the
17-sheet cluster child OOMed and only 3 standalone sheets were scored (observed live:
"99.94%, exit 0" with `sheetsWithErrors: 17`). Hard failures now force exit 1 (regression
red pre-fix). Same live run measured the cluster child's true heap need: >12GB (the earlier
6GB probe reading was mid-run, not peak) — set `NODE_HEAP_MB` accordingly.
- **per-sheet-eval dynamic-read scan knows the #66 helpers** — the v0.3.1 emitter lowers
`ref:OFFSET(...)` through `_dynRange`/`_offsetAddr` with no bare `_offset(` call, so the
GT-seed-scoping scan approved scoping on exactly the builds where ranges are runtime-
Expand Down
11 changes: 9 additions & 2 deletions eval/per-sheet-eval.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -812,8 +812,15 @@ process.stdout.write(JSON.stringify({ results, iters: _iters, converged: _conv,
console.log('='.repeat(60));
console.log('');

// Exit code: 0 if >85% accuracy, 1 otherwise
process.exit(overallAccuracy >= 0.85 ? 0 : 1);
// Exit code: 0 only when nothing hard-failed AND accuracy clears 85%. A
// crashed/OOMed sheet contributes ZERO tested cells, so accuracy alone is a
// dishonest gate: a run where the 17-sheet cluster child died scored 99.9%
// on the three standalone sheets and exited 0 — a confident wrong summary.
const hardFailures = sheetsError + sheetsOom;
if (hardFailures > 0) {
console.log(` ! ${hardFailures} sheet(s) crashed/errored — exiting non-zero regardless of tested-cell accuracy.`);
}
process.exit(hardFailures === 0 && overallAccuracy >= 0.85 ? 0 : 1);
}

main().catch(err => {
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
"test:onboarding": "node tests/cli/test-onboarding.mjs",
"test:golden": "node tests/cli/test-golden-master.mjs",
"golden": "node eval/golden-master.mjs",
"test": "node tests/lib/test-lib.mjs && node tests/lib/test-scope-plan.mjs && node tests/lib/test-cone-emit.mjs && node tests/lib/test-lite-harness.mjs && node tests/lib/test-lite-tier0.mjs && node tests/lib/test-lite-tier0-generic.mjs && node tests/lib/test-driver-scope.mjs && node tests/lib/test-tier-recommender.mjs && node tests/lib/test-lite-evaluators.mjs && node tests/lib/test-lite-surrogate.mjs && node tests/lib/test-lite-provenance.mjs && node tests/lib/test-artifact-hash.mjs && node tests/cli/test-colrow-ref.mjs && node pipelines/rust/tests/test-date-axis-sumifs.mjs && node pipelines/rust/tests/test-shared-formula-anchors.mjs && node pipelines/rust/tests/test-structure-fidelity.mjs &&node pipelines/rust/tests/test-iferror-infinity.mjs && node pipelines/rust/tests/test-circular-honesty.mjs && node pipelines/rust/tests/test-cluster-transient-div0.mjs && node pipelines/rust/tests/test-div-nan-propagation.mjs && node pipelines/rust/tests/test-cluster-divergent-cap.mjs && node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-model-type.mjs && node tests/cli/test-dedupe-inputs.mjs && node tests/cli/test-refine-label-index.mjs && node tests/cli/test-init-shared-gt.mjs && node tests/cli/test-per-sheet-eval.mjs && node pipelines/rust/tests/test-per-sheet-eval-intracycle.mjs && node pipelines/rust/tests/test-per-sheet-eval-lockstep.mjs && node pipelines/rust/tests/test-per-sheet-eval-cluster-size.mjs && node pipelines/rust/tests/test-row-chunked-modules.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs && node tests/cli/use-case-suite.mjs && node tests/cli/test-onboarding.mjs && node tests/cli/test-lite-e2e.mjs && node tests/cli/test-lite-byrequest-disclosure.mjs",
"test": "node tests/lib/test-lib.mjs && node tests/lib/test-scope-plan.mjs && node tests/lib/test-cone-emit.mjs && node tests/lib/test-lite-harness.mjs && node tests/lib/test-lite-tier0.mjs && node tests/lib/test-lite-tier0-generic.mjs && node tests/lib/test-driver-scope.mjs && node tests/lib/test-tier-recommender.mjs && node tests/lib/test-lite-evaluators.mjs && node tests/lib/test-lite-surrogate.mjs && node tests/lib/test-lite-provenance.mjs && node tests/lib/test-artifact-hash.mjs && node tests/cli/test-colrow-ref.mjs && node pipelines/rust/tests/test-date-axis-sumifs.mjs && node pipelines/rust/tests/test-shared-formula-anchors.mjs && node pipelines/rust/tests/test-structure-fidelity.mjs &&node pipelines/rust/tests/test-iferror-infinity.mjs && node pipelines/rust/tests/test-circular-honesty.mjs && node pipelines/rust/tests/test-cluster-transient-div0.mjs && node pipelines/rust/tests/test-div-nan-propagation.mjs && node pipelines/rust/tests/test-cluster-divergent-cap.mjs && node tests/cli/test-cli.mjs && node tests/cli/test-manifest-improvements.mjs && node tests/cli/test-manifest-maps.mjs && node tests/cli/test-model-type.mjs && node tests/cli/test-dedupe-inputs.mjs && node tests/cli/test-refine-label-index.mjs && node tests/cli/test-init-shared-gt.mjs && node tests/cli/test-per-sheet-eval.mjs && node pipelines/rust/tests/test-per-sheet-eval-intracycle.mjs && node pipelines/rust/tests/test-per-sheet-eval-lockstep.mjs && node pipelines/rust/tests/test-per-sheet-eval-cluster-size.mjs && node pipelines/rust/tests/test-row-chunked-modules.mjs && node pipelines/rust/tests/test-per-sheet-eval-exit-honesty.mjs && node tests/cli/test-ai-interface.mjs && node tests/cli/test-e2e4-fixes.mjs && node tests/cli/test-ship-ready.mjs && node tests/cli/use-case-suite.mjs && node tests/cli/test-onboarding.mjs && node tests/cli/test-lite-e2e.mjs && node tests/cli/test-lite-byrequest-disclosure.mjs",
"bench": "node benchmarks/bench.mjs"
},
"devDependencies": {}
Expand Down
87 changes: 87 additions & 0 deletions pipelines/rust/tests/test-per-sheet-eval-exit-honesty.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/usr/bin/env node
/**
* Regression: per-sheet-eval must exit NON-ZERO when any sheet hard-fails
* (crash/OOM/error), regardless of tested-cell accuracy.
*
* A crashed sheet contributes ZERO tested cells, so the old exit gate
* (accuracy >= 85%) never saw it: the real A-1 run where the 17-sheet cluster
* child OOMed its heap scored 99.9% on the three surviving standalone sheets
* and exited 0 — a confident wrong summary from the canonical harness.
*
* This test builds a cluster + standalone model through the REAL rust-parser
* and kills the cluster child via a 10ms EVAL_CLUSTER_TIMEOUT_MS:
* - pre-fix: standalone scores 100% -> exit 0 despite the dead cluster (RED)
* - post-fix: hard failure forces exit 1; the report still records the
* standalone accuracy and the cluster's crash/oom status (honest, visible)
*
* Needs the rust-parser binary. Skips (exit 0) if it isn't built.
*
* Usage: node pipelines/rust/tests/test-per-sheet-eval-exit-honesty.mjs
*/

import XLSX from 'xlsx';
import { writeFileSync, existsSync, readFileSync, mkdtempSync, rmSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { tmpdir } from 'os';
import { execFileSync } from 'child_process';

// Locate the repo root relative to this test file (three directories up).
const testDir = dirname(fileURLToPath(import.meta.url));
const repoRoot = join(testDir, '..', '..', '..');

// Windows builds carry an .exe suffix; every other platform has none.
const exeSuffix = process.platform === 'win32' ? '.exe' : '';
const binaryName = `rust-parser${exeSuffix}`;

// Prefer the release build, fall back to the debug build; undefined when
// neither binary exists on disk.
const PARSER = ['pipelines/rust/target/release', 'pipelines/rust/target/debug']
  .map((buildDir) => join(repoRoot, buildDir, binaryName))
  .find((candidate) => existsSync(candidate));

// The harness under test.
const EVAL = join(repoRoot, 'eval', 'per-sheet-eval.mjs');

// Without a parser binary there is nothing to exercise: report a skip and
// exit successfully so an unbuilt checkout does not fail the suite.
if (PARSER === undefined) {
  console.log('SKIP: rust-parser not built (cd pipelines/rust && cargo build --release)');
  process.exit(0);
}

// Running tallies, printed in the final summary and used for the exit code.
let passed = 0;
let failed = 0;

/**
 * Soft assertion: records the outcome instead of throwing, so every check
 * runs and all failures are reported in a single pass.
 * @param {*} c - condition; any truthy value counts as a pass
 * @param {string} m - message printed to stderr when the condition is falsy
 */
const assert = (c, m) => {
  if (!c) {
    failed++;
    console.error(` FAIL: ${m}`);
    return;
  }
  passed++;
};
/**
 * Build a worksheet object: the '!ref' range first, then one property per cell.
 * @param {string} ref - sheet range, e.g. 'A1:A2'
 * @param {Object} cells - map of cell address -> cell object
 * @returns {Object} sheet object with '!ref' plus every entry of `cells`
 */
const S = (ref, cells) =>
  Object.entries(cells).reduce((sheet, [addr, cell]) => {
    sheet[addr] = cell;
    return sheet;
  }, { '!ref': ref });

/**
 * Build a numeric cell; the formula is attached only when one is given.
 * @param {number} v - the cell's value
 * @param {string} [f] - formula text; omitted from the cell when falsy
 * @returns {{t: string, v: number, f?: string}} the cell object
 */
const n = (v, f) => {
  const cell = { t: 'n', v };
  if (f) {
    cell.f = f;
  }
  return cell;
};

console.log('Testing: a hard-failed sheet forces a non-zero exit (accuracy alone is a dishonest gate)');

// Bal<->Debt form a cluster (its child gets killed); Calc is a healthy standalone.
// The cached values are a consistent fixed point: Bal!A1 = 1 + 0.5*2 = 2,
// Debt!A1 = 2, Debt!A2 = 2 + 2 = 4.
const Bal = S('A1:A1', { A1: n(2, '1+0.5*Debt!A1') });
const Debt = S('A1:A2', { A1: n(2, 'Bal!A1'), A2: n(4, 'Debt!A1+Bal!A1') });
const Calc = S('A1:A2', { A1: n(7), A2: n(14, 'A1*2') });

// Everything is written under a throwaway directory that is removed in
// `finally`, so the report must be read back before the cleanup runs.
const tmp = mkdtempSync(join(tmpdir(), 'pse-exit-'));
let exitCode = 0, report = null;
try {
  // 1) Write the three-sheet workbook and run it through the real parser.
  writeFileSync(join(tmp, 'm.xlsx'), XLSX.write({ SheetNames: ['Bal', 'Debt', 'Calc'], Sheets: { Bal, Debt, Calc } }, { type: 'buffer', bookType: 'xlsx' }));
  execFileSync(PARSER, [join(tmp, 'm.xlsx'), join(tmp, 'out'), '--chunked'], { encoding: 'utf-8', stdio: 'pipe' });

  // 2) Run the eval harness with a 10ms cluster timeout. It is spawned with
  //    process.execPath rather than a bare 'node' PATH lookup, so the child is
  //    guaranteed to be the same Node binary that is running this test (and
  //    the spawn cannot fail merely because 'node' is missing from PATH).
  const out = join(tmp, 'report.json');
  try {
    execFileSync(process.execPath, [EVAL, join(tmp, 'out', 'chunked'), '--output', out], {
      encoding: 'utf-8', stdio: 'pipe', maxBuffer: 64 * 1024 * 1024,
      env: { ...process.env, EVAL_CLUSTER_TIMEOUT_MS: '10' }, // kill the cluster child mid-boot
    });
  } catch (e) {
    // A non-zero exit is the expected outcome here, so capture it rather than
    // rethrow. `status` is null when the child did not exit normally (e.g. it
    // was terminated by a signal); treat that as a failure exit too.
    exitCode = e.status ?? 1;
  }

  // 3) The report may be missing if the eval died before writing it; the
  //    later assertions treat null as a failure.
  report = existsSync(out) ? JSON.parse(readFileSync(out, 'utf-8')) : null;
} finally {
  rmSync(tmp, { recursive: true, force: true });
}

// The report must exist even though the cluster child was killed.
assert(report !== null, 'report written despite the dead cluster');

if (report) {
  const { sheets, summary } = report;

  // The standalone sheet is unaffected by the dead cluster and must still score.
  const calc = sheets.find((row) => row.name === 'Calc');
  const calcSeen = calc ? `${calc.status}/${calc.accuracy}%` : 'no row';
  assert(
    calc && calc.status === 'ok' && calc.accuracy === 100,
    `healthy standalone still scored (got ${calcSeen})`,
  );

  // Both cluster members (Bal, Debt) must carry a hard-failure status.
  const hardFailStatuses = new Set(['crash', 'oom', 'error']);
  const dead = sheets.filter((row) => hardFailStatuses.has(row.status));
  const statusListing = sheets.map((row) => `${row.name}=${row.status}`).join(', ');
  assert(
    dead.length === 2,
    `both cluster members recorded as hard-failed (got ${dead.length}: ${statusListing})`,
  );

  // A killed cluster must not be counted as converged.
  assert(
    summary.clustersConverged === 0,
    `cluster not reported converged (got ${summary.clustersConverged})`,
  );
}

// The actual regression: a hard failure must surface in the exit code.
assert(
  exitCode !== 0,
  `eval exits NON-ZERO when a sheet hard-fails (got exit ${exitCode} — pre-fix the 100% standalone hid the dead cluster)`,
);

// Summary line, then exit non-zero if any check failed.
console.log('');
console.log(`Results: ${passed} passed, ${failed} failed, ${passed + failed} total`);
process.exit(failed > 0 ? 1 : 0);
Loading