From c5610c3cfde02581a8fc38fc3f0953d192ad3af1 Mon Sep 17 00:00:00 2001 From: hlin99 Date: Mon, 6 Apr 2026 14:51:08 +0800 Subject: [PATCH] feat(M85): offline file-based comparison without endpoints - file_compare.py: load_outputs(), run_file_compare(), format_file_compare() - JSONL format: {id, output, logprobs?} per line - Full batch comparison pipeline (matching, classification, statistics) - CLI subcommand compare-files with --json/--csv/--markdown/--junit export - Match config support: --normalize-whitespace, --ignore-case, --numeric-tolerance - 20 tests covering loading, comparison, exports, edge cases, CLI Closes #183 --- ROADMAP.md | 4 +- docs/iterations/current.md | 1 + src/xpyd_acc/cli/__init__.py | 2 + src/xpyd_acc/cli/analysis.py | 51 +++++++++ src/xpyd_acc/cli/parsers.py | 26 +++++ src/xpyd_acc/file_compare.py | 199 +++++++++++++++++++++++++++++++++++ tests/test_file_compare.py | 197 ++++++++++++++++++++++++++++++++++ 7 files changed, 478 insertions(+), 2 deletions(-) create mode 100644 src/xpyd_acc/file_compare.py create mode 100644 tests/test_file_compare.py diff --git a/ROADMAP.md b/ROADMAP.md index ce8aa86..82691a5 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -787,7 +787,7 @@ - `repl.py` module: `run_repl()`, `ReplSession`, `ReplCommand` - 12 tests covering session state, command parsing, export, edge cases, CLI integration -## M83: Divergence Heatmap by Token Position +## M83: Divergence Heatmap by Token Position ✅ - `xpyd-acc heatmap --report ` analyzes divergence frequency by token position across all samples - Bin token positions into configurable buckets (e.g., 0-10, 10-50, 50-100, 100+) - Per-bucket: divergence count, divergence rate, avg logprob gap @@ -799,7 +799,7 @@ - `heatmap.py` module: `compute_heatmap()`, `HeatmapReport`, `format_heatmap()` - 12 tests covering bucket computation, edge cases, formatting, JSON export, CLI integration -## M84: Endpoint Response Time Regression Detection +## M84: Endpoint Response Time Regression Detection ✅ 
- `xpyd-acc latency-regression --old --new ` compares latency benchmarks - Welch's t-test for statistical significance of latency changes - Reports: mean diff, p-value, effect size (Cohen's d), verdict (faster/slower/unchanged) diff --git a/docs/iterations/current.md b/docs/iterations/current.md index dc8052a..5495755 100644 --- a/docs/iterations/current.md +++ b/docs/iterations/current.md @@ -50,3 +50,4 @@ shell for exploratory comparison of two endpoints. | M82 | 2026-04-06 | Interactive REPL for Exploratory Comparison | ✅ merged | Both approved | | M83 | 2026-04-06 | Divergence Heatmap by Token Position | ✅ merged | Both approved | | M84 | 2026-04-06 | Endpoint Response Time Regression Detection | ✅ merged | Both approved | +| M85 | 2026-04-06 | Offline Mode — File-Based Comparison | ⏳ pending review | — | diff --git a/src/xpyd_acc/cli/__init__.py b/src/xpyd_acc/cli/__init__.py index 55dc3a4..41d555d 100644 --- a/src/xpyd_acc/cli/__init__.py +++ b/src/xpyd_acc/cli/__init__.py @@ -13,6 +13,7 @@ # via "xpyd_acc.cli._run_compare_logprobs" etc. 
from .analysis import ( _run_entropy, + _run_file_compare, _run_fingerprint, _run_latency_regression, _run_length_bias, @@ -137,6 +138,7 @@ def main(argv: list[str] | None = None) -> None: "auto-threshold": lambda: _run_auto_threshold(args), "repl": lambda: _run_repl(args), "latency-regression": lambda: _run_latency_regression(args), + "compare-files": lambda: _run_file_compare(args), } if args.command in _early: diff --git a/src/xpyd_acc/cli/analysis.py b/src/xpyd_acc/cli/analysis.py index 41c9224..738b538 100644 --- a/src/xpyd_acc/cli/analysis.py +++ b/src/xpyd_acc/cli/analysis.py @@ -297,3 +297,54 @@ def handle_heatmap(args: argparse.Namespace) -> None: if args.json: heatmap.to_json(args.json) print(f"\nHeatmap exported to {args.json}") + + +def _run_file_compare(args: argparse.Namespace) -> None: + """Run offline file-based comparison.""" + import json as _json + from pathlib import Path + + from xpyd_acc.file_compare import ( + format_file_compare, + load_outputs, + run_file_compare, + ) + from xpyd_acc.output_compare import MatchConfig + + baseline_outputs = load_outputs(Path(args.baseline)) + target_outputs = load_outputs(Path(args.target)) + + match_config = MatchConfig( + normalize_whitespace=getattr(args, "normalize_whitespace", False), + ignore_case=getattr(args, "ignore_case", False), + numeric_tolerance=getattr(args, "numeric_tolerance", None), + ) + + report = run_file_compare( + baseline_outputs, + target_outputs, + match_config=match_config, + ) + + print(format_file_compare(report)) + + if getattr(args, "json", None): + with open(args.json, "w") as f: + _json.dump(report.to_json(), f, indent=2) + print(f"\nJSON exported to {args.json}") + + if getattr(args, "csv", None): + report.to_csv(args.csv) + print(f"CSV exported to {args.csv}") + + if getattr(args, "markdown", None): + md = report.to_markdown() + Path(args.markdown).write_text(md) + print(f"Markdown exported to {args.markdown}") + + if getattr(args, "junit", None): + 
report.to_junit(args.junit) + print(f"JUnit XML exported to {args.junit}") + + if report.divergent_samples > 0: + raise SystemExit(1) diff --git a/src/xpyd_acc/cli/parsers.py b/src/xpyd_acc/cli/parsers.py index e503f02..db7f6d2 100644 --- a/src/xpyd_acc/cli/parsers.py +++ b/src/xpyd_acc/cli/parsers.py @@ -52,6 +52,7 @@ def register_all(sub: argparse._SubParsersAction) -> None: _register_repl(sub) _register_latency_regression(sub) _register_heatmap(sub) + _register_file_compare(sub) def _register_compare(sub): lp = sub.add_parser("compare-logprobs", help="Compare logprobs between two endpoints") lp.add_argument("--baseline", required=True, help="Baseline endpoint URL") @@ -636,3 +637,28 @@ def _register_heatmap(sub): help="Number of position buckets (default: 10)", ) hm.add_argument("--json", default=None, help="Export heatmap as JSON to this path") + + +def _register_file_compare(sub): + fc = sub.add_parser( + "compare-files", + help="Compare pre-collected outputs from JSONL files (offline mode)", + ) + fc.add_argument("--baseline", required=True, help="Path to baseline outputs JSONL") + fc.add_argument("--target", required=True, help="Path to target outputs JSONL") + fc.add_argument("--json", default=None, help="Export report as JSON") + fc.add_argument("--csv", default=None, help="Export report as CSV") + fc.add_argument("--markdown", default=None, help="Export report as Markdown") + fc.add_argument("--junit", default=None, help="Export report as JUnit XML") + fc.add_argument( + "--normalize-whitespace", action="store_true", default=False, + help="Normalize whitespace before comparison", + ) + fc.add_argument( + "--ignore-case", action="store_true", default=False, + help="Case-insensitive comparison", + ) + fc.add_argument( + "--numeric-tolerance", type=float, default=None, + help="Numeric tolerance for matching", + ) diff --git a/src/xpyd_acc/file_compare.py b/src/xpyd_acc/file_compare.py new file mode 100644 index 0000000..2aace82 --- /dev/null +++ 
b/src/xpyd_acc/file_compare.py @@ -0,0 +1,199 @@ +"""Offline file-based comparison: compare pre-collected outputs without endpoints.""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path + +from xpyd_acc.batch_compare import BatchReport, SampleResult, compute_report +from xpyd_acc.log import get_logger +from xpyd_acc.output_compare import MatchConfig, normalized_match + +logger = get_logger("file_compare") + + +@dataclass +class FileOutput: + """A single output loaded from a JSONL file.""" + + id: str + output: str + logprobs: list[float] | None = None + + +def load_outputs(path: Path) -> list[FileOutput]: + """Load outputs from a JSONL file. + + Each line must be a JSON object with at least ``id`` and ``output`` fields. + An optional ``logprobs`` field (list of floats) is supported. + + Raises: + FileNotFoundError: If the file does not exist. + ValueError: If a line is missing required fields or is not valid JSON. + """ + if not path.exists(): + raise FileNotFoundError(f"File not found: {path}") + + outputs: list[FileOutput] = [] + with open(path) as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except json.JSONDecodeError as exc: + raise ValueError( + f"{path}:{line_num}: invalid JSON: {exc}" + ) from exc + + if not isinstance(obj, dict): + typ = type(obj).__name__ + raise ValueError( + f"{path}:{line_num}: expected JSON object, got {typ}" + ) + if "id" not in obj: + raise ValueError(f"{path}:{line_num}: missing required field 'id'") + if "output" not in obj: + raise ValueError(f"{path}:{line_num}: missing required field 'output'") + + outputs.append(FileOutput( + id=str(obj["id"]), + output=str(obj["output"]), + logprobs=obj.get("logprobs"), + )) + + if not outputs: + raise ValueError(f"No samples found in {path}") + + return outputs + + +def _estimate_context_length(text: str) -> int: + """Rough token count estimate 
(words / 0.75).""" + words = len(text.split()) + return max(1, int(words / 0.75)) + + +def run_file_compare( + baseline_outputs: list[FileOutput], + target_outputs: list[FileOutput], + *, + match_config: MatchConfig | None = None, + logprob_gap_threshold: float = 0.1, +) -> BatchReport: + """Compare baseline and target outputs loaded from files. + + Matches samples by ID. Both lists must contain the same set of IDs. + + Returns: + A :class:`BatchReport` with comparison results. + + Raises: + ValueError: If IDs don't match between baseline and target. + """ + if match_config is None: + match_config = MatchConfig() + + baseline_map = {o.id: o for o in baseline_outputs} + target_map = {o.id: o for o in target_outputs} + + baseline_ids = set(baseline_map.keys()) + target_ids = set(target_map.keys()) + + if baseline_ids != target_ids: + only_baseline = baseline_ids - target_ids + only_target = target_ids - baseline_ids + parts = [] + if only_baseline: + parts.append(f"only in baseline: {sorted(only_baseline)[:5]}") + if only_target: + parts.append(f"only in target: {sorted(only_target)[:5]}") + raise ValueError(f"Sample ID mismatch: {'; '.join(parts)}") + + results: list[SampleResult] = [] + + for sample_id in sorted(baseline_ids): + bl = baseline_map[sample_id] + tg = target_map[sample_id] + + exact = normalized_match(bl.output, tg.output, match_config) + + # Find first divergence index (whitespace-token level) + first_div_idx: int | None = None + if not exact: + bl_tokens = bl.output.split() + tg_tokens = tg.output.split() + for i, (bt, tt) in enumerate(zip(bl_tokens, tg_tokens)): + if bt != tt: + first_div_idx = i + break + else: + # One is a prefix of the other + first_div_idx = min(len(bl_tokens), len(tg_tokens)) + + # Logprob gap at divergence point + bl_logprob: float | None = None + tg_logprob: float | None = None + logprob_gap: float | None = None + if first_div_idx is not None and bl.logprobs and tg.logprobs: + if first_div_idx < len(bl.logprobs): + bl_logprob =
bl.logprobs[first_div_idx] + if first_div_idx < len(tg.logprobs): + tg_logprob = tg.logprobs[first_div_idx] + if bl_logprob is not None and tg_logprob is not None: + logprob_gap = abs(bl_logprob - tg_logprob) + + # Classification + if exact: + classification = "match" + elif logprob_gap is not None: + if logprob_gap >= logprob_gap_threshold: + classification = "likely_bug" + else: + classification = "likely_uncertainty" + else: + classification = "unknown" + + results.append(SampleResult( + sample_id=sample_id, + prompt=f"[file:{sample_id}]", + baseline_output=bl.output, + target_output=tg.output, + exact_match=exact, + first_divergence_index=first_div_idx, + baseline_logprob_at_divergence=bl_logprob, + target_logprob_at_divergence=tg_logprob, + logprob_gap=logprob_gap, + classification=classification, + context_length=_estimate_context_length(bl.output), + )) + + return compute_report(results, logprob_gap_threshold=logprob_gap_threshold) + + +def format_file_compare(report: BatchReport) -> str: + """Format a file comparison report for terminal display.""" + lines = [ + "═══ File Comparison Report ═══", + "", + f"Total samples: {report.total_samples}", + f"Matching: {report.match_samples}", + f"Divergent: {report.divergent_samples}", + f"Divergence rate: {report.divergence_rate:.1%}", + ] + + if report.likely_bugs: + lines.append(f"Likely bugs: {report.likely_bugs}") + if report.likely_uncertainty: + lines.append(f"Likely uncertainty: {report.likely_uncertainty}") + if report.unknown_classification: + lines.append(f"Unknown: {report.unknown_classification}") + + if report.divergence_index_mean is not None: + lines.append(f"Avg divergence idx: {report.divergence_index_mean:.1f}") + if report.logprob_gap_mean is not None: + lines.append(f"Avg logprob gap: {report.logprob_gap_mean:.4f}") + + return "\n".join(lines) diff --git a/tests/test_file_compare.py b/tests/test_file_compare.py new file mode 100644 index 0000000..e75813f --- /dev/null +++ 
b/tests/test_file_compare.py @@ -0,0 +1,197 @@ +"""Tests for file_compare module (M85: Offline File-Based Comparison).""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from xpyd_acc.file_compare import ( + FileOutput, + _estimate_context_length, + format_file_compare, + load_outputs, + run_file_compare, +) +from xpyd_acc.output_compare import MatchConfig + + +def _write_jsonl(path: Path, items: list[dict]) -> None: + """Helper to write JSONL file.""" + with open(path, "w") as f: + for item in items: + f.write(json.dumps(item) + "\n") + + +class TestLoadOutputs: + """Tests for load_outputs().""" + + def test_basic_load(self, tmp_path: Path) -> None: + p = tmp_path / "outputs.jsonl" + _write_jsonl(p, [ + {"id": "s1", "output": "hello world"}, + {"id": "s2", "output": "foo bar", "logprobs": [-0.1, -0.3]}, + ]) + result = load_outputs(p) + assert len(result) == 2 + assert result[0].id == "s1" + assert result[0].output == "hello world" + assert result[0].logprobs is None + assert result[1].logprobs == [-0.1, -0.3] + + def test_file_not_found(self, tmp_path: Path) -> None: + with pytest.raises(FileNotFoundError): + load_outputs(tmp_path / "missing.jsonl") + + def test_missing_id(self, tmp_path: Path) -> None: + p = tmp_path / "bad.jsonl" + _write_jsonl(p, [{"output": "hello"}]) + with pytest.raises(ValueError, match="missing required field 'id'"): + load_outputs(p) + + def test_missing_output(self, tmp_path: Path) -> None: + p = tmp_path / "bad.jsonl" + _write_jsonl(p, [{"id": "s1"}]) + with pytest.raises(ValueError, match="missing required field 'output'"): + load_outputs(p) + + def test_invalid_json(self, tmp_path: Path) -> None: + p = tmp_path / "bad.jsonl" + p.write_text("not json\n") + with pytest.raises(ValueError, match="invalid JSON"): + load_outputs(p) + + def test_empty_file(self, tmp_path: Path) -> None: + p = tmp_path / "empty.jsonl" + p.write_text("\n\n") + with pytest.raises(ValueError, match="No samples 
found"): + load_outputs(p) + + def test_skips_blank_lines(self, tmp_path: Path) -> None: + p = tmp_path / "outputs.jsonl" + p.write_text('\n{"id":"s1","output":"hi"}\n\n') + result = load_outputs(p) + assert len(result) == 1 + + +class TestRunFileCompare: + """Tests for run_file_compare().""" + + def test_all_match(self) -> None: + bl = [FileOutput("s1", "hello"), FileOutput("s2", "world")] + tg = [FileOutput("s1", "hello"), FileOutput("s2", "world")] + report = run_file_compare(bl, tg) + assert report.total_samples == 2 + assert report.divergent_samples == 0 + assert report.divergence_rate == 0.0 + + def test_divergence_detected(self) -> None: + bl = [FileOutput("s1", "hello world")] + tg = [FileOutput("s1", "hello earth")] + report = run_file_compare(bl, tg) + assert report.divergent_samples == 1 + assert report.results[0].first_divergence_index == 1 + + def test_id_mismatch_raises(self) -> None: + bl = [FileOutput("s1", "hello")] + tg = [FileOutput("s2", "hello")] + with pytest.raises(ValueError, match="Sample ID mismatch"): + run_file_compare(bl, tg) + + def test_with_logprobs_likely_bug(self) -> None: + bl = [FileOutput("s1", "a b c", logprobs=[-0.01, -0.5, -0.01])] + tg = [FileOutput("s1", "a x c", logprobs=[-0.01, -0.01, -0.01])] + report = run_file_compare(bl, tg, logprob_gap_threshold=0.1) + assert report.results[0].classification == "likely_bug" + assert report.results[0].logprob_gap is not None + assert report.results[0].logprob_gap == pytest.approx(0.49) + + def test_with_logprobs_likely_uncertainty(self) -> None: + bl = [FileOutput("s1", "a b c", logprobs=[-0.01, -0.05, -0.01])] + tg = [FileOutput("s1", "a x c", logprobs=[-0.01, -0.04, -0.01])] + report = run_file_compare(bl, tg, logprob_gap_threshold=0.1) + assert report.results[0].classification == "likely_uncertainty" + + def test_no_logprobs_unknown(self) -> None: + bl = [FileOutput("s1", "a b")] + tg = [FileOutput("s1", "a x")] + report = run_file_compare(bl, tg) + assert 
report.results[0].classification == "unknown" + + def test_match_config_ignore_case(self) -> None: + bl = [FileOutput("s1", "Hello World")] + tg = [FileOutput("s1", "hello world")] + cfg = MatchConfig(ignore_case=True) + report = run_file_compare(bl, tg, match_config=cfg) + assert report.divergent_samples == 0 + + def test_prefix_divergence(self) -> None: + bl = [FileOutput("s1", "a b c")] + tg = [FileOutput("s1", "a b")] + report = run_file_compare(bl, tg) + assert report.results[0].first_divergence_index == 2 + + +class TestFormatFileCompare: + """Tests for format_file_compare().""" + + def test_format_output(self) -> None: + bl = [FileOutput("s1", "hello"), FileOutput("s2", "world")] + tg = [FileOutput("s1", "hello"), FileOutput("s2", "earth")] + report = run_file_compare(bl, tg) + text = format_file_compare(report) + assert "File Comparison Report" in text + assert "Divergent: 1" in text + assert "50.0%" in text + + +class TestEstimateContextLength: + """Tests for _estimate_context_length().""" + + def test_basic(self) -> None: + assert _estimate_context_length("hello world foo bar") >= 1 + + def test_empty(self) -> None: + assert _estimate_context_length("") >= 1 + + +class TestCLIIntegration: + """Tests for compare-files CLI subcommand.""" + + def test_cli_basic(self, tmp_path: Path) -> None: + bl_path = tmp_path / "baseline.jsonl" + tg_path = tmp_path / "target.jsonl" + _write_jsonl(bl_path, [{"id": "s1", "output": "hello"}]) + _write_jsonl(tg_path, [{"id": "s1", "output": "hello"}]) + + import argparse + + from xpyd_acc.cli.parsers import register_all + parser = argparse.ArgumentParser() + sub = parser.add_subparsers(dest="command") + register_all(sub) + args = parser.parse_args([ + "compare-files", "--baseline", str(bl_path), "--target", str(tg_path), + ]) + assert args.command == "compare-files" + assert args.baseline == str(bl_path) + + def test_cli_json_export(self, tmp_path: Path) -> None: + bl_path = tmp_path / "baseline.jsonl" + tg_path = tmp_path / 
"target.jsonl" + json_out = tmp_path / "report.json" + _write_jsonl(bl_path, [{"id": "s1", "output": "hello"}]) + _write_jsonl(tg_path, [{"id": "s1", "output": "hello"}]) + + import argparse + + from xpyd_acc.cli.parsers import register_all + parser = argparse.ArgumentParser() + sub = parser.add_subparsers(dest="command") + register_all(sub) + args = parser.parse_args([ + "compare-files", "--baseline", str(bl_path), + "--target", str(tg_path), "--json", str(json_out), + ]) + assert args.json == str(json_out)