From c5610c3cfde02581a8fc38fc3f0953d192ad3af1 Mon Sep 17 00:00:00 2001
From: hlin99 <tony.lin@intel.com>
Date: Mon, 6 Apr 2026 14:51:08 +0800
Subject: [PATCH] feat(M85): offline file-based comparison without endpoints

- file_compare.py: load_outputs(), run_file_compare(), format_file_compare()
- JSONL format: {id, output, logprobs?} per line
- Full batch comparison pipeline (matching, classification, statistics)
- CLI subcommand compare-files with --json/--csv/--markdown/--junit export
- Match config support: --normalize-whitespace, --ignore-case, --numeric-tolerance
- 20 tests covering loading, comparison, exports, edge cases, CLI

Closes #183
---
 ROADMAP.md                   |   4 +-
 docs/iterations/current.md   |   1 +
 src/xpyd_acc/cli/__init__.py |   2 +
 src/xpyd_acc/cli/analysis.py |  51 +++++++++
 src/xpyd_acc/cli/parsers.py  |  26 +++++
 src/xpyd_acc/file_compare.py | 199 +++++++++++++++++++++++++++++++++++
 tests/test_file_compare.py   | 197 ++++++++++++++++++++++++++++++++++
 7 files changed, 478 insertions(+), 2 deletions(-)
 create mode 100644 src/xpyd_acc/file_compare.py
 create mode 100644 tests/test_file_compare.py
diff --git a/ROADMAP.md b/ROADMAP.md
index ce8aa86..82691a5 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -787,7 +787,7 @@
 - `repl.py` module: `run_repl()`, `ReplSession`, `ReplCommand`
 - 12 tests covering session state, command parsing, export, edge cases, CLI integration
 
-## M83: Divergence Heatmap by Token Position
+## M83: Divergence Heatmap by Token Position ✅
 - `xpyd-acc heatmap --report <path>` analyzes divergence frequency by token position across all samples
 - Bin token positions into configurable buckets (e.g., 0-10, 10-50, 50-100, 100+)
 - Per-bucket: divergence count, divergence rate, avg logprob gap
@@ -799,7 +799,7 @@
 - `heatmap.py` module: `compute_heatmap()`, `HeatmapReport`, `format_heatmap()`
 - 12 tests covering bucket computation, edge cases, formatting, JSON export, CLI integration
 
-## M84: Endpoint Response Time Regression Detection
+## M84: Endpoint Response Time Regression Detection ✅
 - `xpyd-acc latency-regression --old <benchmark.json> --new <benchmark.json>` compares latency benchmarks
 - Welch's t-test for statistical significance of latency changes
 - Reports: mean diff, p-value, effect size (Cohen's d), verdict (faster/slower/unchanged)
diff --git a/docs/iterations/current.md b/docs/iterations/current.md
index dc8052a..5495755 100644
--- a/docs/iterations/current.md
+++ b/docs/iterations/current.md
@@ -50,3 +50,4 @@ shell for exploratory comparison of two endpoints.
 | M82 | 2026-04-06 | Interactive REPL for Exploratory Comparison | ✅ merged | Both approved |
 | M83 | 2026-04-06 | Divergence Heatmap by Token Position | ✅ merged | Both approved |
 | M84 | 2026-04-06 | Endpoint Response Time Regression Detection | ✅ merged | Both approved |
+| M85 | 2026-04-06 | Offline Mode — File-Based Comparison | ⏳ pending review | — |
diff --git a/src/xpyd_acc/cli/__init__.py b/src/xpyd_acc/cli/__init__.py
index 55dc3a4..41d555d 100644
--- a/src/xpyd_acc/cli/__init__.py
+++ b/src/xpyd_acc/cli/__init__.py
@@ -13,6 +13,7 @@
 # via "xpyd_acc.cli._run_compare_logprobs" etc.
 from .analysis import (
     _run_entropy,
+    _run_file_compare,
     _run_fingerprint,
     _run_latency_regression,
     _run_length_bias,
@@ -137,6 +138,7 @@ def main(argv: list[str] | None = None) -> None:
         "auto-threshold": lambda: _run_auto_threshold(args),
         "repl": lambda: _run_repl(args),
         "latency-regression": lambda: _run_latency_regression(args),
+        "compare-files": lambda: _run_file_compare(args),
     }
 
     if args.command in _early:
diff --git a/src/xpyd_acc/cli/analysis.py b/src/xpyd_acc/cli/analysis.py
index 41c9224..738b538 100644
--- a/src/xpyd_acc/cli/analysis.py
+++ b/src/xpyd_acc/cli/analysis.py
@@ -297,3 +297,54 @@ def handle_heatmap(args: argparse.Namespace) -> None:
     if args.json:
         heatmap.to_json(args.json)
         print(f"\nHeatmap exported to {args.json}")
+
+
+def _run_file_compare(args: argparse.Namespace) -> None:
+    """Run offline file-based comparison; exits with status 1 on any divergence."""
+    import json as _json
+    from pathlib import Path
+
+    from xpyd_acc.file_compare import (
+        format_file_compare,
+        load_outputs,
+        run_file_compare,
+    )
+    from xpyd_acc.output_compare import MatchConfig
+
+    baseline_outputs = load_outputs(Path(args.baseline))
+    target_outputs = load_outputs(Path(args.target))
+
+    match_config = MatchConfig(
+        normalize_whitespace=getattr(args, "normalize_whitespace", False),
+        ignore_case=getattr(args, "ignore_case", False),
+        numeric_tolerance=getattr(args, "numeric_tolerance", None),
+    )
+
+    report = run_file_compare(
+        baseline_outputs,
+        target_outputs,
+        match_config=match_config,
+    )
+
+    print(format_file_compare(report))
+
+    if getattr(args, "json", None):
+        with open(args.json, "w", encoding="utf-8") as f:  # not the locale default
+            _json.dump(report.to_json(), f, indent=2)
+        print(f"\nJSON exported to {args.json}")
+
+    if getattr(args, "csv", None):
+        report.to_csv(args.csv)
+        print(f"CSV exported to {args.csv}")
+
+    if getattr(args, "markdown", None):
+        md = report.to_markdown()  # presumably embeds sample text; keep it UTF-8
+        Path(args.markdown).write_text(md, encoding="utf-8")
+        print(f"Markdown exported to {args.markdown}")
+
+    if getattr(args, "junit", None):
+        report.to_junit(args.junit)
+        print(f"JUnit XML exported to {args.junit}")
+
+    if report.divergent_samples > 0:
+        raise SystemExit(1)  # non-zero status reports divergence to the shell
diff --git a/src/xpyd_acc/cli/parsers.py b/src/xpyd_acc/cli/parsers.py
index e503f02..db7f6d2 100644
--- a/src/xpyd_acc/cli/parsers.py
+++ b/src/xpyd_acc/cli/parsers.py
@@ -52,6 +52,7 @@ def register_all(sub: argparse._SubParsersAction) -> None:
     _register_repl(sub)
     _register_latency_regression(sub)
     _register_heatmap(sub)
+    _register_file_compare(sub)
 def _register_compare(sub):
     lp = sub.add_parser("compare-logprobs", help="Compare logprobs between two endpoints")
     lp.add_argument("--baseline", required=True, help="Baseline endpoint URL")
@@ -636,3 +637,28 @@ def _register_heatmap(sub):
         help="Number of position buckets (default: 10)",
     )
     hm.add_argument("--json", default=None, help="Export heatmap as JSON to this path")
+
+
+def _register_file_compare(sub):
+    """Register the ``compare-files`` subcommand and its options on *sub*."""
+    cmd = sub.add_parser(
+        "compare-files",
+        help="Compare pre-collected outputs from JSONL files (offline mode)",
+    )
+    for side in ("baseline", "target"):
+        cmd.add_argument(
+            f"--{side}", required=True, help=f"Path to {side} outputs JSONL"
+        )
+    fmts = {"json": "JSON", "csv": "CSV", "markdown": "Markdown", "junit": "JUnit XML"}
+    for flag, label in fmts.items():
+        cmd.add_argument(f"--{flag}", default=None, help=f"Export report as {label}")
+    switches = {
+        "--normalize-whitespace": "Normalize whitespace before comparison",
+        "--ignore-case": "Case-insensitive comparison",
+    }
+    for option, text in switches.items():
+        cmd.add_argument(option, action="store_true", default=False, help=text)
+    cmd.add_argument(
+        "--numeric-tolerance", type=float, default=None,
+        help="Numeric tolerance for matching",
+    )
diff --git a/src/xpyd_acc/file_compare.py b/src/xpyd_acc/file_compare.py
new file mode 100644
index 0000000..2aace82
--- /dev/null
+++ b/src/xpyd_acc/file_compare.py
@@ -0,0 +1,199 @@
+"""Offline file-based comparison: compare pre-collected outputs without endpoints."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+from xpyd_acc.batch_compare import BatchReport, SampleResult, compute_report
+from xpyd_acc.log import get_logger
+from xpyd_acc.output_compare import MatchConfig, normalized_match
+
+logger = get_logger("file_compare")
+
+
+@dataclass
+class FileOutput:
+    """One sample read from a JSONL outputs file."""
+
+    id: str  # sample identifier; baseline and target entries are paired on it
+    output: str  # output text recorded for this sample
+    logprobs: list[float] | None = None  # optional logprobs; None when absent
+
+
+def load_outputs(path: Path) -> list[FileOutput]:
+    """Load outputs from a UTF-8 JSONL file (a leading BOM is tolerated).
+
+    Each non-blank line must be a JSON object with at least ``id`` and ``output``.
+    An optional ``logprobs`` field (list of floats) is passed through unvalidated.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+        ValueError: On bad JSON/UTF-8, a non-object line, a missing field or no samples.
+    """
+    if not path.exists():
+        raise FileNotFoundError(f"File not found: {path}")
+
+    outputs: list[FileOutput] = []
+    with open(path, encoding="utf-8-sig") as f:  # fixed codec, not the locale default
+        for line_num, line in enumerate(f, 1):
+            line = line.strip()
+            if not line:
+                continue  # blank lines are skipped but still counted in line_num
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError as exc:
+                raise ValueError(
+                    f"{path}:{line_num}: invalid JSON: {exc}"
+                ) from exc
+
+            if not isinstance(obj, dict):
+                typ = type(obj).__name__
+                raise ValueError(
+                    f"{path}:{line_num}: expected JSON object, got {typ}"
+                )
+            if "id" not in obj:
+                raise ValueError(f"{path}:{line_num}: missing required field 'id'")
+            if "output" not in obj:
+                raise ValueError(f"{path}:{line_num}: missing required field 'output'")
+
+            outputs.append(FileOutput(
+                id=str(obj["id"]),  # non-string values are coerced with str()
+                output=str(obj["output"]),
+                logprobs=obj.get("logprobs"),
+            ))
+
+    if not outputs:
+        raise ValueError(f"No samples found in {path}")
+
+    return outputs
+
+
+def _estimate_context_length(text: str) -> int:
+    """Approximate the token count of *text* as word count / 0.75, never below 1."""
+    word_count = len(text.split())
+    return max(int(word_count / 0.75), 1)
+
+
+def run_file_compare(
+    baseline_outputs: list[FileOutput],
+    target_outputs: list[FileOutput],
+    *,
+    match_config: MatchConfig | None = None,
+    logprob_gap_threshold: float = 0.1,
+) -> BatchReport:
+    """Compare baseline and target outputs loaded from files.
+
+    Samples are paired by ID; each list must hold the same set of unique IDs.
+
+    Returns:
+        A :class:`BatchReport` with comparison results.
+
+    Raises:
+        ValueError: If a list repeats an ID or the two ID sets differ.
+    """
+    if match_config is None:
+        match_config = MatchConfig()
+
+    baseline_map = {o.id: o for o in baseline_outputs}
+    target_map = {o.id: o for o in target_outputs}
+    if len(baseline_map) != len(baseline_outputs):  # dict collapsed a repeated ID
+        raise ValueError("Duplicate sample IDs in baseline")
+    if len(target_map) != len(target_outputs):
+        raise ValueError("Duplicate sample IDs in target")
+
+    if baseline_map.keys() != target_map.keys():
+        only_baseline = baseline_map.keys() - target_map.keys()
+        only_target = target_map.keys() - baseline_map.keys()
+        parts = []
+        if only_baseline:
+            parts.append(f"only in baseline: {sorted(only_baseline)[:5]}")
+        if only_target:
+            parts.append(f"only in target: {sorted(only_target)[:5]}")
+        raise ValueError(f"Sample ID mismatch: {'; '.join(parts)}")
+
+    results: list[SampleResult] = []
+    for sample_id in sorted(baseline_map):
+        bl = baseline_map[sample_id]
+        tg = target_map[sample_id]
+
+        exact = normalized_match(bl.output, tg.output, match_config)
+
+        # First differing whitespace-delimited word (raw text; ignores match_config)
+        first_div_idx: int | None = None
+        if not exact:
+            bl_tokens = bl.output.split()
+            tg_tokens = tg.output.split()
+            for i, (bt, tt) in enumerate(zip(bl_tokens, tg_tokens)):
+                if bt != tt:
+                    first_div_idx = i
+                    break
+            else:
+                # No differing pair: one side is a prefix, or only spacing differs
+                first_div_idx = min(len(bl_tokens), len(tg_tokens))
+
+        # Logprob gap at that index; assumes one logprob per word — TODO confirm
+        bl_logprob: float | None = None
+        tg_logprob: float | None = None
+        logprob_gap: float | None = None
+        if first_div_idx is not None and bl.logprobs and tg.logprobs:
+            if first_div_idx < len(bl.logprobs):
+                bl_logprob = bl.logprobs[first_div_idx]
+            if first_div_idx < len(tg.logprobs):
+                tg_logprob = tg.logprobs[first_div_idx]
+            if bl_logprob is not None and tg_logprob is not None:
+                logprob_gap = abs(bl_logprob - tg_logprob)
+
+        # Classification: "unknown" when no gap could be computed for a mismatch
+        if exact:
+            classification = "match"
+        elif logprob_gap is not None:
+            if logprob_gap >= logprob_gap_threshold:
+                classification = "likely_bug"
+            else:
+                classification = "likely_uncertainty"
+        else:
+            classification = "unknown"
+
+        results.append(SampleResult(
+            sample_id=sample_id,
+            prompt=f"[file:{sample_id}]",
+            baseline_output=bl.output,
+            target_output=tg.output,
+            exact_match=exact,
+            first_divergence_index=first_div_idx,
+            baseline_logprob_at_divergence=bl_logprob,
+            target_logprob_at_divergence=tg_logprob,
+            logprob_gap=logprob_gap,
+            classification=classification,
+            context_length=_estimate_context_length(bl.output),
+        ))
+
+    return compute_report(results, logprob_gap_threshold=logprob_gap_threshold)
+
+
+def format_file_compare(report: BatchReport) -> str:
+    """Render the headline figures of *report* as plain text for the terminal.
+
+    Category counts are listed only when non-zero, the two averages only when
+    they are not ``None``; rows are joined with newlines.
+    """
+    rows = ["═══ File Comparison Report ═══", ""]
+    rows.append(f"Total samples:     {report.total_samples}")
+    rows.append(f"Matching:          {report.match_samples}")
+    rows.append(f"Divergent:         {report.divergent_samples}")
+    rows.append(f"Divergence rate:   {report.divergence_rate:.1%}")
+
+    optional_counts = (
+        ("Likely bugs:       ", report.likely_bugs),
+        ("Likely uncertainty: ", report.likely_uncertainty),
+        ("Unknown:           ", report.unknown_classification),
+    )
+    rows.extend(f"{label}{count}" for label, count in optional_counts if count)
+    avg_idx, avg_gap = report.divergence_index_mean, report.logprob_gap_mean
+    if avg_idx is not None:
+        rows.append(f"Avg divergence idx: {avg_idx:.1f}")
+    if avg_gap is not None:
+        rows.append(f"Avg logprob gap:   {avg_gap:.4f}")
+    return "\n".join(rows)
diff --git a/tests/test_file_compare.py b/tests/test_file_compare.py
new file mode 100644
index 0000000..e75813f
--- /dev/null
+++ b/tests/test_file_compare.py
@@ -0,0 +1,197 @@
+"""Tests for file_compare module (M85: Offline File-Based Comparison)."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+from xpyd_acc.file_compare import (
+    FileOutput,
+    _estimate_context_length,
+    format_file_compare,
+    load_outputs,
+    run_file_compare,
+)
+from xpyd_acc.output_compare import MatchConfig
+
+
+def _write_jsonl(path: Path, items: list[dict]) -> None:
+    """Serialise *items* to *path* as JSONL, one JSON document per line."""
+    with open(path, "w") as handle:
+        serialised = (json.dumps(item) + "\n" for item in items)
+        handle.writelines(serialised)
+
+
+class TestLoadOutputs:
+    """Tests for load_outputs(): parsing, error messages and blank-line handling."""
+
+    def test_basic_load(self, tmp_path: Path) -> None:
+        p = tmp_path / "outputs.jsonl"
+        _write_jsonl(p, [
+            {"id": "s1", "output": "hello world"},
+            {"id": "s2", "output": "foo bar", "logprobs": [-0.1, -0.3]},
+        ])
+        result = load_outputs(p)
+        assert len(result) == 2
+        assert result[0].id == "s1"
+        assert result[0].output == "hello world"
+        assert result[0].logprobs is None  # field absent on the first line
+        assert result[1].logprobs == [-0.1, -0.3]
+
+    def test_file_not_found(self, tmp_path: Path) -> None:
+        with pytest.raises(FileNotFoundError):
+            load_outputs(tmp_path / "missing.jsonl")
+
+    def test_missing_id(self, tmp_path: Path) -> None:
+        p = tmp_path / "bad.jsonl"
+        _write_jsonl(p, [{"output": "hello"}])
+        with pytest.raises(ValueError, match="missing required field 'id'"):
+            load_outputs(p)
+
+    def test_missing_output(self, tmp_path: Path) -> None:
+        p = tmp_path / "bad.jsonl"
+        _write_jsonl(p, [{"id": "s1"}])
+        with pytest.raises(ValueError, match="missing required field 'output'"):
+            load_outputs(p)
+
+    def test_invalid_json(self, tmp_path: Path) -> None:
+        p = tmp_path / "bad.jsonl"
+        p.write_text("not json\n")
+        with pytest.raises(ValueError, match="invalid JSON"):
+            load_outputs(p)
+
+    def test_empty_file(self, tmp_path: Path) -> None:
+        p = tmp_path / "empty.jsonl"
+        p.write_text("\n\n")  # only blank lines, so no sample is loaded
+        with pytest.raises(ValueError, match="No samples found"):
+            load_outputs(p)
+
+    def test_skips_blank_lines(self, tmp_path: Path) -> None:
+        p = tmp_path / "outputs.jsonl"
+        p.write_text('\n{"id":"s1","output":"hi"}\n\n')
+        result = load_outputs(p)
+        assert len(result) == 1
+
+
+class TestRunFileCompare:
+    """Tests for run_file_compare(): matching, divergence index, classification."""
+
+    def test_all_match(self) -> None:
+        bl = [FileOutput("s1", "hello"), FileOutput("s2", "world")]
+        tg = [FileOutput("s1", "hello"), FileOutput("s2", "world")]
+        report = run_file_compare(bl, tg)
+        assert report.total_samples == 2
+        assert report.divergent_samples == 0
+        assert report.divergence_rate == 0.0
+
+    def test_divergence_detected(self) -> None:
+        bl = [FileOutput("s1", "hello world")]
+        tg = [FileOutput("s1", "hello earth")]
+        report = run_file_compare(bl, tg)
+        assert report.divergent_samples == 1
+        assert report.results[0].first_divergence_index == 1  # word index
+
+    def test_id_mismatch_raises(self) -> None:
+        bl = [FileOutput("s1", "hello")]
+        tg = [FileOutput("s2", "hello")]
+        with pytest.raises(ValueError, match="Sample ID mismatch"):
+            run_file_compare(bl, tg)
+
+    def test_with_logprobs_likely_bug(self) -> None:
+        bl = [FileOutput("s1", "a b c", logprobs=[-0.01, -0.5, -0.01])]
+        tg = [FileOutput("s1", "a x c", logprobs=[-0.01, -0.01, -0.01])]
+        report = run_file_compare(bl, tg, logprob_gap_threshold=0.1)
+        assert report.results[0].classification == "likely_bug"
+        assert report.results[0].logprob_gap is not None
+        assert report.results[0].logprob_gap == pytest.approx(0.49)  # |-0.5 + 0.01|
+
+    def test_with_logprobs_likely_uncertainty(self) -> None:
+        bl = [FileOutput("s1", "a b c", logprobs=[-0.01, -0.05, -0.01])]
+        tg = [FileOutput("s1", "a x c", logprobs=[-0.01, -0.04, -0.01])]
+        report = run_file_compare(bl, tg, logprob_gap_threshold=0.1)
+        assert report.results[0].classification == "likely_uncertainty"
+
+    def test_no_logprobs_unknown(self) -> None:
+        bl = [FileOutput("s1", "a b")]
+        tg = [FileOutput("s1", "a x")]
+        report = run_file_compare(bl, tg)
+        assert report.results[0].classification == "unknown"
+
+    def test_match_config_ignore_case(self) -> None:
+        bl = [FileOutput("s1", "Hello World")]
+        tg = [FileOutput("s1", "hello world")]
+        cfg = MatchConfig(ignore_case=True)
+        report = run_file_compare(bl, tg, match_config=cfg)
+        assert report.divergent_samples == 0
+
+    def test_prefix_divergence(self) -> None:
+        bl = [FileOutput("s1", "a b c")]
+        tg = [FileOutput("s1", "a b")]
+        report = run_file_compare(bl, tg)
+        assert report.results[0].first_divergence_index == 2  # shorter word count
+
+
+class TestFormatFileCompare:
+    """Tests for format_file_compare(): checks key rows of the rendered text."""
+
+    def test_format_output(self) -> None:
+        bl = [FileOutput("s1", "hello"), FileOutput("s2", "world")]
+        tg = [FileOutput("s1", "hello"), FileOutput("s2", "earth")]
+        report = run_file_compare(bl, tg)
+        text = format_file_compare(report)
+        assert "File Comparison Report" in text
+        assert "Divergent:         1" in text  # exact label padding is asserted
+        assert "50.0%" in text  # one of two samples diverges
+
+
+class TestEstimateContextLength:
+    """Tests for _estimate_context_length(): exact estimate and the floor of 1."""
+
+    def test_basic(self) -> None:
+        assert _estimate_context_length("hello world foo bar") == 5  # int(4 / 0.75)
+
+    def test_empty(self) -> None:
+        assert _estimate_context_length("") == 1  # zero words is clamped to 1
+
+
+class TestCLIIntegration:
+    """Argument-parsing tests for the compare-files subcommand (command is not run)."""
+
+    def test_cli_basic(self, tmp_path: Path) -> None:
+        bl_path = tmp_path / "baseline.jsonl"
+        tg_path = tmp_path / "target.jsonl"
+        _write_jsonl(bl_path, [{"id": "s1", "output": "hello"}])
+        _write_jsonl(tg_path, [{"id": "s1", "output": "hello"}])
+
+        import argparse
+
+        from xpyd_acc.cli.parsers import register_all
+        parser = argparse.ArgumentParser()
+        sub = parser.add_subparsers(dest="command")
+        register_all(sub)
+        args = parser.parse_args([
+            "compare-files", "--baseline", str(bl_path), "--target", str(tg_path),
+        ])
+        assert args.command == "compare-files"
+        assert args.baseline == str(bl_path)
+
+    def test_cli_json_export(self, tmp_path: Path) -> None:
+        bl_path = tmp_path / "baseline.jsonl"
+        tg_path = tmp_path / "target.jsonl"
+        json_out = tmp_path / "report.json"
+        _write_jsonl(bl_path, [{"id": "s1", "output": "hello"}])
+        _write_jsonl(tg_path, [{"id": "s1", "output": "hello"}])
+
+        import argparse
+
+        from xpyd_acc.cli.parsers import register_all
+        parser = argparse.ArgumentParser()
+        sub = parser.add_subparsers(dest="command")
+        register_all(sub)
+        args = parser.parse_args([
+            "compare-files", "--baseline", str(bl_path),
+            "--target", str(tg_path), "--json", str(json_out),
+        ])
+        assert args.json == str(json_out)  # only the parsed path; no file is written