xPyD-hub · hlin99 · Apr 6, 2026 · Apr 6, 2026
diff --git a/ROADMAP.md b/ROADMAP.md
@@ -787,7 +787,7 @@
 - `repl.py` module: `run_repl()`, `ReplSession`, `ReplCommand`
 - 12 tests covering session state, command parsing, export, edge cases, CLI integration
 
-## M83: Divergence Heatmap by Token Position
+## M83: Divergence Heatmap by Token Position ✅
 - `xpyd-acc heatmap --report <path>` analyzes divergence frequency by token position across all samples
 - Bin token positions into configurable buckets (e.g., 0-10, 10-50, 50-100, 100+)
 - Per-bucket: divergence count, divergence rate, avg logprob gap
@@ -799,7 +799,7 @@
 - `heatmap.py` module: `compute_heatmap()`, `HeatmapReport`, `format_heatmap()`
 - 12 tests covering bucket computation, edge cases, formatting, JSON export, CLI integration
 
-## M84: Endpoint Response Time Regression Detection
+## M84: Endpoint Response Time Regression Detection ✅
 - `xpyd-acc latency-regression --old <benchmark.json> --new <benchmark.json>` compares latency benchmarks
 - Welch's t-test for statistical significance of latency changes
 - Reports: mean diff, p-value, effect size (Cohen's d), verdict (faster/slower/unchanged)

diff --git a/docs/iterations/current.md b/docs/iterations/current.md
@@ -50,3 +50,4 @@ shell for exploratory comparison of two endpoints.
 | M82 | 2026-04-06 | Interactive REPL for Exploratory Comparison | ✅ merged | Both approved |
 | M83 | 2026-04-06 | Divergence Heatmap by Token Position | ✅ merged | Both approved |
 | M84 | 2026-04-06 | Endpoint Response Time Regression Detection | ✅ merged | Both approved |
+| M85 | 2026-04-06 | Offline Mode — File-Based Comparison | ⏳ pending review | — |
diff --git a/src/xpyd_acc/cli/__init__.py b/src/xpyd_acc/cli/__init__.py
@@ -13,6 +13,7 @@
 # via "xpyd_acc.cli._run_compare_logprobs" etc.
 from .analysis import (
     _run_entropy,
+    _run_file_compare,
     _run_fingerprint,
     _run_latency_regression,
     _run_length_bias,
@@ -137,6 +138,7 @@ def main(argv: list[str] | None = None) -> None:
         "auto-threshold": lambda: _run_auto_threshold(args),
         "repl": lambda: _run_repl(args),
         "latency-regression": lambda: _run_latency_regression(args),
+        "compare-files": lambda: _run_file_compare(args),
     }
 
     if args.command in _early:

diff --git a/src/xpyd_acc/cli/analysis.py b/src/xpyd_acc/cli/analysis.py
@@ -297,3 +297,54 @@ def handle_heatmap(args: argparse.Namespace) -> None:
     if args.json:
         heatmap.to_json(args.json)
         print(f"\nHeatmap exported to {args.json}")
+
+
+def _run_file_compare(args: argparse.Namespace) -> None:
+    """Run offline file-based comparison; exit with status 1 if any sample diverges."""
+    import json as _json
+    from pathlib import Path
+
+    from xpyd_acc.file_compare import (
+        format_file_compare,
+        load_outputs,
+        run_file_compare,
+    )
+    from xpyd_acc.output_compare import MatchConfig
+
+    baseline_outputs = load_outputs(Path(args.baseline))
+    target_outputs = load_outputs(Path(args.target))
+
+    match_config = MatchConfig(
+        normalize_whitespace=getattr(args, "normalize_whitespace", False),
+        ignore_case=getattr(args, "ignore_case", False),
+        numeric_tolerance=getattr(args, "numeric_tolerance", None),
+    )
+
+    report = run_file_compare(
+        baseline_outputs,
+        target_outputs,
+        match_config=match_config,
+    )
+
+    print(format_file_compare(report))
+
+    if getattr(args, "json", None):
+        with open(args.json, "w") as f:
+            _json.dump(report.to_json(), f, indent=2)
+        print(f"\nJSON exported to {args.json}")
+
+    if getattr(args, "csv", None):
+        report.to_csv(args.csv)
+        print(f"CSV exported to {args.csv}")
+
+    if getattr(args, "markdown", None):
+        md = report.to_markdown()
+        Path(args.markdown).write_text(md)
+        print(f"Markdown exported to {args.markdown}")
+
+    if getattr(args, "junit", None):
+        report.to_junit(args.junit)
+        print(f"JUnit XML exported to {args.junit}")
+
+    if report.divergent_samples > 0:
+        raise SystemExit(1)
diff --git a/src/xpyd_acc/cli/parsers.py b/src/xpyd_acc/cli/parsers.py
@@ -52,6 +52,7 @@ def register_all(sub: argparse._SubParsersAction) -> None:
     _register_repl(sub)
     _register_latency_regression(sub)
     _register_heatmap(sub)
+    _register_file_compare(sub)
 def _register_compare(sub):
     lp = sub.add_parser("compare-logprobs", help="Compare logprobs between two endpoints")
     lp.add_argument("--baseline", required=True, help="Baseline endpoint URL")
@@ -636,3 +637,28 @@ def _register_heatmap(sub):
         help="Number of position buckets (default: 10)",
     )
     hm.add_argument("--json", default=None, help="Export heatmap as JSON to this path")
+
+
+def _register_file_compare(sub):
+    fc = sub.add_parser(
+        "compare-files",
+        help="Compare pre-collected outputs from JSONL files (offline mode)",
+    )
+    fc.add_argument("--baseline", required=True, help="Path to baseline outputs JSONL")
+    fc.add_argument("--target", required=True, help="Path to target outputs JSONL")
+    fc.add_argument("--json", default=None, help="Export report as JSON")
+    fc.add_argument("--csv", default=None, help="Export report as CSV")
+    fc.add_argument("--markdown", default=None, help="Export report as Markdown")
+    fc.add_argument("--junit", default=None, help="Export report as JUnit XML")
+    fc.add_argument(
+        "--normalize-whitespace", action="store_true", default=False,
+        help="Normalize whitespace before comparison",
+    )
+    fc.add_argument(
+        "--ignore-case", action="store_true", default=False,
+        help="Case-insensitive comparison",
+    )
+    fc.add_argument(
+        "--numeric-tolerance", type=float, default=None,
+        help="Numeric tolerance for matching",
+    )
diff --git a/src/xpyd_acc/file_compare.py b/src/xpyd_acc/file_compare.py
@@ -0,0 +1,199 @@
+"""Offline file-based comparison: compare pre-collected outputs without endpoints."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+from xpyd_acc.batch_compare import BatchReport, SampleResult, compute_report
+from xpyd_acc.log import get_logger
+from xpyd_acc.output_compare import MatchConfig, normalized_match
+
+logger = get_logger("file_compare")
+
+
+@dataclass
+class FileOutput:
+    """A single output loaded from a JSONL file."""
+
+    id: str
+    output: str
+    logprobs: list[float] | None = None
+
+
+def load_outputs(path: Path) -> list[FileOutput]:
+    """Load outputs from a JSONL file.
+
+    Each line must be a JSON object with at least ``id`` and ``output`` fields.
+    An optional ``logprobs`` field (list of floats) is passed through unvalidated.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+        ValueError: On invalid JSON, a non-object line, a missing field, or no samples.
+    """
+    if not path.exists():
+        raise FileNotFoundError(f"File not found: {path}")
+
+    outputs: list[FileOutput] = []
+    with open(path) as f:
+        for line_num, line in enumerate(f, 1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                obj = json.loads(line)
+            except json.JSONDecodeError as exc:
+                raise ValueError(
+                    f"{path}:{line_num}: invalid JSON: {exc}"
+                ) from exc
+
+            if not isinstance(obj, dict):
+                typ = type(obj).__name__
+                raise ValueError(
+                    f"{path}:{line_num}: expected JSON object, got {typ}"
+                )
+            if "id" not in obj:
+                raise ValueError(f"{path}:{line_num}: missing required field 'id'")
+            if "output" not in obj:
+                raise ValueError(f"{path}:{line_num}: missing required field 'output'")
+
+            outputs.append(FileOutput(
+                id=str(obj["id"]),
+                output=str(obj["output"]),
+                logprobs=obj.get("logprobs"),
+            ))
+
+    if not outputs:
+        raise ValueError(f"No samples found in {path}")
+
+    return outputs
+
+
+def _estimate_context_length(text: str) -> int:
+    """Rough token-count estimate: whitespace words / 0.75, never below 1."""
+    words = len(text.split())
+    return max(1, int(words / 0.75))
+
+
+def run_file_compare(
+    baseline_outputs: list[FileOutput],
+    target_outputs: list[FileOutput],
+    *,
+    match_config: MatchConfig | None = None,
+    logprob_gap_threshold: float = 0.1,
+) -> BatchReport:
+    """Compare baseline and target outputs loaded from files.
+
+    Matches by ID (last duplicate wins). Both lists must contain the same set of IDs.
+
+    Returns:
+        A :class:`BatchReport` with comparison results.
+
+    Raises:
+        ValueError: If IDs don't match between baseline and target.
+    """
+    if match_config is None:
+        match_config = MatchConfig()
+
+    baseline_map = {o.id: o for o in baseline_outputs}
+    target_map = {o.id: o for o in target_outputs}
+
+    baseline_ids = set(baseline_map.keys())
+    target_ids = set(target_map.keys())
+
+    if baseline_ids != target_ids:
+        only_baseline = baseline_ids - target_ids
+        only_target = target_ids - baseline_ids
+        parts = []
+        if only_baseline:
+            parts.append(f"only in baseline: {sorted(only_baseline)[:5]}")
+        if only_target:
+            parts.append(f"only in target: {sorted(only_target)[:5]}")
+        raise ValueError(f"Sample ID mismatch: {'; '.join(parts)}")
+
+    results: list[SampleResult] = []
+
+    for sample_id in sorted(baseline_ids):
+        bl = baseline_map[sample_id]
+        tg = target_map[sample_id]
+
+        exact = normalized_match(bl.output, tg.output, match_config)
+
+        # Find first divergence index (whitespace-token level, on raw outputs)
+        first_div_idx: int | None = None
+        if not exact:
+            bl_tokens = bl.output.split()
+            tg_tokens = tg.output.split()
+            for i, (bt, tt) in enumerate(zip(bl_tokens, tg_tokens)):
+                if bt != tt:
+                    first_div_idx = i
+                    break
+            else:
+                # One is a prefix of the other
+                first_div_idx = min(len(bl_tokens), len(tg_tokens))
+
+        # Logprob gap at divergence (assumes one logprob per whitespace token)
+        bl_logprob: float | None = None
+        tg_logprob: float | None = None
+        logprob_gap: float | None = None
+        if first_div_idx is not None and bl.logprobs and tg.logprobs:
+            if first_div_idx < len(bl.logprobs):
+                bl_logprob = bl.logprobs[first_div_idx]
+            if first_div_idx < len(tg.logprobs):
+                tg_logprob = tg.logprobs[first_div_idx]
+            if bl_logprob is not None and tg_logprob is not None:
+                logprob_gap = abs(bl_logprob - tg_logprob)
+
+        # Classification
+        if exact:
+            classification = "match"
+        elif logprob_gap is not None:
+            if logprob_gap >= logprob_gap_threshold:
+                classification = "likely_bug"
+            else:
+                classification = "likely_uncertainty"
+        else:
+            classification = "unknown"
+
+        results.append(SampleResult(
+            sample_id=sample_id,
+            prompt=f"[file:{sample_id}]",
+            baseline_output=bl.output,
+            target_output=tg.output,
+            exact_match=exact,
+            first_divergence_index=first_div_idx,
+            baseline_logprob_at_divergence=bl_logprob,
+            target_logprob_at_divergence=tg_logprob,
+            logprob_gap=logprob_gap,
+            classification=classification,
+            context_length=_estimate_context_length(bl.output),
+        ))
+
+    return compute_report(results, logprob_gap_threshold=logprob_gap_threshold)
+
+
+def format_file_compare(report: BatchReport) -> str:
+    """Format a file comparison report for terminal display."""
+    lines = [
+        "═══ File Comparison Report ═══",
+        "",
+        f"Total samples:     {report.total_samples}",
+        f"Matching:          {report.match_samples}",
+        f"Divergent:         {report.divergent_samples}",
+        f"Divergence rate:   {report.divergence_rate:.1%}",
+    ]
+
+    if report.likely_bugs:
+        lines.append(f"Likely bugs:       {report.likely_bugs}")
+    if report.likely_uncertainty:
+        lines.append(f"Likely uncertainty: {report.likely_uncertainty}")
+    if report.unknown_classification:
+        lines.append(f"Unknown:           {report.unknown_classification}")
+
+    if report.divergence_index_mean is not None:
+        lines.append(f"Avg divergence idx: {report.divergence_index_mean:.1f}")
+    if report.logprob_gap_mean is not None:
+        lines.append(f"Avg logprob gap:   {report.logprob_gap_mean:.4f}")
+
+    return "\n".join(lines)