Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -787,7 +787,7 @@
- `repl.py` module: `run_repl()`, `ReplSession`, `ReplCommand`
- 12 tests covering session state, command parsing, export, edge cases, CLI integration

## M83: Divergence Heatmap by Token Position
## M83: Divergence Heatmap by Token Position
- `xpyd-acc heatmap --report <path>` analyzes divergence frequency by token position across all samples
- Bin token positions into configurable buckets (e.g., 0-10, 10-50, 50-100, 100+)
- Per-bucket: divergence count, divergence rate, avg logprob gap
Expand All @@ -799,7 +799,7 @@
- `heatmap.py` module: `compute_heatmap()`, `HeatmapReport`, `format_heatmap()`
- 12 tests covering bucket computation, edge cases, formatting, JSON export, CLI integration

## M84: Endpoint Response Time Regression Detection
## M84: Endpoint Response Time Regression Detection
- `xpyd-acc latency-regression --old <benchmark.json> --new <benchmark.json>` compares latency benchmarks
- Welch's t-test for statistical significance of latency changes
- Reports: mean diff, p-value, effect size (Cohen's d), verdict (faster/slower/unchanged)
Expand Down
1 change: 1 addition & 0 deletions docs/iterations/current.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,4 @@ shell for exploratory comparison of two endpoints.
| M82 | 2026-04-06 | Interactive REPL for Exploratory Comparison | ✅ merged | Both approved |
| M83 | 2026-04-06 | Divergence Heatmap by Token Position | ✅ merged | Both approved |
| M84 | 2026-04-06 | Endpoint Response Time Regression Detection | ✅ merged | Both approved |
| M85 | 2026-04-06 | Offline Mode — File-Based Comparison | ⏳ pending review | — |
2 changes: 2 additions & 0 deletions src/xpyd_acc/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# via "xpyd_acc.cli._run_compare_logprobs" etc.
from .analysis import (
_run_entropy,
_run_file_compare,
_run_fingerprint,
_run_latency_regression,
_run_length_bias,
Expand Down Expand Up @@ -137,6 +138,7 @@ def main(argv: list[str] | None = None) -> None:
"auto-threshold": lambda: _run_auto_threshold(args),
"repl": lambda: _run_repl(args),
"latency-regression": lambda: _run_latency_regression(args),
"compare-files": lambda: _run_file_compare(args),
}

if args.command in _early:
Expand Down
51 changes: 51 additions & 0 deletions src/xpyd_acc/cli/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,54 @@ def handle_heatmap(args: argparse.Namespace) -> None:
if args.json:
heatmap.to_json(args.json)
print(f"\nHeatmap exported to {args.json}")


def _run_file_compare(args: argparse.Namespace) -> None:
    """Run offline file-based comparison.

    Loads the baseline and target JSONL files named by ``args.baseline`` and
    ``args.target``, compares them, prints the text report, and writes every
    export whose option is set (``json``, ``csv``, ``markdown``, ``junit``).

    Raises:
        SystemExit: With exit code 1 when the report has at least one
            divergent sample.
    """
    import json as _json
    from pathlib import Path

    from xpyd_acc.file_compare import (
        format_file_compare,
        load_outputs,
        run_file_compare,
    )
    from xpyd_acc.output_compare import MatchConfig

    baseline_outputs = load_outputs(Path(args.baseline))
    target_outputs = load_outputs(Path(args.target))

    match_config = MatchConfig(
        normalize_whitespace=getattr(args, "normalize_whitespace", False),
        ignore_case=getattr(args, "ignore_case", False),
        numeric_tolerance=getattr(args, "numeric_tolerance", None),
    )

    report = run_file_compare(
        baseline_outputs,
        target_outputs,
        match_config=match_config,
    )

    print(format_file_compare(report))

    if getattr(args, "json", None):
        # Explicit UTF-8 so the export does not depend on the platform locale.
        with open(args.json, "w", encoding="utf-8") as f:
            _json.dump(report.to_json(), f, indent=2)
        print(f"\nJSON exported to {args.json}")

    if getattr(args, "csv", None):
        report.to_csv(args.csv)
        print(f"CSV exported to {args.csv}")

    if getattr(args, "markdown", None):
        md = report.to_markdown()
        # Model outputs may contain non-ASCII text; the locale default
        # encoding can fail to encode it, so write UTF-8 explicitly.
        Path(args.markdown).write_text(md, encoding="utf-8")
        print(f"Markdown exported to {args.markdown}")

    if getattr(args, "junit", None):
        report.to_junit(args.junit)
        print(f"JUnit XML exported to {args.junit}")

    # Non-zero exit status signals divergence to the calling shell or CI job.
    if report.divergent_samples > 0:
        raise SystemExit(1)
26 changes: 26 additions & 0 deletions src/xpyd_acc/cli/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def register_all(sub: argparse._SubParsersAction) -> None:
_register_repl(sub)
_register_latency_regression(sub)
_register_heatmap(sub)
_register_file_compare(sub)
def _register_compare(sub):
lp = sub.add_parser("compare-logprobs", help="Compare logprobs between two endpoints")
lp.add_argument("--baseline", required=True, help="Baseline endpoint URL")
Expand Down Expand Up @@ -636,3 +637,28 @@ def _register_heatmap(sub):
help="Number of position buckets (default: 10)",
)
hm.add_argument("--json", default=None, help="Export heatmap as JSON to this path")


def _register_file_compare(sub):
fc = sub.add_parser(
"compare-files",
help="Compare pre-collected outputs from JSONL files (offline mode)",
)
fc.add_argument("--baseline", required=True, help="Path to baseline outputs JSONL")
fc.add_argument("--target", required=True, help="Path to target outputs JSONL")
fc.add_argument("--json", default=None, help="Export report as JSON")
fc.add_argument("--csv", default=None, help="Export report as CSV")
fc.add_argument("--markdown", default=None, help="Export report as Markdown")
fc.add_argument("--junit", default=None, help="Export report as JUnit XML")
fc.add_argument(
"--normalize-whitespace", action="store_true", default=False,
help="Normalize whitespace before comparison",
)
fc.add_argument(
"--ignore-case", action="store_true", default=False,
help="Case-insensitive comparison",
)
fc.add_argument(
"--numeric-tolerance", type=float, default=None,
help="Numeric tolerance for matching",
)
199 changes: 199 additions & 0 deletions src/xpyd_acc/file_compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
"""Offline file-based comparison: compare pre-collected outputs without endpoints."""

from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path

from xpyd_acc.batch_compare import BatchReport, SampleResult, compute_report
from xpyd_acc.log import get_logger
from xpyd_acc.output_compare import MatchConfig, normalized_match

logger = get_logger("file_compare")


@dataclass
class FileOutput:
    """One sample record read from a JSONL outputs file."""

    # Sample identifier.
    id: str
    # Output text recorded for the sample.
    output: str
    # Optional list of log-probabilities; None when the record carries none.
    logprobs: list[float] | None = None


def load_outputs(path: Path) -> list[FileOutput]:
    """Load outputs from a JSONL file.

    Each non-blank line must be a JSON object with at least ``id`` and
    ``output`` fields; both values are converted to ``str``. An optional
    ``logprobs`` field is passed through unchanged. Blank lines are skipped.

    The file is decoded as UTF-8 regardless of the platform locale, and a
    leading byte-order mark is ignored.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If a line is not valid JSON, is not a JSON object, or is
            missing a required field, or if the file contains no samples.
    """
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    outputs: list[FileOutput] = []
    # "utf-8-sig" reads plain UTF-8 unchanged and drops a leading BOM, which
    # would otherwise make the first line fail to parse as JSON.
    with open(path, encoding="utf-8-sig") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as exc:
                raise ValueError(
                    f"{path}:{line_num}: invalid JSON: {exc}"
                ) from exc

            if not isinstance(obj, dict):
                typ = type(obj).__name__
                raise ValueError(
                    f"{path}:{line_num}: expected JSON object, got {typ}"
                )
            for field in ("id", "output"):
                if field not in obj:
                    raise ValueError(
                        f"{path}:{line_num}: missing required field '{field}'"
                    )

            outputs.append(FileOutput(
                id=str(obj["id"]),
                output=str(obj["output"]),
                logprobs=obj.get("logprobs"),
            ))

    if not outputs:
        raise ValueError(f"No samples found in {path}")

    return outputs


def _estimate_context_length(text: str) -> int:
"""Rough token count estimate (words / 0.75)."""
words = len(text.split())
return max(1, int(words / 0.75))


def _index_by_id(outputs: list[FileOutput], label: str) -> dict[str, FileOutput]:
    """Map each sample ID to its output, warning if *outputs* repeats an ID."""
    by_id: dict[str, FileOutput] = {}
    repeated: set[str] = set()
    for item in outputs:
        if item.id in by_id:
            repeated.add(item.id)
        # Later records overwrite earlier ones (last occurrence wins).
        by_id[item.id] = item
    if repeated:
        logger.warning(
            f"{label} has {len(repeated)} duplicate sample ID(s); keeping the "
            f"last record for each: {sorted(repeated)[:5]}"
        )
    return by_id


def _first_divergence_index(
    baseline_text: str,
    target_text: str,
    match_config: MatchConfig,
) -> int:
    """Return the index of the first whitespace-delimited token that differs."""
    baseline_tokens = baseline_text.split()
    target_tokens = target_text.split()
    for index, (b_tok, t_tok) in enumerate(zip(baseline_tokens, target_tokens)):
        # Tokens that only differ in ways match_config tolerates are not a
        # divergence; otherwise the index would point at a token the
        # configured comparison considers equal.
        if b_tok != t_tok and not normalized_match(b_tok, t_tok, match_config):
            return index
    # No differing pair: one token list is a prefix of the other (or the
    # texts differ only in whitespace), so divergence starts where the
    # shorter list ends.
    return min(len(baseline_tokens), len(target_tokens))


def _logprob_at(logprobs: list[float], index: int) -> float | None:
    """Return ``logprobs[index]``, or ``None`` when *index* is out of range."""
    return logprobs[index] if index < len(logprobs) else None


def run_file_compare(
    baseline_outputs: list[FileOutput],
    target_outputs: list[FileOutput],
    *,
    match_config: MatchConfig | None = None,
    logprob_gap_threshold: float = 0.1,
) -> BatchReport:
    """Compare baseline and target outputs loaded from files.

    Matches samples by ID. Both lists must contain the same set of IDs. If a
    list repeats an ID, the last record is used and a warning is logged.

    For each non-matching pair, the first divergence index is the position of
    the first whitespace-delimited token that differs under ``match_config``.
    When both records carry logprobs, the values at that index are compared:
    a gap of at least ``logprob_gap_threshold`` is classified
    ``"likely_bug"``, a smaller gap ``"likely_uncertainty"``, and a missing
    gap ``"unknown"``. Matching pairs are classified ``"match"``.

    Args:
        baseline_outputs: Records from the baseline file.
        target_outputs: Records from the target file.
        match_config: Matching options; a default ``MatchConfig()`` is used
            when ``None``.
        logprob_gap_threshold: Gap at or above which a divergence is
            classified as a likely bug.

    Returns:
        A :class:`BatchReport` with comparison results.

    Raises:
        ValueError: If IDs don't match between baseline and target.
    """
    if match_config is None:
        match_config = MatchConfig()

    baseline_map = _index_by_id(baseline_outputs, "baseline")
    target_map = _index_by_id(target_outputs, "target")

    baseline_ids = set(baseline_map)
    target_ids = set(target_map)

    if baseline_ids != target_ids:
        only_baseline = baseline_ids - target_ids
        only_target = target_ids - baseline_ids
        parts = []
        if only_baseline:
            parts.append(f"only in baseline: {sorted(only_baseline)[:5]}")
        if only_target:
            parts.append(f"only in target: {sorted(only_target)[:5]}")
        raise ValueError(f"Sample ID mismatch: {'; '.join(parts)}")

    results: list[SampleResult] = []

    # Sorted IDs keep the report order independent of input file order.
    for sample_id in sorted(baseline_ids):
        bl = baseline_map[sample_id]
        tg = target_map[sample_id]

        exact = normalized_match(bl.output, tg.output, match_config)

        # First divergence index (whitespace-token level).
        first_div_idx: int | None = None
        if not exact:
            first_div_idx = _first_divergence_index(
                bl.output, tg.output, match_config
            )

        # Logprob gap at the divergence point, when both sides provide one.
        bl_logprob: float | None = None
        tg_logprob: float | None = None
        logprob_gap: float | None = None
        if first_div_idx is not None and bl.logprobs and tg.logprobs:
            bl_logprob = _logprob_at(bl.logprobs, first_div_idx)
            tg_logprob = _logprob_at(tg.logprobs, first_div_idx)
            if bl_logprob is not None and tg_logprob is not None:
                logprob_gap = abs(bl_logprob - tg_logprob)

        # Classification
        if exact:
            classification = "match"
        elif logprob_gap is not None:
            if logprob_gap >= logprob_gap_threshold:
                classification = "likely_bug"
            else:
                classification = "likely_uncertainty"
        else:
            classification = "unknown"

        results.append(SampleResult(
            sample_id=sample_id,
            # No prompt is available offline; a placeholder naming the
            # sample ID is stored instead.
            prompt=f"[file:{sample_id}]",
            baseline_output=bl.output,
            target_output=tg.output,
            exact_match=exact,
            first_divergence_index=first_div_idx,
            baseline_logprob_at_divergence=bl_logprob,
            target_logprob_at_divergence=tg_logprob,
            logprob_gap=logprob_gap,
            classification=classification,
            context_length=_estimate_context_length(bl.output),
        ))

    return compute_report(results, logprob_gap_threshold=logprob_gap_threshold)


def format_file_compare(report: BatchReport) -> str:
    """Render *report* as a plain-text summary for the terminal.

    The header and the four headline figures are always present. The
    classification counts appear only when truthy, and the two averages only
    when they are not ``None``. Lines are joined with newlines; there is no
    trailing newline.
    """
    summary = ["═══ File Comparison Report ═══", ""]
    summary += [
        f"Total samples: {report.total_samples}",
        f"Matching: {report.match_samples}",
        f"Divergent: {report.divergent_samples}",
        f"Divergence rate: {report.divergence_rate:.1%}",
    ]

    # Classification breakdown: omit counts that are zero/empty.
    breakdown = (
        ("Likely bugs", "likely_bugs"),
        ("Likely uncertainty", "likely_uncertainty"),
        ("Unknown", "unknown_classification"),
    )
    for label, attr_name in breakdown:
        count = getattr(report, attr_name)
        if count:
            summary.append(f"{label}: {count}")

    mean_index = report.divergence_index_mean
    if mean_index is not None:
        summary.append(f"Avg divergence idx: {mean_index:.1f}")

    mean_gap = report.logprob_gap_mean
    if mean_gap is not None:
        summary.append(f"Avg logprob gap: {mean_gap:.4f}")

    return "\n".join(summary)
Loading
Loading