From a226f336b2d956025fdc0e03009a0d7f709b252e Mon Sep 17 00:00:00 2001 From: hlin99 Date: Mon, 6 Apr 2026 15:18:01 +0800 Subject: [PATCH] feat: Benchmark Quality Gate (M117) - QualityGate class in quality_gate.py with YAML-configurable GateConfig - GateCheck, GateConfig, GateResult, GateVerdict Pydantic models - 5 composite checks: min_requests, quality_score, outlier_ratio, convergence, load_profile - Non-zero exit code on FAIL (CI/CD pipeline friendly) - CLI quality-gate subcommand with table + JSON output - Programmatic evaluate_quality_gate() API - 23 new tests Closes #257 --- ROADMAP.md | 19 +- docs/iterations/current.md | 3 +- src/xpyd_plan/__init__.py | 19 ++ src/xpyd_plan/cli/_main.py | 6 + src/xpyd_plan/cli/_quality_gate.py | 120 ++++++++++ src/xpyd_plan/quality_gate.py | 326 +++++++++++++++++++++++++++ tests/test_quality_gate.py | 351 +++++++++++++++++++++++++++++ 7 files changed, 840 insertions(+), 4 deletions(-) create mode 100644 src/xpyd_plan/cli/_quality_gate.py create mode 100644 src/xpyd_plan/quality_gate.py create mode 100644 tests/test_quality_gate.py diff --git a/ROADMAP.md b/ROADMAP.md index e67b3a9..4496cb9 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -1557,9 +1557,9 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be - Programmatic `optimize_workload_mix()` API - 32 new tests -### M116 🔄 GPU Hour Calculator +### M116 ✅ GPU Hour Calculator -*In progress* +*Completed — PR #256* - `GPUHourCalculator` class in `gpu_hours.py` - `TrafficProfile`, `HourlyTraffic`, `GPUHourReport`, `ScalingSavings`, `HourBreakdown` Pydantic models @@ -1568,4 +1568,17 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be - Auto-scaling savings estimation (fixed vs dynamic provisioning) - CLI `gpu-hours` subcommand with `--benchmark`, `--traffic-profile`, `--gpu-cost`, table + JSON output - Programmatic `calculate_gpu_hours()` API -- ~24 new tests +- 24 new tests + +### M117 🔄 Benchmark Quality Gate + +*In 
progress* + +- `QualityGate` class in `quality_gate.py` +- `GateConfig`, `GateCheck`, `GateResult`, `GateVerdict` Pydantic models +- Composite pass/fail gate combining: data validation, percentile convergence, load profile stability, outlier ratio, minimum request count +- YAML-configurable gate rules (thresholds per check) +- Non-zero exit code on FAIL (CI/CD pipeline integration) +- CLI `quality-gate` subcommand with `--benchmark`, `--config`, table + JSON output +- Programmatic `evaluate_quality_gate()` API +- ~22 new tests diff --git a/docs/iterations/current.md b/docs/iterations/current.md index c3d1dd0..2fd08d7 100644 --- a/docs/iterations/current.md +++ b/docs/iterations/current.md @@ -68,4 +68,5 @@ The project has completed **110 milestones**, covering the full feature chain fr | 7 | 2026-04-06 | M113 TensorRT-LLM Benchmark Command Generator | ✅ merged | PR #250, both bots approved | | 8 | 2026-04-06 | M114 Multi-Backend Comparison Report | ✅ merged | PR #252, both bots approved | | 9 | 2026-04-06 | M115 Workload Mix Optimizer | ✅ merged | PR #254, both bots approved | -| 10 | 2026-04-06 | M116 GPU Hour Calculator | ⏳ pending review | Issue #255 | +| 10 | 2026-04-06 | M116 GPU Hour Calculator | ✅ merged | PR #256, both bots approved | +| 11 | 2026-04-06 | M117 Benchmark Quality Gate | ⏳ pending review | PR TBD | diff --git a/src/xpyd_plan/__init__.py b/src/xpyd_plan/__init__.py index 96d3f46..158a9ff 100644 --- a/src/xpyd_plan/__init__.py +++ b/src/xpyd_plan/__init__.py @@ -1554,3 +1554,22 @@ "TrafficProfile", "calculate_gpu_hours", ] +from xpyd_plan.quality_gate import ( # noqa: E402 + GateCheck, + GateConfig, + GateResult, + GateVerdict, + QualityGate, + evaluate_quality_gate, + load_gate_config, +) + +__all__ += [ + "GateCheck", + "GateConfig", + "GateResult", + "GateVerdict", + "QualityGate", + "evaluate_quality_gate", + "load_gate_config", +] diff --git a/src/xpyd_plan/cli/_main.py b/src/xpyd_plan/cli/_main.py index 3dd6226..330409a 100644 --- 
a/src/xpyd_plan/cli/_main.py +++ b/src/xpyd_plan/cli/_main.py @@ -64,6 +64,7 @@ from xpyd_plan.cli._pipeline import _cmd_pipeline from xpyd_plan.cli._plan_benchmarks import _cmd_plan_benchmarks, add_plan_benchmarks_parser from xpyd_plan.cli._qps_curve import add_qps_curve_parser +from xpyd_plan.cli._quality_gate import register as register_quality_gate from xpyd_plan.cli._queue import add_queue_parser from xpyd_plan.cli._ranking import _cmd_ranking, add_ranking_parser from xpyd_plan.cli._rate_limit import add_rate_limit_parser @@ -970,6 +971,7 @@ def main(argv: list[str] | None = None) -> None: register_trtllm_commands(subparsers) register_compare_backends(subparsers) register_gpu_hours(subparsers) + register_quality_gate(subparsers) register_workload_mix(subparsers) add_rate_limit_parser(subparsers) add_batch_analysis_parser(subparsers) @@ -1332,6 +1334,10 @@ def main(argv: list[str] | None = None) -> None: from xpyd_plan.cli._gpu_hours import _run as _cmd_gpu_hours _cmd_gpu_hours(args) + elif args.command == "quality-gate": + from xpyd_plan.cli._quality_gate import _run as _cmd_quality_gate + + _cmd_quality_gate(args) else: parser.print_help() sys.exit(1) diff --git a/src/xpyd_plan/cli/_quality_gate.py b/src/xpyd_plan/cli/_quality_gate.py new file mode 100644 index 0000000..7cdcdfa --- /dev/null +++ b/src/xpyd_plan/cli/_quality_gate.py @@ -0,0 +1,120 @@ +"""CLI subcommand for benchmark quality gate.""" + +from __future__ import annotations + +import argparse +import json +import sys +from typing import Any + +from rich.console import Console +from rich.table import Table + +from xpyd_plan.bench_adapter import load_benchmark_auto +from xpyd_plan.quality_gate import GateConfig, GateVerdict, QualityGate, load_gate_config + + +def register(subparsers: Any) -> None: + """Register the quality-gate subcommand.""" + parser = subparsers.add_parser( + "quality-gate", + help="Composite pass/fail gate for benchmark quality (CI/CD friendly)", + ) + parser.add_argument( + 
"--benchmark", + required=True, + help="Path to benchmark JSON file", + ) + parser.add_argument( + "--config", + default=None, + help="Path to YAML gate config file (overrides other flags)", + ) + parser.add_argument( + "--min-requests", + type=int, + default=100, + help="Minimum request count (default: 100)", + ) + parser.add_argument( + "--min-quality-score", + type=float, + default=0.7, + help="Minimum data quality score 0-1 (default: 0.7)", + ) + parser.add_argument( + "--max-outlier-pct", + type=float, + default=10.0, + help="Maximum outlier percentage (default: 10.0)", + ) + parser.add_argument( + "--require-stable-convergence", + action="store_true", + default=True, + help="Require stable percentile convergence (default: True)", + ) + parser.add_argument( + "--no-require-stable-convergence", + action="store_false", + dest="require_stable_convergence", + help="Allow marginal convergence", + ) + parser.add_argument( + "--output-format", + choices=["table", "json"], + default="table", + help="Output format (default: table)", + ) + + +def _run(args: argparse.Namespace) -> None: + """Execute the quality-gate subcommand.""" + console = Console() + + data = load_benchmark_auto(args.benchmark) + + if args.config: + config = load_gate_config(args.config) + else: + config = GateConfig( + min_requests=args.min_requests, + min_quality_score=args.min_quality_score, + max_outlier_pct=args.max_outlier_pct, + require_stable_convergence=args.require_stable_convergence, + ) + + gate = QualityGate(config) + result = gate.evaluate(data) + + output_format = getattr(args, "output_format", "table") + if output_format == "json": + json.dump(result.model_dump(), sys.stdout, indent=2) + sys.stdout.write("\n") + if result.verdict == GateVerdict.FAIL: + sys.exit(1) + return + + verdict_style = { + GateVerdict.PASS: ("[green]PASS[/green]", "✅"), + GateVerdict.WARN: ("[yellow]WARN[/yellow]", "⚠️"), + GateVerdict.FAIL: ("[red]FAIL[/red]", "❌"), + } + + styled, emoji = 
verdict_style[result.verdict] + console.print(f"\n{emoji} Quality Gate: {styled} ({result.request_count} requests)\n") + + table = Table(title="Gate Checks") + table.add_column("Check", justify="left") + table.add_column("Verdict", justify="center") + table.add_column("Detail", justify="left") + table.add_column("Threshold", justify="left") + + for check in result.checks: + s, e = verdict_style[check.verdict] + table.add_row(check.name, f"{e} {s}", check.detail, check.threshold or "—") + + console.print(table) + + if result.verdict == GateVerdict.FAIL: + sys.exit(1) diff --git a/src/xpyd_plan/quality_gate.py b/src/xpyd_plan/quality_gate.py new file mode 100644 index 0000000..9f98c7f --- /dev/null +++ b/src/xpyd_plan/quality_gate.py @@ -0,0 +1,326 @@ +"""Benchmark Quality Gate — composite pass/fail gate for CI/CD pipelines. + +Combines multiple quality checks (validation, convergence, load profile, +outlier ratio, minimum request count) into a single gate verdict with +YAML-configurable thresholds. 
+""" + +from __future__ import annotations + +from enum import Enum +from typing import Any, Optional + +import yaml +from pydantic import BaseModel, Field + +from xpyd_plan.bench_adapter import load_benchmark_auto +from xpyd_plan.benchmark_models import BenchmarkData +from xpyd_plan.convergence import ConvergenceAnalyzer, StabilityStatus +from xpyd_plan.load_profile import LoadProfileClassifier +from xpyd_plan.outlier_impact import OutlierImpactAnalyzer +from xpyd_plan.validator import DataValidator + + +class GateVerdict(str, Enum): + """Overall gate verdict.""" + + PASS = "pass" + WARN = "warn" + FAIL = "fail" + + +class GateCheck(BaseModel): + """Result of a single gate check.""" + + name: str = Field(description="Check name") + verdict: GateVerdict = Field(description="Check verdict") + detail: str = Field(description="Human-readable detail") + threshold: Optional[str] = Field( + default=None, description="Threshold that was evaluated" + ) + + +class GateConfig(BaseModel): + """Configuration for quality gate thresholds.""" + + min_requests: int = Field( + default=100, ge=1, description="Minimum number of requests required" + ) + min_quality_score: float = Field( + default=0.7, ge=0.0, le=1.0, description="Minimum data quality score" + ) + max_outlier_pct: float = Field( + default=10.0, ge=0.0, le=100.0, description="Maximum outlier percentage" + ) + require_stable_convergence: bool = Field( + default=True, description="Require stable percentile convergence" + ) + allowed_load_profiles: list[str] = Field( + default_factory=lambda: ["steady_state", "ramp_up", "ramp_down"], + description="Allowed load profile types", + ) + convergence_steps: int = Field(default=10, ge=2, description="Convergence window steps") + convergence_threshold: float = Field( + default=0.05, ge=0.0, description="CV threshold for convergence" + ) + iqr_multiplier: float = Field( + default=1.5, ge=0.0, description="IQR multiplier for outlier detection" + ) + load_profile_window: float = 
Field(
+        default=5.0, ge=0.1, description="Window size for load profile (seconds)"
+    )
+
+
+class GateResult(BaseModel):
+    """Complete quality gate result."""
+
+    verdict: GateVerdict = Field(description="Overall gate verdict")
+    checks: list[GateCheck] = Field(description="Per-check results")
+    request_count: int = Field(description="Total requests in benchmark")
+    config: GateConfig = Field(description="Config used for evaluation")
+    passed: bool = Field(description="True if verdict is PASS")
+
+
+def load_gate_config(path: str) -> GateConfig:
+    """Load gate config from a YAML file.
+
+    An empty document (``yaml.safe_load`` returns ``None``) yields the
+    default ``GateConfig``. Keys that are not ``GateConfig`` fields are
+    reported with a warning, because the model would otherwise drop them
+    silently and a misspelt threshold would fall back to its default.
+
+    Args:
+        path: Path to YAML config file.
+
+    Returns:
+        Parsed GateConfig.
+
+    Raises:
+        TypeError: If the top level of the YAML document is not a mapping.
+    """
+    import warnings
+
+    # Explicit encoding: the platform default is not UTF-8 everywhere.
+    with open(path, encoding="utf-8") as f:
+        raw = yaml.safe_load(f)
+    if raw is None:
+        return GateConfig()
+    if not isinstance(raw, dict):
+        # ``GateConfig(**raw)`` would raise the same exception type, but with
+        # a message that never mentions the config file.
+        raise TypeError(
+            f"Gate config {path!r} must be a YAML mapping, got {type(raw).__name__}"
+        )
+
+    unknown = sorted(str(key) for key in raw if key not in GateConfig.model_fields)
+    if unknown:
+        warnings.warn(
+            f"Ignoring unknown gate config keys in {path!r}: {', '.join(unknown)}",
+            stacklevel=2,
+        )
+    return GateConfig(**raw)
+
+
+class QualityGate:
+    """Evaluate benchmark data against configurable quality checks.
+
+    Checks:
+    1. Minimum request count
+    2. Data validation quality score
+    3. Outlier percentage
+    4. Percentile convergence stability
+    5. 
Load profile type + """ + + def __init__(self, config: GateConfig | None = None) -> None: + self._config = config or GateConfig() + + @property + def config(self) -> GateConfig: + """Return current gate configuration.""" + return self._config + + def evaluate(self, data: BenchmarkData) -> GateResult: + """Evaluate all gate checks and return unified result.""" + checks: list[GateCheck] = [] + + checks.append(self._check_min_requests(data)) + checks.append(self._check_validation(data)) + checks.append(self._check_outlier_ratio(data)) + checks.append(self._check_convergence(data)) + checks.append(self._check_load_profile(data)) + + verdicts = [c.verdict for c in checks] + if GateVerdict.FAIL in verdicts: + overall = GateVerdict.FAIL + elif GateVerdict.WARN in verdicts: + overall = GateVerdict.WARN + else: + overall = GateVerdict.PASS + + return GateResult( + verdict=overall, + checks=checks, + request_count=len(data.requests), + config=self._config, + passed=overall == GateVerdict.PASS, + ) + + def _check_min_requests(self, data: BenchmarkData) -> GateCheck: + """Check minimum request count.""" + count = len(data.requests) + threshold = self._config.min_requests + if count >= threshold: + return GateCheck( + name="min_requests", + verdict=GateVerdict.PASS, + detail=f"{count} requests (≥{threshold})", + threshold=str(threshold), + ) + return GateCheck( + name="min_requests", + verdict=GateVerdict.FAIL, + detail=f"{count} requests (required ≥{threshold})", + threshold=str(threshold), + ) + + def _check_validation(self, data: BenchmarkData) -> GateCheck: + """Check data quality score.""" + validator = DataValidator() + result = validator.validate(data) + score = result.quality.overall + threshold = self._config.min_quality_score + + if score >= threshold: + return GateCheck( + name="quality_score", + verdict=GateVerdict.PASS, + detail=f"Quality score {score:.2f} (≥{threshold:.2f})", + threshold=str(threshold), + ) + # Warn if close, fail if far + if score >= threshold * 
0.8: + return GateCheck( + name="quality_score", + verdict=GateVerdict.WARN, + detail=f"Quality score {score:.2f} (threshold {threshold:.2f})", + threshold=str(threshold), + ) + return GateCheck( + name="quality_score", + verdict=GateVerdict.FAIL, + detail=f"Quality score {score:.2f} (required ≥{threshold:.2f})", + threshold=str(threshold), + ) + + def _check_outlier_ratio(self, data: BenchmarkData) -> GateCheck: + """Check outlier percentage.""" + analyzer = OutlierImpactAnalyzer() + report = analyzer.analyze(data, iqr_multiplier=self._config.iqr_multiplier) + + pct = ( + report.outlier_count / report.total_requests * 100 + if report.total_requests > 0 + else 0.0 + ) + max_pct = self._config.max_outlier_pct + + if pct <= max_pct: + return GateCheck( + name="outlier_ratio", + verdict=GateVerdict.PASS, + detail=f"Outlier ratio {pct:.1f}% (≤{max_pct:.1f}%)", + threshold=f"{max_pct}%", + ) + if pct <= max_pct * 1.5: + return GateCheck( + name="outlier_ratio", + verdict=GateVerdict.WARN, + detail=f"Outlier ratio {pct:.1f}% (threshold {max_pct:.1f}%)", + threshold=f"{max_pct}%", + ) + return GateCheck( + name="outlier_ratio", + verdict=GateVerdict.FAIL, + detail=f"Outlier ratio {pct:.1f}% (max {max_pct:.1f}%)", + threshold=f"{max_pct}%", + ) + + def _check_convergence(self, data: BenchmarkData) -> GateCheck: + """Check percentile convergence stability.""" + analyzer = ConvergenceAnalyzer(data) + report = analyzer.analyze( + steps=self._config.convergence_steps, + threshold=self._config.convergence_threshold, + ) + + if report.overall_status == StabilityStatus.STABLE: + return GateCheck( + name="convergence", + verdict=GateVerdict.PASS, + detail="Percentile convergence: stable", + ) + if report.overall_status == StabilityStatus.MARGINAL: + unstable = [ + m.field + for m in report.metrics + if m.status != StabilityStatus.STABLE + ] + verdict = ( + GateVerdict.WARN + if not self._config.require_stable_convergence + else GateVerdict.FAIL + ) + return GateCheck( + 
name="convergence", + verdict=verdict, + detail=f"Percentile convergence: marginal ({', '.join(unstable)})", + ) + + # UNSTABLE + unstable = [ + m.field + for m in report.metrics + if m.status == StabilityStatus.UNSTABLE + ] + return GateCheck( + name="convergence", + verdict=GateVerdict.FAIL, + detail=f"Percentile convergence: unstable ({', '.join(unstable)})", + ) + + def _check_load_profile(self, data: BenchmarkData) -> GateCheck: + """Check load profile type against allowed list.""" + classifier = LoadProfileClassifier(data) + report = classifier.classify(window_size=self._config.load_profile_window) + profile_type = report.profile.profile_type + allowed = self._config.allowed_load_profiles + + if profile_type.value in allowed: + return GateCheck( + name="load_profile", + verdict=GateVerdict.PASS, + detail=f"Load profile: {profile_type.value}", + threshold=f"allowed: {', '.join(allowed)}", + ) + return GateCheck( + name="load_profile", + verdict=GateVerdict.WARN, + detail=f"Load profile: {profile_type.value} (not in allowed: {', '.join(allowed)})", + threshold=f"allowed: {', '.join(allowed)}", + ) + + +def evaluate_quality_gate( + benchmark_path: str, + *, + config_path: str | None = None, + min_requests: int = 100, + min_quality_score: float = 0.7, + max_outlier_pct: float = 10.0, + require_stable_convergence: bool = True, +) -> dict[str, Any]: + """Programmatic API for benchmark quality gate. + + Args: + benchmark_path: Path to benchmark JSON file. + config_path: Optional YAML config file path. If provided, overrides + other keyword arguments. + min_requests: Minimum request count. + min_quality_score: Minimum data quality score (0-1). + max_outlier_pct: Maximum allowed outlier percentage. + require_stable_convergence: Whether to require stable convergence. + + Returns: + Dict with gate result fields. 
+ """ + if config_path: + config = load_gate_config(config_path) + else: + config = GateConfig( + min_requests=min_requests, + min_quality_score=min_quality_score, + max_outlier_pct=max_outlier_pct, + require_stable_convergence=require_stable_convergence, + ) + + data = load_benchmark_auto(benchmark_path) + gate = QualityGate(config) + result = gate.evaluate(data) + return result.model_dump() diff --git a/tests/test_quality_gate.py b/tests/test_quality_gate.py new file mode 100644 index 0000000..ebc2846 --- /dev/null +++ b/tests/test_quality_gate.py @@ -0,0 +1,351 @@ +"""Tests for Benchmark Quality Gate (M117).""" + +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +import pytest +import yaml + +from xpyd_plan.benchmark_models import ( + BenchmarkData, + BenchmarkMetadata, + BenchmarkRequest, +) +from xpyd_plan.quality_gate import ( + GateCheck, + GateConfig, + GateResult, + GateVerdict, + QualityGate, + evaluate_quality_gate, + load_gate_config, +) + + +def _make_data( + n: int = 200, + measured_qps: float = 100.0, + num_prefill: int = 2, + num_decode: int = 2, +) -> BenchmarkData: + """Create benchmark data with n requests.""" + requests = [ + BenchmarkRequest( + request_id=f"r{i}", + prompt_tokens=100 + (i % 50), + output_tokens=50 + (i % 30), + ttft_ms=20.0 + (i % 10), + tpot_ms=10.0 + (i % 5), + total_latency_ms=30.0 + (i % 15), + timestamp=float(i), + ) + for i in range(n) + ] + return BenchmarkData( + metadata=BenchmarkMetadata( + num_prefill_instances=num_prefill, + num_decode_instances=num_decode, + total_instances=num_prefill + num_decode, + measured_qps=measured_qps, + ), + requests=requests, + ) + + +def _save_benchmark(data: BenchmarkData, path: Path) -> None: + """Save benchmark data to JSON.""" + path.write_text(json.dumps(data.model_dump(), default=str)) + + +# --- Model tests --- + + +class TestGateConfig: + """Test GateConfig model.""" + + def test_defaults(self) -> None: + config = 
GateConfig()
+        assert config.min_requests == 100
+        assert config.min_quality_score == 0.7
+        assert config.max_outlier_pct == 10.0
+        assert config.require_stable_convergence is True
+        assert "steady_state" in config.allowed_load_profiles
+
+    def test_custom_values(self) -> None:
+        config = GateConfig(min_requests=50, min_quality_score=0.9)
+        assert config.min_requests == 50
+        assert config.min_quality_score == 0.9
+
+    def test_validation(self) -> None:
+        """Out-of-range thresholds are rejected when the model is built."""
+        # pydantic's ValidationError derives from ValueError. Asserting on it,
+        # rather than on bare Exception, keeps an unrelated failure such as a
+        # NameError or TypeError from satisfying the test.
+        with pytest.raises(ValueError):
+            GateConfig(min_requests=0)
+        with pytest.raises(ValueError):
+            GateConfig(min_quality_score=1.5)
+
+
+class TestGateVerdict:
+    """Test GateVerdict enum."""
+
+    def test_values(self) -> None:
+        assert GateVerdict.PASS == "pass"
+        assert GateVerdict.WARN == "warn"
+        assert GateVerdict.FAIL == "fail"
+
+
+class TestGateCheck:
+    """Test GateCheck model."""
+
+    def test_basic(self) -> None:
+        check = GateCheck(
+            name="test", verdict=GateVerdict.PASS, detail="ok"
+        )
+        assert check.name == "test"
+        assert check.verdict == GateVerdict.PASS
+        assert check.threshold is None
+
+
+class TestGateResult:
+    """Test GateResult model."""
+
+    def test_passed_flag(self) -> None:
+        result = GateResult(
+            verdict=GateVerdict.PASS,
+            checks=[],
+            request_count=100,
+            config=GateConfig(),
+            passed=True,
+        )
+        assert result.passed is True
+
+    def test_failed_flag(self) -> None:
+        result = GateResult(
+            verdict=GateVerdict.FAIL,
+            checks=[],
+            request_count=10,
+            config=GateConfig(),
+            passed=False,
+        )
+        assert result.passed is False
+
+
+# --- Load config tests ---
+
+
+class TestLoadGateConfig:
+    """Test YAML config loading."""
+
+    def test_load_yaml(self, tmp_path: Path) -> None:
+        cfg = {"min_requests": 50, "max_outlier_pct": 5.0}
+        path = tmp_path / "gate.yaml"
+        path.write_text(yaml.dump(cfg))
+        config = load_gate_config(str(path))
+        assert config.min_requests == 50
+        assert config.max_outlier_pct == 5.0
+
+    def test_load_empty_yaml(self, tmp_path: Path) -> None:
+        path = tmp_path / "empty.yaml"
+
path.write_text("") + config = load_gate_config(str(path)) + assert config.min_requests == 100 # default + + +# --- QualityGate tests --- + + +class TestQualityGate: + """Test QualityGate evaluator.""" + + def test_pass_with_good_data(self) -> None: + data = _make_data(n=200) + gate = QualityGate() + result = gate.evaluate(data) + assert result.request_count == 200 + assert len(result.checks) == 5 + # Should have check names + names = [c.name for c in result.checks] + assert "min_requests" in names + assert "quality_score" in names + assert "outlier_ratio" in names + assert "convergence" in names + assert "load_profile" in names + + def test_fail_min_requests(self) -> None: + data = _make_data(n=10) + config = GateConfig(min_requests=100) + gate = QualityGate(config) + result = gate.evaluate(data) + min_req_check = [c for c in result.checks if c.name == "min_requests"][0] + assert min_req_check.verdict == GateVerdict.FAIL + + def test_pass_min_requests(self) -> None: + data = _make_data(n=200) + config = GateConfig(min_requests=100) + gate = QualityGate(config) + result = gate.evaluate(data) + min_req_check = [c for c in result.checks if c.name == "min_requests"][0] + assert min_req_check.verdict == GateVerdict.PASS + + def test_fail_propagates_to_overall(self) -> None: + data = _make_data(n=5) + config = GateConfig(min_requests=100) + gate = QualityGate(config) + result = gate.evaluate(data) + assert result.verdict == GateVerdict.FAIL + assert result.passed is False + + def test_config_property(self) -> None: + config = GateConfig(min_requests=42) + gate = QualityGate(config) + assert gate.config.min_requests == 42 + + def test_default_config(self) -> None: + gate = QualityGate() + assert gate.config.min_requests == 100 + + def test_custom_gate_config(self) -> None: + data = _make_data(n=200) + config = GateConfig( + min_requests=10, + max_outlier_pct=50.0, + require_stable_convergence=False, + ) + gate = QualityGate(config) + result = gate.evaluate(data) + # With 
relaxed config, should likely pass
+        assert isinstance(result.verdict, GateVerdict)
+
+    def test_result_serialization(self) -> None:
+        data = _make_data(n=200)
+        gate = QualityGate()
+        result = gate.evaluate(data)
+        d = result.model_dump()
+        assert "verdict" in d
+        assert "checks" in d
+        assert "config" in d
+        assert "passed" in d
+
+
+# --- Programmatic API tests ---
+
+
+class TestEvaluateQualityGate:
+    """Test the evaluate_quality_gate() API."""
+
+    def test_basic(self, tmp_path: Path) -> None:
+        data = _make_data(n=200)
+        path = tmp_path / "bench.json"
+        _save_benchmark(data, path)
+        result = evaluate_quality_gate(str(path))
+        assert "verdict" in result
+        assert "checks" in result
+        assert "passed" in result
+
+    def test_with_yaml_config(self, tmp_path: Path) -> None:
+        data = _make_data(n=200)
+        bench_path = tmp_path / "bench.json"
+        _save_benchmark(data, bench_path)
+
+        cfg = {"min_requests": 10, "max_outlier_pct": 50.0}
+        cfg_path = tmp_path / "gate.yaml"
+        cfg_path.write_text(yaml.dump(cfg))
+
+        result = evaluate_quality_gate(str(bench_path), config_path=str(cfg_path))
+        assert "verdict" in result
+
+    def test_custom_kwargs(self, tmp_path: Path) -> None:
+        data = _make_data(n=200)
+        path = tmp_path / "bench.json"
+        _save_benchmark(data, path)
+        result = evaluate_quality_gate(
+            str(path), min_requests=10, max_outlier_pct=50.0
+        )
+        assert "verdict" in result
+
+
+# --- CLI tests ---
+
+
+class TestQualityGateCLI:
+    """Test CLI quality-gate subcommand."""
+
+    def test_json_output(self, tmp_path: Path) -> None:
+        """JSON mode always emits a document and exits 1 only on FAIL."""
+        data = _make_data(n=200)
+        path = tmp_path / "bench.json"
+        _save_benchmark(data, path)
+
+        result = subprocess.run(
+            [
+                sys.executable,
+                "-m",
+                "xpyd_plan.cli",
+                "quality-gate",
+                "--benchmark",
+                str(path),
+                "--output-format",
+                "json",
+                "--min-requests",
+                "10",
+            ],
+            capture_output=True,
+            text=True,
+        )
+        # The gate may PASS/WARN (exit 0) or FAIL (exit 1) on synthetic data,
+        # but it must always print JSON. Parsing unconditionally makes a crash
+        # (empty stdout) fail the test instead of skipping the assertions.
+        assert result.returncode in (0, 1), result.stderr
+        parsed = json.loads(result.stdout)
+        assert "verdict" in parsed
+        assert "checks" in parsed
+        # CI contract: a non-zero exit code if and only if the verdict is FAIL.
+        assert (result.returncode == 1) == (parsed["verdict"] == "fail")
+
+    def test_table_output(self, tmp_path: Path) -> None:
+        """Table mode prints the verdict banner whatever the verdict is."""
+        data = _make_data(n=200)
+        path = tmp_path / "bench.json"
+        _save_benchmark(data, path)
+
+        result = subprocess.run(
+            [
+                sys.executable,
+                "-m",
+                "xpyd_plan.cli",
+                "quality-gate",
+                "--benchmark",
+                str(path),
+                "--min-requests",
+                "10",
+            ],
+            capture_output=True,
+            text=True,
+        )
+        # Both conditions are required: an uncaught traceback also exits 1,
+        # so the exit code alone cannot tell a FAIL verdict from a crash.
+        assert result.returncode in (0, 1), result.stderr
+        assert "Quality Gate" in result.stdout
+
+
+# --- Public imports test ---
+
+
+class TestPublicImports:
+    """Test that all public symbols are importable."""
+
+    def test_imports(self) -> None:
+        from xpyd_plan import (
+            GateCheck,
+            GateConfig,
+            GateResult,
+            GateVerdict,
+            QualityGate,
+            evaluate_quality_gate,
+            load_gate_config,
+        )
+
+        assert GateCheck is not None
+        assert GateConfig is not None
+        assert GateResult is not None
+        assert GateVerdict is not None
+        assert QualityGate is not None
+        assert evaluate_quality_gate is not None
+        assert load_gate_config is not None