From a226f336b2d956025fdc0e03009a0d7f709b252e Mon Sep 17 00:00:00 2001
From: hlin99 <tony.lin@intel.com>
Date: Mon, 6 Apr 2026 15:18:01 +0800
Subject: [PATCH] feat: Benchmark Quality Gate (M117)

- QualityGate class in quality_gate.py with YAML-configurable GateConfig
- GateCheck, GateConfig, GateResult, GateVerdict Pydantic models
- 5 composite checks: min_requests, quality_score, outlier_ratio, convergence, load_profile
- Non-zero exit code on FAIL (CI/CD pipeline friendly)
- CLI quality-gate subcommand with table + JSON output
- Programmatic evaluate_quality_gate() API
- 23 new tests

Closes #257
---
 ROADMAP.md                         |  19 +-
 docs/iterations/current.md         |   3 +-
 src/xpyd_plan/__init__.py          |  19 ++
 src/xpyd_plan/cli/_main.py         |   6 +
 src/xpyd_plan/cli/_quality_gate.py | 120 ++++++++++
 src/xpyd_plan/quality_gate.py      | 326 +++++++++++++++++++++++++++
 tests/test_quality_gate.py         | 351 +++++++++++++++++++++++++++++
 7 files changed, 840 insertions(+), 4 deletions(-)
 create mode 100644 src/xpyd_plan/cli/_quality_gate.py
 create mode 100644 src/xpyd_plan/quality_gate.py
 create mode 100644 tests/test_quality_gate.py

diff --git a/ROADMAP.md b/ROADMAP.md
index e67b3a9..4496cb9 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -1557,9 +1557,9 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be
 - Programmatic `optimize_workload_mix()` API
 - 32 new tests
 
-### M116 🔄 GPU Hour Calculator
+### M116 ✅ GPU Hour Calculator
 
-*In progress*
+*Completed — PR #256*
 
 - `GPUHourCalculator` class in `gpu_hours.py`
 - `TrafficProfile`, `HourlyTraffic`, `GPUHourReport`, `ScalingSavings`, `HourBreakdown` Pydantic models
@@ -1568,4 +1568,17 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be
 - Auto-scaling savings estimation (fixed vs dynamic provisioning)
 - CLI `gpu-hours` subcommand with `--benchmark`, `--traffic-profile`, `--gpu-cost`, table + JSON output
 - Programmatic `calculate_gpu_hours()` API
-- ~24 new tests
+- 24 new tests
+
+### M117 🔄 Benchmark Quality Gate
+
+*In progress*
+
+- `QualityGate` class in `quality_gate.py`
+- `GateConfig`, `GateCheck`, `GateResult`, `GateVerdict` Pydantic models
+- Composite pass/fail gate combining: data validation, percentile convergence, load profile stability, outlier ratio, minimum request count
+- YAML-configurable gate rules (thresholds per check)
+- Non-zero exit code on FAIL (CI/CD pipeline integration)
+- CLI `quality-gate` subcommand with `--benchmark`, `--config`, table + JSON output
+- Programmatic `evaluate_quality_gate()` API
+- 23 new tests
diff --git a/docs/iterations/current.md b/docs/iterations/current.md
index c3d1dd0..2fd08d7 100644
--- a/docs/iterations/current.md
+++ b/docs/iterations/current.md
@@ -68,4 +68,5 @@ The project has completed **110 milestones**, covering the full feature chain fr
 | 7 | 2026-04-06 | M113 TensorRT-LLM Benchmark Command Generator | ✅ merged | PR #250, both bots approved |
 | 8 | 2026-04-06 | M114 Multi-Backend Comparison Report | ✅ merged | PR #252, both bots approved |
 | 9 | 2026-04-06 | M115 Workload Mix Optimizer | ✅ merged | PR #254, both bots approved |
-| 10 | 2026-04-06 | M116 GPU Hour Calculator | ⏳ pending review | Issue #255 |
+| 10 | 2026-04-06 | M116 GPU Hour Calculator | ✅ merged | PR #256, both bots approved |
+| 11 | 2026-04-06 | M117 Benchmark Quality Gate | ⏳ pending review | PR TBD |
diff --git a/src/xpyd_plan/__init__.py b/src/xpyd_plan/__init__.py
index 96d3f46..158a9ff 100644
--- a/src/xpyd_plan/__init__.py
+++ b/src/xpyd_plan/__init__.py
@@ -1554,3 +1554,22 @@
     "TrafficProfile",
     "calculate_gpu_hours",
 ]
+from xpyd_plan.quality_gate import (  # noqa: E402
+    GateCheck,
+    GateConfig,
+    GateResult,
+    GateVerdict,
+    QualityGate,
+    evaluate_quality_gate,
+    load_gate_config,
+)
+
+__all__ += [
+    "GateCheck",
+    "GateConfig",
+    "GateResult",
+    "GateVerdict",
+    "QualityGate",
+    "evaluate_quality_gate",
+    "load_gate_config",
+]
diff --git a/src/xpyd_plan/cli/_main.py b/src/xpyd_plan/cli/_main.py
index 3dd6226..330409a 100644
--- a/src/xpyd_plan/cli/_main.py
+++ b/src/xpyd_plan/cli/_main.py
@@ -64,6 +64,7 @@
 from xpyd_plan.cli._pipeline import _cmd_pipeline
 from xpyd_plan.cli._plan_benchmarks import _cmd_plan_benchmarks, add_plan_benchmarks_parser
 from xpyd_plan.cli._qps_curve import add_qps_curve_parser
+from xpyd_plan.cli._quality_gate import register as register_quality_gate
 from xpyd_plan.cli._queue import add_queue_parser
 from xpyd_plan.cli._ranking import _cmd_ranking, add_ranking_parser
 from xpyd_plan.cli._rate_limit import add_rate_limit_parser
@@ -970,6 +971,7 @@ def main(argv: list[str] | None = None) -> None:
     register_trtllm_commands(subparsers)
     register_compare_backends(subparsers)
     register_gpu_hours(subparsers)
+    register_quality_gate(subparsers)
     register_workload_mix(subparsers)
     add_rate_limit_parser(subparsers)
     add_batch_analysis_parser(subparsers)
@@ -1332,6 +1334,10 @@ def main(argv: list[str] | None = None) -> None:
         from xpyd_plan.cli._gpu_hours import _run as _cmd_gpu_hours
 
         _cmd_gpu_hours(args)
+    elif args.command == "quality-gate":
+        from xpyd_plan.cli._quality_gate import _run as _cmd_quality_gate
+
+        _cmd_quality_gate(args)
     else:
         parser.print_help()
         sys.exit(1)
diff --git a/src/xpyd_plan/cli/_quality_gate.py b/src/xpyd_plan/cli/_quality_gate.py
new file mode 100644
index 0000000..7cdcdfa
--- /dev/null
+++ b/src/xpyd_plan/cli/_quality_gate.py
@@ -0,0 +1,120 @@
+"""CLI subcommand for benchmark quality gate."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from typing import Any
+
+from rich.console import Console
+from rich.table import Table
+
+from xpyd_plan.bench_adapter import load_benchmark_auto
+from xpyd_plan.quality_gate import GateConfig, GateVerdict, QualityGate, load_gate_config
+
+
+def register(subparsers: Any) -> None:
+    """Register the ``quality-gate`` subcommand and its options on *subparsers*."""
+    parser = subparsers.add_parser(
+        "quality-gate",
+        help="Composite pass/fail gate for benchmark quality (CI/CD friendly)",
+    )
+    parser.add_argument(
+        "--benchmark",
+        required=True,
+        help="Path to benchmark JSON file",
+    )
+    parser.add_argument(
+        "--config",
+        default=None,
+        help="Path to YAML gate config file (overrides other flags)",
+    )
+    parser.add_argument(
+        "--min-requests",
+        type=int,
+        default=100,
+        help="Minimum request count (default: 100)",
+    )
+    parser.add_argument(
+        "--min-quality-score",
+        type=float,
+        default=0.7,
+        help="Minimum data quality score 0-1 (default: 0.7)",
+    )
+    parser.add_argument(
+        "--max-outlier-pct",
+        type=float,
+        default=10.0,
+        help="Maximum outlier percentage (default: 10.0)",
+    )
+    parser.add_argument(
+        "--require-stable-convergence",
+        action="store_true",
+        default=True,  # already on by default, so passing this flag changes nothing
+        help="Require stable percentile convergence (default: True)",
+    )
+    parser.add_argument(
+        "--no-require-stable-convergence",
+        action="store_false",
+        dest="require_stable_convergence",  # writes to the same attribute as the flag above
+        help="Allow marginal convergence",
+    )
+    parser.add_argument(
+        "--output-format",
+        choices=["table", "json"],
+        default="table",
+        help="Output format (default: table)",
+    )
+
+
+def _run(args: argparse.Namespace) -> None:
+    """Evaluate the gate for ``args.benchmark``, print the result, and exit 1 on FAIL."""
+    console = Console()
+
+    benchmark = load_benchmark_auto(args.benchmark)
+
+    # A YAML config, when given, replaces every threshold flag.
+    if not args.config:
+        config = GateConfig(
+            min_requests=args.min_requests,
+            min_quality_score=args.min_quality_score,
+            max_outlier_pct=args.max_outlier_pct,
+            require_stable_convergence=args.require_stable_convergence,
+        )
+    else:
+        config = load_gate_config(args.config)
+
+    result = QualityGate(config).evaluate(benchmark)
+    failed = result.verdict == GateVerdict.FAIL
+
+    if getattr(args, "output_format", "table") == "json":
+        json.dump(result.model_dump(), sys.stdout, indent=2)
+        sys.stdout.write("\n")
+        if failed:
+            sys.exit(1)
+        return
+
+    # verdict -> (rich markup, emoji), used for the headline and for each table row
+    verdict_style = {
+        GateVerdict.PASS: ("[green]PASS[/green]", "✅"),
+        GateVerdict.WARN: ("[yellow]WARN[/yellow]", "⚠️"),
+        GateVerdict.FAIL: ("[red]FAIL[/red]", "❌"),
+    }
+    markup, icon = verdict_style[result.verdict]
+    console.print(f"\n{icon} Quality Gate: {markup}  ({result.request_count} requests)\n")
+
+    table = Table(title="Gate Checks")
+    for heading, align in (
+        ("Check", "left"), ("Verdict", "center"), ("Detail", "left"), ("Threshold", "left"),
+    ):
+        table.add_column(heading, justify=align)
+
+    for check in result.checks:
+        text, mark = verdict_style[check.verdict]
+        table.add_row(check.name, f"{mark} {text}", check.detail, check.threshold or "—")
+
+    console.print(table)
+
+    if failed:
+        sys.exit(1)
diff --git a/src/xpyd_plan/quality_gate.py b/src/xpyd_plan/quality_gate.py
new file mode 100644
index 0000000..9f98c7f
--- /dev/null
+++ b/src/xpyd_plan/quality_gate.py
@@ -0,0 +1,326 @@
+"""Benchmark Quality Gate — composite pass/fail gate for CI/CD pipelines.
+
+Combines multiple quality checks (validation, convergence, load profile,
+outlier ratio, minimum request count) into a single gate verdict with
+YAML-configurable thresholds.
+"""
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import Any, Optional
+
+import yaml
+from pydantic import BaseModel, Field
+
+from xpyd_plan.bench_adapter import load_benchmark_auto
+from xpyd_plan.benchmark_models import BenchmarkData
+from xpyd_plan.convergence import ConvergenceAnalyzer, StabilityStatus
+from xpyd_plan.load_profile import LoadProfileClassifier
+from xpyd_plan.outlier_impact import OutlierImpactAnalyzer
+from xpyd_plan.validator import DataValidator
+
+
+class GateVerdict(str, Enum):
+    """Verdict of a single check or of the whole gate (string-valued enum)."""
+
+    PASS = "pass"
+    WARN = "warn"
+    FAIL = "fail"
+
+
+class GateCheck(BaseModel):
+    """Outcome of one gate check: its name, verdict and a readable explanation."""
+
+    name: str = Field(description="Check name")
+    verdict: GateVerdict = Field(description="Check verdict")
+    detail: str = Field(description="Human-readable detail")
+    threshold: Optional[str] = Field(  # stays None when a check reports no threshold
+        default=None, description="Threshold that was evaluated"
+    )
+
+
+class GateConfig(BaseModel):
+    """Thresholds and analyzer parameters for the quality gate; all have defaults."""
+
+    min_requests: int = Field(
+        default=100, ge=1, description="Minimum number of requests required"
+    )
+    min_quality_score: float = Field(
+        default=0.7, ge=0.0, le=1.0, description="Minimum data quality score"
+    )
+    max_outlier_pct: float = Field(  # a percentage (0-100), not a 0-1 fraction
+        default=10.0, ge=0.0, le=100.0, description="Maximum outlier percentage"
+    )
+    require_stable_convergence: bool = Field(
+        default=True, description="Require stable percentile convergence"
+    )
+    allowed_load_profiles: list[str] = Field(  # matched against the profile type's .value
+        default_factory=lambda: ["steady_state", "ramp_up", "ramp_down"],
+        description="Allowed load profile types",
+    )
+    convergence_steps: int = Field(default=10, ge=2, description="Convergence window steps")
+    convergence_threshold: float = Field(
+        default=0.05, ge=0.0, description="CV threshold for convergence"
+    )
+    iqr_multiplier: float = Field(
+        default=1.5, ge=0.0, description="IQR multiplier for outlier detection"
+    )
+    load_profile_window: float = Field(
+        default=5.0, ge=0.1, description="Window size for load profile (seconds)"
+    )
+
+
+class GateResult(BaseModel):
+    """Outcome of a full gate evaluation: overall verdict plus per-check results."""
+
+    verdict: GateVerdict = Field(description="Overall gate verdict")
+    checks: list[GateCheck] = Field(description="Per-check results")
+    request_count: int = Field(description="Total requests in benchmark")
+    config: GateConfig = Field(description="Config used for evaluation")
+    passed: bool = Field(description="True if verdict is PASS")  # so False for WARN too
+
+
+def load_gate_config(path: str) -> GateConfig:
+    """Load gate thresholds from the YAML file at *path*.
+
+    An empty file yields the default config. Raises ``TypeError`` when the
+    top-level YAML node is not a mapping; pydantic validates the field values.
+    """
+    # Explicit encoding keeps parsing independent of the platform locale.
+    with open(path, encoding="utf-8") as f:
+        raw = yaml.safe_load(f)
+    if raw is None:
+        return GateConfig()
+    if not isinstance(raw, dict):
+        raise TypeError(f"Gate config must be a YAML mapping, got {type(raw).__name__}")
+    return GateConfig(**raw)
+
+
+class QualityGate:
+    """Evaluate benchmark data against configurable quality checks.
+
+    Checks, in evaluation order (the overall verdict is the worst one):
+    1. Minimum request count (PASS or FAIL only)
+    2. Data validation quality score (WARN down to 80% of the threshold)
+    3. Outlier percentage (WARN up to 1.5x the threshold)
+    4. Percentile convergence stability
+    5. Load profile type (a disallowed profile only WARNs)
+    """
+
+    def __init__(self, config: GateConfig | None = None) -> None:
+        self._config = config or GateConfig()
+
+    @property
+    def config(self) -> GateConfig:
+        """Return current gate configuration."""
+        return self._config
+
+    def evaluate(self, data: BenchmarkData) -> GateResult:
+        """Run all five checks; the overall verdict is FAIL, else WARN, else PASS."""
+        checks: list[GateCheck] = []
+
+        checks.append(self._check_min_requests(data))
+        checks.append(self._check_validation(data))
+        checks.append(self._check_outlier_ratio(data))
+        checks.append(self._check_convergence(data))
+        checks.append(self._check_load_profile(data))
+
+        verdicts = [c.verdict for c in checks]
+        if GateVerdict.FAIL in verdicts:
+            overall = GateVerdict.FAIL
+        elif GateVerdict.WARN in verdicts:
+            overall = GateVerdict.WARN
+        else:
+            overall = GateVerdict.PASS
+
+        return GateResult(
+            verdict=overall,
+            checks=checks,
+            request_count=len(data.requests),
+            config=self._config,
+            passed=overall == GateVerdict.PASS,
+        )
+
+    def _check_min_requests(self, data: BenchmarkData) -> GateCheck:
+        """Fail when the benchmark has fewer than ``min_requests`` requests."""
+        count = len(data.requests)
+        threshold = self._config.min_requests
+        if count >= threshold:
+            return GateCheck(
+                name="min_requests",
+                verdict=GateVerdict.PASS,
+                detail=f"{count} requests (≥{threshold})",
+                threshold=str(threshold),
+            )
+        return GateCheck(
+            name="min_requests",
+            verdict=GateVerdict.FAIL,
+            detail=f"{count} requests (required ≥{threshold})",
+            threshold=str(threshold),
+        )
+
+    def _check_validation(self, data: BenchmarkData) -> GateCheck:
+        """Compare ``DataValidator``'s overall quality score with ``min_quality_score``."""
+        validator = DataValidator()
+        result = validator.validate(data)
+        score = result.quality.overall
+        threshold = self._config.min_quality_score
+
+        if score >= threshold:
+            return GateCheck(
+                name="quality_score",
+                verdict=GateVerdict.PASS,
+                detail=f"Quality score {score:.2f} (≥{threshold:.2f})",
+                threshold=str(threshold),
+            )
+        # At least 80% of the threshold is only a warning; anything lower fails.
+        if score >= threshold * 0.8:
+            return GateCheck(
+                name="quality_score",
+                verdict=GateVerdict.WARN,
+                detail=f"Quality score {score:.2f} (threshold {threshold:.2f})",
+                threshold=str(threshold),
+            )
+        return GateCheck(
+            name="quality_score",
+            verdict=GateVerdict.FAIL,
+            detail=f"Quality score {score:.2f} (required ≥{threshold:.2f})",
+            threshold=str(threshold),
+        )
+
+    def _check_outlier_ratio(self, data: BenchmarkData) -> GateCheck:
+        """Compare the share of outlier requests (in percent) with ``max_outlier_pct``."""
+        analyzer = OutlierImpactAnalyzer()
+        report = analyzer.analyze(data, iqr_multiplier=self._config.iqr_multiplier)
+
+        pct = (
+            report.outlier_count / report.total_requests * 100
+            if report.total_requests > 0
+            else 0.0
+        )
+        max_pct = self._config.max_outlier_pct
+
+        if pct <= max_pct:
+            return GateCheck(
+                name="outlier_ratio",
+                verdict=GateVerdict.PASS,
+                detail=f"Outlier ratio {pct:.1f}% (≤{max_pct:.1f}%)",
+                threshold=f"{max_pct}%",
+            )
+        if pct <= max_pct * 1.5:
+            return GateCheck(
+                name="outlier_ratio",
+                verdict=GateVerdict.WARN,
+                detail=f"Outlier ratio {pct:.1f}% (threshold {max_pct:.1f}%)",
+                threshold=f"{max_pct}%",
+            )
+        return GateCheck(
+            name="outlier_ratio",
+            verdict=GateVerdict.FAIL,
+            detail=f"Outlier ratio {pct:.1f}% (max {max_pct:.1f}%)",
+            threshold=f"{max_pct}%",
+        )
+
+    def _check_convergence(self, data: BenchmarkData) -> GateCheck:
+        """Map convergence status to a verdict; marginal fails only if stability is required."""
+        analyzer = ConvergenceAnalyzer(data)
+        report = analyzer.analyze(
+            steps=self._config.convergence_steps,
+            threshold=self._config.convergence_threshold,
+        )
+
+        if report.overall_status == StabilityStatus.STABLE:
+            return GateCheck(
+                name="convergence",
+                verdict=GateVerdict.PASS,
+                detail="Percentile convergence: stable",
+            )
+        if report.overall_status == StabilityStatus.MARGINAL:
+            unstable = [
+                m.field
+                for m in report.metrics
+                if m.status != StabilityStatus.STABLE
+            ]
+            verdict = (
+                GateVerdict.WARN
+                if not self._config.require_stable_convergence
+                else GateVerdict.FAIL
+            )
+            return GateCheck(
+                name="convergence",
+                verdict=verdict,
+                detail=f"Percentile convergence: marginal ({', '.join(unstable)})",
+            )
+
+        # Any remaining status is reported as unstable and always fails.
+        unstable = [
+            m.field
+            for m in report.metrics
+            if m.status == StabilityStatus.UNSTABLE
+        ]
+        return GateCheck(
+            name="convergence",
+            verdict=GateVerdict.FAIL,
+            detail=f"Percentile convergence: unstable ({', '.join(unstable)})",
+        )
+
+    def _check_load_profile(self, data: BenchmarkData) -> GateCheck:
+        """Warn (never fail) when the classified load profile is not in the allowed list."""
+        classifier = LoadProfileClassifier(data)
+        report = classifier.classify(window_size=self._config.load_profile_window)
+        profile_type = report.profile.profile_type
+        allowed = self._config.allowed_load_profiles
+
+        if profile_type.value in allowed:
+            return GateCheck(
+                name="load_profile",
+                verdict=GateVerdict.PASS,
+                detail=f"Load profile: {profile_type.value}",
+                threshold=f"allowed: {', '.join(allowed)}",
+            )
+        return GateCheck(
+            name="load_profile",
+            verdict=GateVerdict.WARN,
+            detail=f"Load profile: {profile_type.value} (not in allowed: {', '.join(allowed)})",
+            threshold=f"allowed: {', '.join(allowed)}",
+        )
+
+
+def evaluate_quality_gate(
+    benchmark_path: str,
+    *,
+    config_path: str | None = None,
+    min_requests: int = 100,
+    min_quality_score: float = 0.7,
+    max_outlier_pct: float = 10.0,
+    require_stable_convergence: bool = True,
+) -> dict[str, Any]:
+    """Run the quality gate on a benchmark file and return the result as a dict.
+
+    Args:
+        benchmark_path: Path to benchmark JSON file.
+        config_path: Optional YAML config file path. When given, the
+            threshold keyword arguments below are ignored.
+        min_requests: Minimum request count.
+        min_quality_score: Minimum data quality score (0-1).
+        max_outlier_pct: Maximum allowed outlier percentage.
+        require_stable_convergence: Whether to require stable convergence.
+
+    Returns:
+        The ``model_dump()`` of the resulting ``GateResult``.
+    """
+    # The config is resolved before the benchmark file is read.
+    if not config_path:
+        gate_config = GateConfig(
+            min_requests=min_requests,
+            min_quality_score=min_quality_score,
+            max_outlier_pct=max_outlier_pct,
+            require_stable_convergence=require_stable_convergence,
+        )
+    else:
+        gate_config = load_gate_config(config_path)
+
+    benchmark = load_benchmark_auto(benchmark_path)
+    gate = QualityGate(gate_config)
+    return gate.evaluate(benchmark).model_dump()
diff --git a/tests/test_quality_gate.py b/tests/test_quality_gate.py
new file mode 100644
index 0000000..ebc2846
--- /dev/null
+++ b/tests/test_quality_gate.py
@@ -0,0 +1,351 @@
+"""Tests for Benchmark Quality Gate (M117)."""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+import yaml
+
+from xpyd_plan.benchmark_models import (
+    BenchmarkData,
+    BenchmarkMetadata,
+    BenchmarkRequest,
+)
+from xpyd_plan.quality_gate import (
+    GateCheck,
+    GateConfig,
+    GateResult,
+    GateVerdict,
+    QualityGate,
+    evaluate_quality_gate,
+    load_gate_config,
+)
+
+
+def _make_data(
+    n: int = 200,
+    measured_qps: float = 100.0,
+    num_prefill: int = 2,
+    num_decode: int = 2,
+) -> BenchmarkData:
+    """Create benchmark data with n requests."""
+    requests = [
+        BenchmarkRequest(
+            request_id=f"r{i}",
+            prompt_tokens=100 + (i % 50),
+            output_tokens=50 + (i % 30),
+            ttft_ms=20.0 + (i % 10),
+            tpot_ms=10.0 + (i % 5),
+            total_latency_ms=30.0 + (i % 15),
+            timestamp=float(i),
+        )
+        for i in range(n)
+    ]
+    return BenchmarkData(
+        metadata=BenchmarkMetadata(
+            num_prefill_instances=num_prefill,
+            num_decode_instances=num_decode,
+            total_instances=num_prefill + num_decode,
+            measured_qps=measured_qps,
+        ),
+        requests=requests,
+    )
+
+
+def _save_benchmark(data: BenchmarkData, path: Path) -> None:
+    """Save benchmark data to JSON."""
+    path.write_text(json.dumps(data.model_dump(), default=str))
+
+
+# --- Model tests ---
+
+
+class TestGateConfig:
+    """Test GateConfig defaults, overrides and field-constraint validation."""
+
+    def test_defaults(self) -> None:
+        config = GateConfig()
+        assert config.min_requests == 100
+        assert config.min_quality_score == 0.7
+        assert config.max_outlier_pct == 10.0
+        assert config.require_stable_convergence is True
+        assert "steady_state" in config.allowed_load_profiles
+
+    def test_custom_values(self) -> None:
+        config = GateConfig(min_requests=50, min_quality_score=0.9)
+        assert config.min_requests == 50
+        assert config.min_quality_score == 0.9
+
+    def test_validation(self) -> None:
+        with pytest.raises(ValueError):  # pydantic's ValidationError is a ValueError
+            GateConfig(min_requests=0)
+        with pytest.raises(ValueError):
+            GateConfig(min_quality_score=1.5)
+
+
+class TestGateVerdict:
+    """Test GateVerdict enum."""
+
+    def test_values(self) -> None:
+        assert GateVerdict.PASS == "pass"
+        assert GateVerdict.WARN == "warn"
+        assert GateVerdict.FAIL == "fail"
+
+
+class TestGateCheck:
+    """Test GateCheck model."""
+
+    def test_basic(self) -> None:
+        check = GateCheck(
+            name="test", verdict=GateVerdict.PASS, detail="ok"
+        )
+        assert check.name == "test"
+        assert check.verdict == GateVerdict.PASS
+        assert check.threshold is None
+
+
+class TestGateResult:
+    """Test GateResult model."""
+
+    def test_passed_flag(self) -> None:
+        result = GateResult(
+            verdict=GateVerdict.PASS,
+            checks=[],
+            request_count=100,
+            config=GateConfig(),
+            passed=True,
+        )
+        assert result.passed is True
+
+    def test_failed_flag(self) -> None:
+        result = GateResult(
+            verdict=GateVerdict.FAIL,
+            checks=[],
+            request_count=10,
+            config=GateConfig(),
+            passed=False,
+        )
+        assert result.passed is False
+
+
+# --- Load config tests ---
+
+
+class TestLoadGateConfig:
+    """Test YAML config loading."""
+
+    def test_load_yaml(self, tmp_path: Path) -> None:
+        cfg = {"min_requests": 50, "max_outlier_pct": 5.0}
+        path = tmp_path / "gate.yaml"
+        path.write_text(yaml.dump(cfg))
+        config = load_gate_config(str(path))
+        assert config.min_requests == 50
+        assert config.max_outlier_pct == 5.0
+
+    def test_load_empty_yaml(self, tmp_path: Path) -> None:
+        path = tmp_path / "empty.yaml"
+        path.write_text("")
+        config = load_gate_config(str(path))
+        assert config.min_requests == 100  # default
+
+
+# --- QualityGate tests ---
+
+
+class TestQualityGate:
+    """Test QualityGate evaluator."""
+
+    def test_pass_with_good_data(self) -> None:
+        data = _make_data(n=200)
+        gate = QualityGate()
+        result = gate.evaluate(data)
+        assert result.request_count == 200
+        assert len(result.checks) == 5
+        # Should have check names
+        names = [c.name for c in result.checks]
+        assert "min_requests" in names
+        assert "quality_score" in names
+        assert "outlier_ratio" in names
+        assert "convergence" in names
+        assert "load_profile" in names
+
+    def test_fail_min_requests(self) -> None:
+        data = _make_data(n=10)
+        config = GateConfig(min_requests=100)
+        gate = QualityGate(config)
+        result = gate.evaluate(data)
+        min_req_check = [c for c in result.checks if c.name == "min_requests"][0]
+        assert min_req_check.verdict == GateVerdict.FAIL
+
+    def test_pass_min_requests(self) -> None:
+        data = _make_data(n=200)
+        config = GateConfig(min_requests=100)
+        gate = QualityGate(config)
+        result = gate.evaluate(data)
+        min_req_check = [c for c in result.checks if c.name == "min_requests"][0]
+        assert min_req_check.verdict == GateVerdict.PASS
+
+    def test_fail_propagates_to_overall(self) -> None:
+        data = _make_data(n=5)
+        config = GateConfig(min_requests=100)
+        gate = QualityGate(config)
+        result = gate.evaluate(data)
+        assert result.verdict == GateVerdict.FAIL
+        assert result.passed is False
+
+    def test_config_property(self) -> None:
+        config = GateConfig(min_requests=42)
+        gate = QualityGate(config)
+        assert gate.config.min_requests == 42
+
+    def test_default_config(self) -> None:
+        gate = QualityGate()
+        assert gate.config.min_requests == 100
+
+    def test_custom_gate_config(self) -> None:
+        data = _make_data(n=200)
+        config = GateConfig(
+            min_requests=10,
+            max_outlier_pct=50.0,
+            require_stable_convergence=False,
+        )
+        gate = QualityGate(config)
+        result = gate.evaluate(data)
+        # With relaxed config, should likely pass
+        assert isinstance(result.verdict, GateVerdict)
+
+    def test_result_serialization(self) -> None:
+        data = _make_data(n=200)
+        gate = QualityGate()
+        result = gate.evaluate(data)
+        d = result.model_dump()
+        assert "verdict" in d
+        assert "checks" in d
+        assert "config" in d
+        assert "passed" in d
+
+
+# --- Programmatic API tests ---
+
+
+class TestEvaluateQualityGate:
+    """Test the evaluate_quality_gate() API."""
+
+    def test_basic(self, tmp_path: Path) -> None:
+        data = _make_data(n=200)
+        path = tmp_path / "bench.json"
+        _save_benchmark(data, path)
+        result = evaluate_quality_gate(str(path))
+        assert "verdict" in result
+        assert "checks" in result
+        assert "passed" in result
+
+    def test_with_yaml_config(self, tmp_path: Path) -> None:
+        data = _make_data(n=200)
+        bench_path = tmp_path / "bench.json"
+        _save_benchmark(data, bench_path)
+
+        cfg = {"min_requests": 10, "max_outlier_pct": 50.0}
+        cfg_path = tmp_path / "gate.yaml"
+        cfg_path.write_text(yaml.dump(cfg))
+
+        result = evaluate_quality_gate(str(bench_path), config_path=str(cfg_path))
+        assert "verdict" in result
+
+    def test_custom_kwargs(self, tmp_path: Path) -> None:
+        data = _make_data(n=200)
+        path = tmp_path / "bench.json"
+        _save_benchmark(data, path)
+        result = evaluate_quality_gate(
+            str(path), min_requests=10, max_outlier_pct=50.0
+        )
+        assert "verdict" in result
+
+
+# --- CLI tests ---
+
+
+class TestQualityGateCLI:
+    """Run the quality-gate CLI in a subprocess and check exit code and output."""
+
+    def test_json_output(self, tmp_path: Path) -> None:
+        """JSON mode must print a parseable result whatever the verdict is."""
+        data = _make_data(n=200)
+        path = tmp_path / "bench.json"
+        _save_benchmark(data, path)
+
+        result = subprocess.run(
+            [
+                sys.executable,
+                "-m",
+                "xpyd_plan.cli",
+                "quality-gate",
+                "--benchmark",
+                str(path),
+                "--output-format",
+                "json",
+                "--min-requests",
+                "10",
+            ],
+            capture_output=True,
+            text=True,
+        )
+        # Exit code 1 is a legitimate gate FAIL; any other non-zero code is an error.
+        assert result.returncode in (0, 1), result.stderr
+        # Parse unconditionally: a crash leaves stdout empty and must fail the test.
+        parsed = json.loads(result.stdout)
+        assert "verdict" in parsed
+        assert "checks" in parsed
+
+    def test_table_output(self, tmp_path: Path) -> None:
+        """Table mode must print the gate headline whatever the verdict is."""
+        data = _make_data(n=200)
+        path = tmp_path / "bench.json"
+        _save_benchmark(data, path)
+
+        result = subprocess.run(
+            [
+                sys.executable,
+                "-m",
+                "xpyd_plan.cli",
+                "quality-gate",
+                "--benchmark",
+                str(path),
+                "--min-requests",
+                "10",
+            ],
+            capture_output=True,
+            text=True,
+        )
+        # A crashed interpreter also exits 1, so the headline itself must be present.
+        assert result.returncode in (0, 1) and "Quality Gate" in result.stdout, result.stderr
+
+
+# --- Public imports test ---
+
+
+class TestPublicImports:
+    """Test that all public symbols are importable."""
+
+    def test_imports(self) -> None:
+        from xpyd_plan import (
+            GateCheck,
+            GateConfig,
+            GateResult,
+            GateVerdict,
+            QualityGate,
+            evaluate_quality_gate,
+            load_gate_config,
+        )
+
+        assert GateCheck is not None
+        assert GateConfig is not None
+        assert GateResult is not None
+        assert GateVerdict is not None
+        assert QualityGate is not None
+        assert evaluate_quality_gate is not None
+        assert load_gate_config is not None