Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -1557,9 +1557,9 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be
- Programmatic `optimize_workload_mix()` API
- 32 new tests

### M116 🔄 GPU Hour Calculator
### M116 GPU Hour Calculator

*In progress*
*Completed — PR #256*

- `GPUHourCalculator` class in `gpu_hours.py`
- `TrafficProfile`, `HourlyTraffic`, `GPUHourReport`, `ScalingSavings`, `HourBreakdown` Pydantic models
Expand All @@ -1568,4 +1568,17 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be
- Auto-scaling savings estimation (fixed vs dynamic provisioning)
- CLI `gpu-hours` subcommand with `--benchmark`, `--traffic-profile`, `--gpu-cost`, table + JSON output
- Programmatic `calculate_gpu_hours()` API
- ~24 new tests
- 24 new tests

### M117 🔄 Benchmark Quality Gate

*In progress*

- `QualityGate` class in `quality_gate.py`
- `GateConfig`, `GateCheck`, `GateResult`, `GateVerdict` Pydantic models
- Composite pass/fail gate combining: data validation, percentile convergence, load profile stability, outlier ratio, minimum request count
- YAML-configurable gate rules (thresholds per check)
- Non-zero exit code on FAIL (CI/CD pipeline integration)
- CLI `quality-gate` subcommand with `--benchmark`, `--config`, table + JSON output
- Programmatic `evaluate_quality_gate()` API
- ~22 new tests
3 changes: 2 additions & 1 deletion docs/iterations/current.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,5 @@ The project has completed **110 milestones**, covering the full feature chain fr
| 7 | 2026-04-06 | M113 TensorRT-LLM Benchmark Command Generator | ✅ merged | PR #250, both bots approved |
| 8 | 2026-04-06 | M114 Multi-Backend Comparison Report | ✅ merged | PR #252, both bots approved |
| 9 | 2026-04-06 | M115 Workload Mix Optimizer | ✅ merged | PR #254, both bots approved |
| 10 | 2026-04-06 | M116 GPU Hour Calculator | ⏳ pending review | Issue #255 |
| 10 | 2026-04-06 | M116 GPU Hour Calculator | ✅ merged | PR #256, both bots approved |
| 11 | 2026-04-06 | M117 Benchmark Quality Gate | ⏳ pending review | PR TBD |
19 changes: 19 additions & 0 deletions src/xpyd_plan/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1554,3 +1554,22 @@
"TrafficProfile",
"calculate_gpu_hours",
]
from xpyd_plan.quality_gate import ( # noqa: E402
GateCheck,
GateConfig,
GateResult,
GateVerdict,
QualityGate,
evaluate_quality_gate,
load_gate_config,
)

# Extend the package's public API with the quality-gate names imported above.
__all__ += [
"GateCheck",
"GateConfig",
"GateResult",
"GateVerdict",
"QualityGate",
"evaluate_quality_gate",
"load_gate_config",
]
6 changes: 6 additions & 0 deletions src/xpyd_plan/cli/_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
from xpyd_plan.cli._pipeline import _cmd_pipeline
from xpyd_plan.cli._plan_benchmarks import _cmd_plan_benchmarks, add_plan_benchmarks_parser
from xpyd_plan.cli._qps_curve import add_qps_curve_parser
from xpyd_plan.cli._quality_gate import register as register_quality_gate
from xpyd_plan.cli._queue import add_queue_parser
from xpyd_plan.cli._ranking import _cmd_ranking, add_ranking_parser
from xpyd_plan.cli._rate_limit import add_rate_limit_parser
Expand Down Expand Up @@ -970,6 +971,7 @@ def main(argv: list[str] | None = None) -> None:
register_trtllm_commands(subparsers)
register_compare_backends(subparsers)
register_gpu_hours(subparsers)
register_quality_gate(subparsers)
register_workload_mix(subparsers)
add_rate_limit_parser(subparsers)
add_batch_analysis_parser(subparsers)
Expand Down Expand Up @@ -1332,6 +1334,10 @@ def main(argv: list[str] | None = None) -> None:
from xpyd_plan.cli._gpu_hours import _run as _cmd_gpu_hours

_cmd_gpu_hours(args)
elif args.command == "quality-gate":
from xpyd_plan.cli._quality_gate import _run as _cmd_quality_gate

_cmd_quality_gate(args)
else:
parser.print_help()
sys.exit(1)
120 changes: 120 additions & 0 deletions src/xpyd_plan/cli/_quality_gate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
"""CLI subcommand for benchmark quality gate."""

from __future__ import annotations

import argparse
import json
import sys
from typing import Any

from rich.console import Console
from rich.table import Table

from xpyd_plan.bench_adapter import load_benchmark_auto
from xpyd_plan.quality_gate import GateConfig, GateVerdict, QualityGate, load_gate_config


def register(subparsers: Any) -> None:
    """Attach the ``quality-gate`` subcommand and its options to *subparsers*.

    Args:
        subparsers: The object returned by ``ArgumentParser.add_subparsers()``;
            only its ``add_parser`` method is used.
    """
    gate_parser = subparsers.add_parser(
        "quality-gate",
        help="Composite pass/fail gate for benchmark quality (CI/CD friendly)",
    )

    # Declared in display order: argparse lists options in the order they are
    # added, so this sequence is what ``--help`` shows.
    option_specs = (
        (
            "--benchmark",
            {"required": True, "help": "Path to benchmark JSON file"},
        ),
        (
            "--config",
            {
                "default": None,
                "help": "Path to YAML gate config file (overrides other flags)",
            },
        ),
        (
            "--min-requests",
            {
                "type": int,
                "default": 100,
                "help": "Minimum request count (default: 100)",
            },
        ),
        (
            "--min-quality-score",
            {
                "type": float,
                "default": 0.7,
                "help": "Minimum data quality score 0-1 (default: 0.7)",
            },
        ),
        (
            "--max-outlier-pct",
            {
                "type": float,
                "default": 10.0,
                "help": "Maximum outlier percentage (default: 10.0)",
            },
        ),
        # The next two flags share one destination: the first keeps the
        # default of True, the second switches it off.
        (
            "--require-stable-convergence",
            {
                "action": "store_true",
                "default": True,
                "help": "Require stable percentile convergence (default: True)",
            },
        ),
        (
            "--no-require-stable-convergence",
            {
                "action": "store_false",
                "dest": "require_stable_convergence",
                "help": "Allow marginal convergence",
            },
        ),
        (
            "--output-format",
            {
                "choices": ["table", "json"],
                "default": "table",
                "help": "Output format (default: table)",
            },
        ),
    )

    for flag, options in option_specs:
        gate_parser.add_argument(flag, **options)


def _run(args: argparse.Namespace) -> None:
    """Execute the quality-gate subcommand.

    Loads the benchmark at ``args.benchmark``, builds the gate configuration,
    evaluates the gate and prints the result either as JSON on stdout or as a
    rich table.

    Args:
        args: Parsed CLI arguments. When ``args.config`` is set, the
            configuration is loaded from that file and the individual
            threshold flags are ignored; otherwise the thresholds are taken
            from ``min_requests``, ``min_quality_score``, ``max_outlier_pct``
            and ``require_stable_convergence``.

    Raises:
        SystemExit: With status 1 when the overall verdict is ``FAIL``, in
            both output formats, so the command can gate a CI/CD pipeline.
    """
    data = load_benchmark_auto(args.benchmark)

    if args.config:
        config = load_gate_config(args.config)
    else:
        config = GateConfig(
            min_requests=args.min_requests,
            min_quality_score=args.min_quality_score,
            max_outlier_pct=args.max_outlier_pct,
            require_stable_convergence=args.require_stable_convergence,
        )

    result = QualityGate(config).evaluate(data)

    if getattr(args, "output_format", "table") == "json":
        # mode="json" makes the model emit JSON-compatible primitives (enum
        # values, ISO strings, ...). The default python-mode dump may contain
        # objects that json.dump cannot encode and would raise TypeError.
        json.dump(result.model_dump(mode="json"), sys.stdout, indent=2)
        sys.stdout.write("\n")
    else:
        console = Console()

        # Verdict -> (rich-markup label, emoji) used for both the headline and
        # each per-check row.
        verdict_style = {
            GateVerdict.PASS: ("[green]PASS[/green]", "✅"),
            GateVerdict.WARN: ("[yellow]WARN[/yellow]", "⚠️"),
            GateVerdict.FAIL: ("[red]FAIL[/red]", "❌"),
        }

        styled, emoji = verdict_style[result.verdict]
        console.print(
            f"\n{emoji} Quality Gate: {styled} ({result.request_count} requests)\n"
        )

        table = Table(title="Gate Checks")
        table.add_column("Check", justify="left")
        table.add_column("Verdict", justify="center")
        table.add_column("Detail", justify="left")
        table.add_column("Threshold", justify="left")

        for check in result.checks:
            check_styled, check_emoji = verdict_style[check.verdict]
            table.add_row(
                check.name,
                f"{check_emoji} {check_styled}",
                check.detail,
                # Checks without a threshold show a dash placeholder.
                check.threshold or "—",
            )

        console.print(table)

    # Single exit point: a failed gate yields a non-zero status regardless of
    # the chosen output format.
    if result.verdict == GateVerdict.FAIL:
        sys.exit(1)
Loading
Loading