Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -1544,9 +1544,9 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be
- Programmatic `compare_backends()` API
- ~25 new tests

### M115 🔄 Workload Mix Optimizer
### M115 Workload Mix Optimizer

*In progress*
*Completed — PR #254*

- `WorkloadMixOptimizer` class in `workload_mix.py`
- `WorkloadSpec`, `WorkloadAllocation`, `MixOptimizationResult` Pydantic models
Expand All @@ -1555,4 +1555,17 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be
- Support shared vs dedicated instance pools
- CLI `workload-mix` subcommand with `--workload` (repeatable), `--total-gpus`, table + JSON output
- Programmatic `optimize_workload_mix()` API
- ~25 new tests
- 32 new tests

### M116 🔄 GPU Hour Calculator

*In progress*

- `GPUHourCalculator` class in `gpu_hours.py`
- `TrafficProfile`, `HourlyTraffic`, `GPUHourReport`, `ScalingSavings`, `HourBreakdown` Pydantic models
- 24-hour traffic profile input (hourly QPS values)
- Map each hour's QPS to required instances using measured benchmark capacity
- Auto-scaling savings estimation (fixed vs dynamic provisioning)
- CLI `gpu-hours` subcommand with `--benchmark`, `--traffic-profile`, `--gpu-cost`, table + JSON output
- Programmatic `calculate_gpu_hours()` API
- ~24 new tests
3 changes: 2 additions & 1 deletion docs/iterations/current.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,5 @@ The project has completed **110 milestones**, covering the full feature chain fr
| 6 | 2026-04-06 | M112 TensorRT-LLM Benchmark Format Importer | ✅ merged | PR #248, both bots approved |
| 7 | 2026-04-06 | M113 TensorRT-LLM Benchmark Command Generator | ✅ merged | PR #250, both bots approved |
| 8 | 2026-04-06 | M114 Multi-Backend Comparison Report | ✅ merged | PR #252, both bots approved |
| 9 | 2026-04-06 | M115 Workload Mix Optimizer | ⏳ pending review | Issue #253 |
| 9 | 2026-04-06 | M115 Workload Mix Optimizer | ✅ merged | PR #254, both bots approved |
| 10 | 2026-04-06 | M116 GPU Hour Calculator | ⏳ pending review | Issue #255 |
19 changes: 19 additions & 0 deletions src/xpyd_plan/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1535,3 +1535,22 @@
"WorkloadSpec",
"optimize_workload_mix",
]
from xpyd_plan.gpu_hours import ( # noqa: E402
GPUHourCalculator,
GPUHourReport,
HourBreakdown,
HourlyTraffic,
ScalingSavings,
TrafficProfile,
calculate_gpu_hours,
)

# Extend the package's public API with the GPU hour calculator names
# imported from ``xpyd_plan.gpu_hours`` directly above, so that
# ``from xpyd_plan import *`` exposes them.
__all__ += [
    "GPUHourCalculator",
    "GPUHourReport",
    "HourBreakdown",
    "HourlyTraffic",
    "ScalingSavings",
    "TrafficProfile",
    "calculate_gpu_hours",
]
159 changes: 159 additions & 0 deletions src/xpyd_plan/cli/_gpu_hours.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
"""CLI subcommand for GPU hour calculation."""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Any

import yaml

from xpyd_plan.gpu_hours import (
GPUHourCalculator,
GPUHourReport,
HourlyTraffic,
TrafficProfile,
)


def register(subparsers: Any) -> None:
    """Attach the ``gpu-hours`` subcommand to *subparsers*.

    Creates the subparser, declares its command-line options, and stores
    ``_run`` as the ``func`` default of the resulting parser.

    Args:
        subparsers: Object providing ``add_parser`` (the value returned by
            ``ArgumentParser.add_subparsers()``).
    """
    description = (
        "Given benchmark data and a daily traffic profile (hourly QPS), "
        "estimate total GPU hours, costs, and auto-scaling savings."
    )
    gpu_hours_parser = subparsers.add_parser(
        "gpu-hours",
        help="Estimate GPU hours and costs from traffic profiles",
        description=description,
    )

    # (flag, keyword arguments) pairs, declared in the order they should
    # appear in the generated help text.
    option_specs = (
        (
            "--benchmark",
            {"required": True, "help": "Benchmark JSON file"},
        ),
        (
            "--traffic-profile",
            {
                "required": True,
                "help": "Traffic profile YAML file (hourly QPS schedule)",
            },
        ),
        (
            "--gpu-cost",
            {
                "type": float,
                "default": 2.0,
                "help": "GPU cost per instance per hour (default: 2.0)",
            },
        ),
        (
            "--currency",
            {"default": "USD", "help": "Currency label (default: USD)"},
        ),
        (
            "--output-format",
            {
                "choices": ["table", "json"],
                "default": "table",
                "help": "Output format (default: table)",
            },
        ),
    )
    for flag, options in option_specs:
        gpu_hours_parser.add_argument(flag, **options)

    gpu_hours_parser.set_defaults(func=_run)


def _run(args: argparse.Namespace) -> None:
    """Execute the ``gpu-hours`` subcommand.

    Loads the benchmark file and the YAML traffic profile named on the
    command line, runs ``GPUHourCalculator.calculate`` on them, and writes
    the report to stdout either as JSON or as Rich tables.

    Args:
        args: Parsed CLI namespace. Reads ``benchmark``, ``traffic_profile``,
            ``gpu_cost``, ``currency`` and ``output_format``.

    Raises:
        ValueError: If the traffic profile document is not a YAML mapping
            (for example, the file is empty).
        KeyError: If the profile has no ``hours`` key, or an entry in it
            lacks ``hour`` or ``qps``.
    """
    from xpyd_plan.bench_adapter import load_benchmark_auto

    data = load_benchmark_auto(Path(args.benchmark))

    # Load traffic profile. Decode explicitly as UTF-8 so parsing does not
    # depend on the platform's default locale encoding.
    profile_path = Path(args.traffic_profile)
    with open(profile_path, encoding="utf-8") as f:
        profile_data = yaml.safe_load(f)

    # An empty file parses to None and a top-level list/scalar is not
    # subscriptable by key; fail with a message that names the file instead
    # of an opaque TypeError from the lookups below.
    if not isinstance(profile_data, dict):
        raise ValueError(
            f"Traffic profile {profile_path} must be a YAML mapping with an "
            f"'hours' list, got {type(profile_data).__name__}"
        )

    hours = [
        HourlyTraffic(hour=h["hour"], qps=h["qps"])
        for h in profile_data["hours"]
    ]
    profile = TrafficProfile(
        hours=hours,
        # Fall back to the file name (without suffix) when no name is given.
        name=profile_data.get("name", profile_path.stem),
    )

    calc = GPUHourCalculator(data)
    report = calc.calculate(
        profile,
        gpu_cost_per_hour=args.gpu_cost,
        currency=args.currency,
    )

    if args.output_format == "json":
        json.dump(report.model_dump(), sys.stdout, indent=2)
        sys.stdout.write("\n")
    else:
        _print_table(report)


def _print_table(report: GPUHourReport) -> None:
    """Render *report* to the console as three Rich tables.

    Prints a bold heading with the profile name, then a summary table, an
    auto-scaling savings table (fixed vs. dynamic provisioning), and a
    per-hour breakdown table.

    Args:
        report: The GPU hour report to display.
    """
    from rich.console import Console
    from rich.table import Table

    out = Console()

    # Heading
    out.print(f"\n[bold]GPU Hour Report: {report.profile_name}[/bold]\n")

    # Summary: one (metric, formatted value) pair per row.
    summary_table = Table(title="Summary")
    summary_table.add_column("Metric", style="cyan")
    summary_table.add_column("Value", justify="right")
    summary_rows = (
        ("QPS per Instance", f"{report.qps_per_instance:.2f}"),
        ("Peak QPS", f"{report.peak_qps:.1f}"),
        ("Peak Instances", str(report.peak_instances)),
        ("Off-Peak QPS", f"{report.off_peak_qps:.1f}"),
        ("Off-Peak Instances", str(report.off_peak_instances)),
        ("Avg Utilization", f"{report.avg_utilization:.1%}"),
        ("Daily GPU Hours", f"{report.daily_gpu_hours:.1f}"),
        ("Monthly GPU Hours", f"{report.monthly_gpu_hours:.1f}"),
        ("Daily Cost", f"{report.daily_cost:.2f} {report.currency}"),
        ("Monthly Cost", f"{report.monthly_cost:.2f} {report.currency}"),
    )
    for metric, value in summary_rows:
        summary_table.add_row(metric, value)
    out.print(summary_table)

    # Scaling savings: fixed vs. dynamic provisioning side by side.
    scaling = report.scaling_savings
    savings_table = Table(title="Auto-Scaling Savings")
    savings_table.add_column("Metric", style="cyan")
    for heading in ("Fixed", "Dynamic", "Saved"):
        savings_table.add_column(heading, justify="right")
    savings_rows = (
        (
            "Daily GPU Hours",
            f"{scaling.fixed_daily_gpu_hours:.1f}",
            f"{scaling.dynamic_daily_gpu_hours:.1f}",
            f"{scaling.saved_gpu_hours:.1f} ({scaling.savings_percent:.1f}%)",
        ),
        (
            f"Daily Cost ({report.currency})",
            f"{scaling.fixed_daily_cost:.2f}",
            f"{scaling.dynamic_daily_cost:.2f}",
            f"{scaling.saved_cost:.2f}",
        ),
    )
    for cells in savings_rows:
        savings_table.add_row(*cells)
    out.print(savings_table)

    # Hourly breakdown: one row per entry in the report.
    hourly_table = Table(title="Hourly Breakdown")
    hourly_headings = (
        "Hour",
        "QPS",
        "Instances",
        "GPU Hours",
        f"Cost ({report.currency})",
    )
    for heading in hourly_headings:
        hourly_table.add_column(heading, justify="right")
    for entry in report.hourly_breakdown:
        cells = (
            f"{entry.hour:02d}:00",
            f"{entry.qps:.1f}",
            str(entry.required_instances),
            f"{entry.gpu_hours:.1f}",
            f"{entry.cost:.2f}",
        )
        hourly_table.add_row(*cells)
    out.print(hourly_table)
6 changes: 6 additions & 0 deletions src/xpyd_plan/cli/_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from xpyd_plan.cli._forecast import add_forecast_parser
from xpyd_plan.cli._generate import _cmd_generate
from xpyd_plan.cli._goodput import add_goodput_parser
from xpyd_plan.cli._gpu_hours import register as register_gpu_hours
from xpyd_plan.cli._health_check import _cmd_health_check, add_health_check_parser
from xpyd_plan.cli._heatmap import _cmd_heatmap, add_heatmap_parser
from xpyd_plan.cli._import import add_import_parser
Expand Down Expand Up @@ -968,6 +969,7 @@ def main(argv: list[str] | None = None) -> None:
register_sglang_commands(subparsers)
register_trtllm_commands(subparsers)
register_compare_backends(subparsers)
register_gpu_hours(subparsers)
register_workload_mix(subparsers)
add_rate_limit_parser(subparsers)
add_batch_analysis_parser(subparsers)
Expand Down Expand Up @@ -1326,6 +1328,10 @@ def main(argv: list[str] | None = None) -> None:
from xpyd_plan.cli._workload_mix import _run as _cmd_workload_mix

_cmd_workload_mix(args)
elif args.command == "gpu-hours":
from xpyd_plan.cli._gpu_hours import _run as _cmd_gpu_hours

_cmd_gpu_hours(args)
else:
parser.print_help()
sys.exit(1)
Loading
Loading