Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -1531,9 +1531,9 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be
- Programmatic `generate_trtllm_commands()` API
- 29 new tests

### M114 🔄 Multi-Backend Comparison Report
### M114 Multi-Backend Comparison Report

*In progress*
*Completed — PR #252*

- `BackendComparator` class in `backend_compare.py`
- `BackendComparisonConfig`, `BackendMetrics`, `BackendComparisonReport`, `BackendRanking`, `SLAResult` Pydantic models
Expand All @@ -1543,3 +1543,16 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be
- CLI `compare-backends` subcommand with `--benchmark`, `--labels`, `--formats`, `--rank-by`, table + JSON output
- Programmatic `compare_backends()` API
- ~25 new tests

### M115 🔄 Workload Mix Optimizer

*In progress*

- `WorkloadMixOptimizer` class in `workload_mix.py`
- `WorkloadSpec`, `WorkloadAllocation`, `MixOptimizationResult` Pydantic models
- Given benchmark data for multiple workloads (different models or request patterns), find the minimum total number of GPU instances that still meets each workload's SLA
- Brute-force enumeration across all valid P:D allocations per workload
- Support shared vs dedicated instance pools
- CLI `workload-mix` subcommand with `--workload` (repeatable), `--total-gpus`, table + JSON output
- Programmatic `optimize_workload_mix()` API
- ~25 new tests
3 changes: 2 additions & 1 deletion docs/iterations/current.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,5 @@ The project has completed **110 milestones**, covering the full feature chain fr
| 5 | 2026-04-06 | M111 SGLang Benchmark Command Generator | ✅ merged | PR #246 |
| 6 | 2026-04-06 | M112 TensorRT-LLM Benchmark Format Importer | ✅ merged | PR #248, both bots approved |
| 7 | 2026-04-06 | M113 TensorRT-LLM Benchmark Command Generator | ✅ merged | PR #250, both bots approved |
| 8 | 2026-04-06 | M114 Multi-Backend Comparison Report | ⏳ pending review | Issue #251 |
| 8 | 2026-04-06 | M114 Multi-Backend Comparison Report | ✅ merged | PR #252, both bots approved |
| 9 | 2026-04-06 | M115 Workload Mix Optimizer | ⏳ pending review | Issue #253 |
18 changes: 18 additions & 0 deletions src/xpyd_plan/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1517,3 +1517,21 @@
"SLAResult",
"compare_backends",
]

from xpyd_plan.workload_mix import ( # noqa: E402
AllocationMode,
MixOptimizationResult,
WorkloadAllocation,
WorkloadMixOptimizer,
WorkloadSpec,
optimize_workload_mix,
)

# Extend the package's declared public API with the workload-mix optimizer
# names imported from xpyd_plan.workload_mix directly above, so they are
# covered by `from xpyd_plan import *` and visible to API tooling.
__all__ += [
"AllocationMode",
"MixOptimizationResult",
"WorkloadAllocation",
"WorkloadMixOptimizer",
"WorkloadSpec",
"optimize_workload_mix",
]
6 changes: 6 additions & 0 deletions src/xpyd_plan/cli/_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
from xpyd_plan.cli._weighted_goodput import register as _register_weighted_goodput
from xpyd_plan.cli._whatif import _cmd_what_if
from xpyd_plan.cli._workload import _cmd_workload
from xpyd_plan.cli._workload_mix import register as register_workload_mix


def main(argv: list[str] | None = None) -> None:
Expand Down Expand Up @@ -967,6 +968,7 @@ def main(argv: list[str] | None = None) -> None:
register_sglang_commands(subparsers)
register_trtllm_commands(subparsers)
register_compare_backends(subparsers)
register_workload_mix(subparsers)
add_rate_limit_parser(subparsers)
add_batch_analysis_parser(subparsers)
add_stat_summary_parser(subparsers)
Expand Down Expand Up @@ -1320,6 +1322,10 @@ def main(argv: list[str] | None = None) -> None:
from xpyd_plan.cli._compare_backends import _cmd_compare_backends

_cmd_compare_backends(args)
elif args.command == "workload-mix":
from xpyd_plan.cli._workload_mix import _run as _cmd_workload_mix

_cmd_workload_mix(args)
else:
parser.print_help()
sys.exit(1)
149 changes: 149 additions & 0 deletions src/xpyd_plan/cli/_workload_mix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
"""CLI subcommand for workload mix optimization."""

from __future__ import annotations

import argparse
import sys
from pathlib import Path
from typing import Any

import yaml

from xpyd_plan.workload_mix import (
AllocationMode,
MixOptimizationResult,
WorkloadMixOptimizer,
WorkloadSpec,
)


def register(subparsers: Any) -> None:
    """Attach the ``workload-mix`` subcommand to an argparse subparsers object.

    Args:
        subparsers: The object returned by ``ArgumentParser.add_subparsers()``;
            the new sub-parser is created on it via ``add_parser``.

    The created parser stores ``_run`` as its ``func`` default so a dispatcher
    can invoke the handler straight from the parsed namespace.
    """
    description = (
        "Given benchmark data for multiple workloads, find the minimum "
        "total GPU instances while meeting per-workload SLA constraints."
    )
    mix_parser = subparsers.add_parser(
        "workload-mix",
        help="Optimize GPU allocation across multiple workloads",
        description=description,
    )

    # (flag, add_argument keyword options), listed in the order they should
    # appear in ``--help`` output.
    option_table = (
        (
            "--workload",
            {
                "action": "append",
                "required": True,
                "metavar": "YAML",
                "help": "Workload spec YAML file (repeatable, one per workload)",
            },
        ),
        (
            "--total-gpus",
            {
                "type": int,
                "default": None,
                "help": "Total GPU budget (default: unlimited)",
            },
        ),
        (
            "--max-per-workload",
            {
                "type": int,
                "default": 32,
                "help": "Max instances per workload role (default: 32)",
            },
        ),
        (
            "--json",
            {
                "dest": "json_output",
                "action": "store_true",
                "help": "Output as JSON",
            },
        ),
    )
    for flag, options in option_table:
        mix_parser.add_argument(flag, **options)

    mix_parser.set_defaults(func=_run)


def _load_workload(path: str) -> WorkloadSpec:
    """Load a WorkloadSpec from a YAML file.

    Expected YAML format::

        name: "workload-a"
        benchmark: "path/to/benchmark.json"
        sla:
          ttft_p99_ms: 200
          tpot_p99_ms: 50
        min_prefill: 1
        min_decode: 1
        weight: 1.0

    Only ``benchmark`` is required. ``name`` defaults to the benchmark file's
    stem, ``sla`` to an ``SLAConfig`` built with no arguments, ``min_prefill``
    and ``min_decode`` to 1, and ``weight`` to 1.0.

    Args:
        path: Path to the workload spec YAML file. A relative ``benchmark``
            entry is resolved against this file's directory (an absolute
            entry is used as-is).

    Returns:
        The populated ``WorkloadSpec``.

    Raises:
        TypeError: If the YAML document is not a mapping (e.g. an empty file).
        KeyError: If the mapping has no ``benchmark`` key.
        OSError: If either the spec or the benchmark file cannot be read.
    """
    from xpyd_plan.benchmark_models import BenchmarkData
    from xpyd_plan.models import SLAConfig

    spec_path = Path(path)
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # which would mis-read non-ASCII content on e.g. Windows code pages.
    data = yaml.safe_load(spec_path.read_text(encoding="utf-8"))

    # safe_load returns None for an empty file and may return a list/scalar;
    # fail with a message that names the offending file instead of an opaque
    # subscript error further down.
    if not isinstance(data, dict):
        raise TypeError(
            f"{path}: workload spec must be a YAML mapping, "
            f"got {type(data).__name__}"
        )
    if "benchmark" not in data:
        raise KeyError(f"{path}: workload spec is missing required key 'benchmark'")

    bench_path = spec_path.parent / data["benchmark"]
    bench_data = BenchmarkData.model_validate_json(
        bench_path.read_text(encoding="utf-8")
    )

    # An explicit ``sla:`` with no value parses as None; treat it the same as
    # an absent key rather than crashing on ``**None``.
    sla_fields = data.get("sla")
    if sla_fields is None:
        sla_fields = {}
    sla = SLAConfig(**sla_fields)

    return WorkloadSpec(
        name=data.get("name", bench_path.stem),
        benchmark_data=bench_data,
        sla=sla,
        min_prefill=data.get("min_prefill", 1),
        min_decode=data.get("min_decode", 1),
        weight=data.get("weight", 1.0),
    )


def _print_table(result: MixOptimizationResult) -> None:
    """Render an optimization result for a human reader.

    Uses Rich when it can be imported: the summary is printed in bold and,
    for a feasible result, followed by a per-workload allocation table and
    the number of candidates evaluated. Without Rich, the summary and one
    plain-text line per allocation are written with ``print``.

    Args:
        result: The optimization outcome to display.
    """
    try:
        from rich.console import Console
        from rich.table import Table
    except ImportError:
        # Rich is unavailable: degrade to plain text.
        print(result.summary)
        for a in result.allocations:
            print(f" {a.name}: {a.ratio_str} waste={a.weighted_waste:.3f} sla={a.meets_sla}")
        return

    out = Console()
    out.print(f"\n[bold]{result.summary}[/bold]\n")

    if result.feasible:
        # (header, add_column keyword options), in display order.
        column_specs = (
            ("Workload", {"style": "cyan"}),
            ("P:D Ratio", {"style": "green"}),
            ("Instances", {"justify": "right"}),
            ("P Waste", {"justify": "right"}),
            ("D Waste", {"justify": "right"}),
            ("Weighted Waste", {"justify": "right"}),
            ("SLA Met", {"justify": "center"}),
        )
        grid = Table(title="Workload Allocations")
        for header, column_options in column_specs:
            grid.add_column(header, **column_options)

        for a in result.allocations:
            cells = (
                a.name,
                a.ratio_str,
                str(a.total_instances),
                f"{a.prefill_waste:.1%}",
                f"{a.decode_waste:.1%}",
                f"{a.weighted_waste:.4f}",
                "✅" if a.meets_sla else "❌",
            )
            grid.add_row(*cells)

        out.print(grid)
        out.print(f"\nCandidates evaluated: {result.candidates_evaluated}")


def _run(args: argparse.Namespace) -> None:
    """Handle the ``workload-mix`` subcommand.

    Loads every workload spec named on the command line, runs the optimizer
    in dedicated-allocation mode, and emits the result as JSON or as a table
    depending on ``args.json_output``.

    Args:
        args: Parsed CLI namespace; reads ``workload``, ``max_per_workload``,
            ``total_gpus`` and ``json_output``.

    Exits the process with status 1 when the result is not feasible.
    """
    specs: list[WorkloadSpec] = [
        _load_workload(spec_path) for spec_path in args.workload
    ]

    mix_optimizer = WorkloadMixOptimizer(
        max_instances_per_workload=args.max_per_workload,
    )
    outcome = mix_optimizer.optimize(
        specs,
        total_gpu_budget=args.total_gpus,
        mode=AllocationMode.DEDICATED,
    )

    if not args.json_output:
        _print_table(outcome)
    else:
        print(outcome.model_dump_json(indent=2))

    if outcome.feasible:
        return
    # Signal an infeasible mix to the calling shell.
    sys.exit(1)
Loading
Loading