xPyD-hub · hlin99 · Apr 6, 2026 · Apr 6, 2026
diff --git a/ROADMAP.md b/ROADMAP.md
@@ -1531,9 +1531,9 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be
 - Programmatic `generate_trtllm_commands()` API
 - 29 new tests
 
-### M114 🔄 Multi-Backend Comparison Report
+### M114 ✅ Multi-Backend Comparison Report
 
-*In progress*
+*Completed — PR #252*
 
 - `BackendComparator` class in `backend_compare.py`
 - `BackendComparisonConfig`, `BackendMetrics`, `BackendComparisonReport`, `BackendRanking`, `SLAResult` Pydantic models
@@ -1543,3 +1543,16 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be
 - CLI `compare-backends` subcommand with `--benchmark`, `--labels`, `--formats`, `--rank-by`, table + JSON output
 - Programmatic `compare_backends()` API
 - ~25 new tests
+
+### M115 🔄 Workload Mix Optimizer
+
+*In progress*
+
+- `WorkloadMixOptimizer` class in `workload_mix.py`
+- `WorkloadSpec`, `WorkloadAllocation`, `MixOptimizationResult` Pydantic models
+- Given benchmark data for multiple workloads (different models/request patterns), find the minimum total number of GPU instances that still meets every workload's SLA
+- Brute-force enumeration across all valid P:D allocations per workload
+- Support shared vs dedicated instance pools
+- CLI `workload-mix` subcommand with `--workload` (repeatable), `--total-gpus`, `--max-per-workload`, `--json`; table (default) or JSON output
+- Programmatic `optimize_workload_mix()` API
+- ~25 new tests
diff --git a/docs/iterations/current.md b/docs/iterations/current.md
@@ -66,4 +66,5 @@ The project has completed **110 milestones**, covering the full feature chain fr
 | 5 | 2026-04-06 | M111 SGLang Benchmark Command Generator | ✅ merged | PR #246 |
 | 6 | 2026-04-06 | M112 TensorRT-LLM Benchmark Format Importer | ✅ merged | PR #248, both bots approved |
 | 7 | 2026-04-06 | M113 TensorRT-LLM Benchmark Command Generator | ✅ merged | PR #250, both bots approved |
-| 8 | 2026-04-06 | M114 Multi-Backend Comparison Report | ⏳ pending review | Issue #251 |
+| 8 | 2026-04-06 | M114 Multi-Backend Comparison Report | ✅ merged | PR #252, both bots approved |
+| 9 | 2026-04-06 | M115 Workload Mix Optimizer | ⏳ pending review | Issue #253 |
diff --git a/src/xpyd_plan/__init__.py b/src/xpyd_plan/__init__.py
@@ -1517,3 +1517,21 @@
     "SLAResult",
     "compare_backends",
 ]
+
+from xpyd_plan.workload_mix import (  # noqa: E402
+    AllocationMode,
+    MixOptimizationResult,
+    WorkloadAllocation,
+    WorkloadMixOptimizer,
+    WorkloadSpec,
+    optimize_workload_mix,
+)
+
+__all__ += [
+    "AllocationMode",
+    "MixOptimizationResult",
+    "WorkloadAllocation",
+    "WorkloadMixOptimizer",
+    "WorkloadSpec",
+    "optimize_workload_mix",
+]
diff --git a/src/xpyd_plan/cli/_main.py b/src/xpyd_plan/cli/_main.py
@@ -105,6 +105,7 @@
 from xpyd_plan.cli._weighted_goodput import register as _register_weighted_goodput
 from xpyd_plan.cli._whatif import _cmd_what_if
 from xpyd_plan.cli._workload import _cmd_workload
+from xpyd_plan.cli._workload_mix import register as register_workload_mix
 
 
 def main(argv: list[str] | None = None) -> None:
@@ -967,6 +968,7 @@ def main(argv: list[str] | None = None) -> None:
     register_sglang_commands(subparsers)
     register_trtllm_commands(subparsers)
     register_compare_backends(subparsers)
+    register_workload_mix(subparsers)
     add_rate_limit_parser(subparsers)
     add_batch_analysis_parser(subparsers)
     add_stat_summary_parser(subparsers)
@@ -1320,6 +1322,10 @@ def main(argv: list[str] | None = None) -> None:
         from xpyd_plan.cli._compare_backends import _cmd_compare_backends
 
         _cmd_compare_backends(args)
+    elif args.command == "workload-mix":
+        from xpyd_plan.cli._workload_mix import _run as _cmd_workload_mix
+
+        _cmd_workload_mix(args)
     else:
         parser.print_help()
         sys.exit(1)
diff --git a/src/xpyd_plan/cli/_workload_mix.py b/src/xpyd_plan/cli/_workload_mix.py
@@ -0,0 +1,149 @@
+"""CLI subcommand for workload mix optimization."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+from xpyd_plan.workload_mix import (
+    AllocationMode,
+    MixOptimizationResult,
+    WorkloadMixOptimizer,
+    WorkloadSpec,
+)
+
+
+def register(subparsers: Any) -> None:
+    """Attach the ``workload-mix`` subcommand to *subparsers*.
+
+    The subcommand takes one or more ``--workload`` YAML specs, an optional
+    ``--total-gpus`` budget, a ``--max-per-workload`` cap and a ``--json``
+    switch, and stores ``_run`` as its ``func`` default.
+
+    Args:
+        subparsers: Object returned by ``ArgumentParser.add_subparsers()``.
+    """
+    mix_parser = subparsers.add_parser(
+        "workload-mix",
+        help="Optimize GPU allocation across multiple workloads",
+        description=(
+            "Given benchmark data for multiple workloads, find the minimum "
+            "total GPU instances while meeting per-workload SLA constraints."
+        ),
+    )
+
+    # (flag, add_argument keyword options); order is kept because it is the
+    # order in which the options appear in ``--help``.
+    option_specs: tuple[tuple[str, dict[str, Any]], ...] = (
+        (
+            "--workload",
+            {
+                "action": "append",
+                "required": True,
+                "metavar": "YAML",
+                "help": "Workload spec YAML file (repeatable, one per workload)",
+            },
+        ),
+        (
+            "--total-gpus",
+            {
+                "type": int,
+                "default": None,
+                "help": "Total GPU budget (default: unlimited)",
+            },
+        ),
+        (
+            "--max-per-workload",
+            {
+                "type": int,
+                "default": 32,
+                "help": "Max instances per workload role (default: 32)",
+            },
+        ),
+        (
+            "--json",
+            {
+                "dest": "json_output",
+                "action": "store_true",
+                "help": "Output as JSON",
+            },
+        ),
+    )
+    for flag, options in option_specs:
+        mix_parser.add_argument(flag, **options)
+
+    mix_parser.set_defaults(func=_run)
+
+
+def _load_workload(path: str) -> WorkloadSpec:
+    """Load a WorkloadSpec from a YAML file.
+
+    Expected YAML format:
+        name: "workload-a"
+        benchmark: "path/to/benchmark.json"
+        sla:
+          ttft_p99_ms: 200
+          tpot_p99_ms: 50
+        min_prefill: 1
+        min_decode: 1
+        weight: 1.0
+
+    Only ``benchmark`` is required. It is resolved against the directory that
+    contains the spec file (an absolute path is used as given). ``name``
+    defaults to the benchmark file's stem, ``sla`` to an ``SLAConfig`` built
+    with no arguments, ``min_prefill``/``min_decode`` to 1 and ``weight`` to
+    1.0.
+
+    Args:
+        path: Location of the workload spec YAML file.
+
+    Returns:
+        The WorkloadSpec built from the spec and its benchmark file.
+
+    Raises:
+        TypeError: If the YAML document is not a mapping (e.g. an empty file).
+        KeyError: If the mapping has no ``benchmark`` entry.
+    """
+    from xpyd_plan.benchmark_models import BenchmarkData
+    from xpyd_plan.models import SLAConfig
+
+    spec_path = Path(path)
+    # Decode explicitly as UTF-8 so parsing does not depend on the locale encoding.
+    data = yaml.safe_load(spec_path.read_text(encoding="utf-8"))
+    if not isinstance(data, dict):
+        # safe_load returns None for an empty document and a list/scalar for
+        # non-mapping ones; fail with the file name instead of an opaque
+        # "not subscriptable" error further down.
+        raise TypeError(
+            f"{path}: workload spec must be a YAML mapping, got {type(data).__name__}"
+        )
+    if "benchmark" not in data:
+        raise KeyError(f"{path}: workload spec is missing required key 'benchmark'")
+
+    bench_path = spec_path.parent / data["benchmark"]
+    bench_data = BenchmarkData.model_validate_json(bench_path.read_text(encoding="utf-8"))
+    # A bare ``sla:`` key parses to None; treat it like an omitted section.
+    sla = SLAConfig(**(data.get("sla") or {}))
+    return WorkloadSpec(
+        name=data.get("name", bench_path.stem),
+        benchmark_data=bench_data,
+        sla=sla,
+        min_prefill=data.get("min_prefill", 1),
+        min_decode=data.get("min_decode", 1),
+        weight=data.get("weight", 1.0),
+    )
+
+
+def _print_table(result: MixOptimizationResult) -> None:
+    """Print an optimization result as a Rich table, or as plain text without Rich.
+
+    The summary line is always printed. Per-workload rows are printed only
+    for a feasible result, in both the Rich and the plain-text rendering.
+
+    Args:
+        result: Optimization result to display.
+    """
+    try:
+        from rich.console import Console
+        from rich.table import Table
+    except ImportError:
+        # Rich is not installed: fall back to plain text.
+        print(result.summary)
+        if not result.feasible:
+            # Match the Rich path, which shows only the summary when infeasible.
+            return
+        for a in result.allocations:
+            print(f"  {a.name}: {a.ratio_str} waste={a.weighted_waste:.3f} sla={a.meets_sla}")
+        return
+
+    console = Console()
+    console.print(f"\n[bold]{result.summary}[/bold]\n")
+
+    if not result.feasible:
+        return
+
+    table = Table(title="Workload Allocations")
+    table.add_column("Workload", style="cyan")
+    table.add_column("P:D Ratio", style="green")
+    table.add_column("Instances", justify="right")
+    table.add_column("P Waste", justify="right")
+    table.add_column("D Waste", justify="right")
+    table.add_column("Weighted Waste", justify="right")
+    table.add_column("SLA Met", justify="center")
+
+    for a in result.allocations:
+        table.add_row(
+            a.name,
+            a.ratio_str,
+            str(a.total_instances),
+            f"{a.prefill_waste:.1%}",
+            f"{a.decode_waste:.1%}",
+            f"{a.weighted_waste:.4f}",
+            "✅" if a.meets_sla else "❌",
+        )
+
+    console.print(table)
+    console.print(f"\nCandidates evaluated: {result.candidates_evaluated}")
+
+
+def _run(args: argparse.Namespace) -> None:
+    """Run the ``workload-mix`` subcommand.
+
+    Loads every ``--workload`` spec, optimizes them in dedicated allocation
+    mode, and prints the result as JSON (``--json``) or as a table. Exits
+    with status 1 when the result is not feasible.
+
+    Args:
+        args: Parsed CLI arguments; reads ``workload``, ``max_per_workload``,
+            ``total_gpus`` and ``json_output``.
+    """
+    specs = [_load_workload(spec_path) for spec_path in args.workload]
+
+    mix_result = WorkloadMixOptimizer(
+        max_instances_per_workload=args.max_per_workload,
+    ).optimize(
+        specs,
+        total_gpu_budget=args.total_gpus,
+        mode=AllocationMode.DEDICATED,
+    )
+
+    if not args.json_output:
+        _print_table(mix_result)
+    else:
+        print(mix_result.model_dump_json(indent=2))
+
+    if mix_result.feasible:
+        return
+    # Output has already been printed; the exit status signals infeasibility.
+    sys.exit(1)