Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -1505,9 +1505,9 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be
- Programmatic `generate_sglang_commands()` API
- 25 new tests

### M112 🔄 TensorRT-LLM Benchmark Format Importer
### M112 TensorRT-LLM Benchmark Format Importer

*In progress — PR #TBD*
*Completed — PR #248*

- `TRTLLMImporter` module in `trtllm_import.py`
- `TRTLLMRequest`, `TRTLLMBenchmarkData`, `TRTLLMImportConfig`, `TRTLLMImportResult` Pydantic models
Expand All @@ -1517,3 +1517,16 @@ Help users find the **optimal Prefill:Decode instance ratio** based on **real be
- CLI `import --format trtllm` support
- Programmatic `import_trtllm()` and `import_trtllm_data()` API
- 25+ new tests

### M113 🔄 TensorRT-LLM Benchmark Command Generator

*In progress*

- `TRTLLMCommandGenerator` class in `trtllm_commands.py`
- `TRTLLMCommandConfig`, `TRTLLMServerCommand`, `TRTLLMBenchmarkCommand`, `TRTLLMCommandSet` Pydantic models
- Generate TRT-LLM engine build (`trtllm-build`) and server launch commands for each P:D ratio
- TRT-LLM specific options: max_batch_size, kv_cache_free_gpu_mem_fraction, pp_size, dtype, engine_dir
- Shell script output with engine build + server + benchmark lifecycle
- CLI `trtllm-commands` subcommand with table + JSON output
- Programmatic `generate_trtllm_commands()` API
- 29 new tests
5 changes: 3 additions & 2 deletions docs/iterations/current.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ The project has completed **110 milestones**, covering the full feature chain fr
- Closed-loop integration with xPyD-proxy auto-tuning
- Web UI dashboard (replacing TUI)
- Richer visualizations (interactive charts)
- Support additional benchmark tool formats (TensorRT-LLM)
- Support additional benchmark tool formats (TensorRT-LLM)

## Iteration History

Expand All @@ -64,4 +64,5 @@ The project has completed **110 milestones**, covering the full feature chain fr
| 3 | 2026-04-06 | M109 vLLM Benchmark Command Generator | ✅ merged | PR #242 |
| 4 | 2026-04-06 | M110 SGLang Benchmark Format Importer | ✅ merged | PR #244 |
| 5 | 2026-04-06 | M111 SGLang Benchmark Command Generator | ✅ merged | PR #246 |
| 6 | 2026-04-06 | M112 TensorRT-LLM Benchmark Format Importer | ⏳ pending review | Issue #247 |
| 6 | 2026-04-06 | M112 TensorRT-LLM Benchmark Format Importer | ✅ merged | PR #248, both bots approved |
| 7 | 2026-04-06 | M113 TensorRT-LLM Benchmark Command Generator | ⏳ pending review | Issue #249 |
8 changes: 8 additions & 0 deletions src/xpyd_plan/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1412,6 +1412,10 @@
from xpyd_plan.sglang_commands import SGLangCommandGenerator # noqa: E402
from xpyd_plan.sglang_commands import SGLangCommandSet # noqa: E402
from xpyd_plan.sglang_commands import generate_sglang_commands # noqa: E402
from xpyd_plan.trtllm_commands import TRTLLMCommandConfig # noqa: E402, I001
from xpyd_plan.trtllm_commands import TRTLLMCommandGenerator # noqa: E402
from xpyd_plan.trtllm_commands import TRTLLMCommandSet # noqa: E402
from xpyd_plan.trtllm_commands import generate_trtllm_commands # noqa: E402
from xpyd_plan.vllm_commands import BenchmarkCommand # noqa: E402
from xpyd_plan.vllm_commands import CommandGenerator # noqa: E402
from xpyd_plan.vllm_commands import CommandSet # noqa: E402
Expand All @@ -1430,6 +1434,10 @@
"SGLangCommandConfig",
"SGLangCommandGenerator",
"SGLangCommandSet",
"generate_trtllm_commands",
"TRTLLMCommandConfig",
"TRTLLMCommandGenerator",
"TRTLLMCommandSet",
]

from xpyd_plan.vllm_import import ( # noqa: E402
Expand Down
6 changes: 6 additions & 0 deletions src/xpyd_plan/cli/_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
from xpyd_plan.cli._token_budget import add_token_budget_parser
from xpyd_plan.cli._token_efficiency import add_token_efficiency_parser, handle_token_efficiency
from xpyd_plan.cli._trend import _cmd_trend
from xpyd_plan.cli._trtllm_commands import register_trtllm_commands
from xpyd_plan.cli._validate import _cmd_validate
from xpyd_plan.cli._variance import _cmd_variance, add_variance_parser
from xpyd_plan.cli._vllm_commands import register_vllm_commands
Expand Down Expand Up @@ -963,6 +964,7 @@ def main(argv: list[str] | None = None) -> None:
add_import_parser(subparsers)
register_vllm_commands(subparsers)
register_sglang_commands(subparsers)
register_trtllm_commands(subparsers)
add_rate_limit_parser(subparsers)
add_batch_analysis_parser(subparsers)
add_stat_summary_parser(subparsers)
Expand Down Expand Up @@ -1308,6 +1310,10 @@ def main(argv: list[str] | None = None) -> None:
from xpyd_plan.cli._sglang_commands import _cmd_sglang_commands

_cmd_sglang_commands(args)
elif args.command == "trtllm-commands":
from xpyd_plan.cli._trtllm_commands import _cmd_trtllm_commands

_cmd_trtllm_commands(args)
else:
parser.print_help()
sys.exit(1)
162 changes: 162 additions & 0 deletions src/xpyd_plan/cli/_trtllm_commands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
"""CLI trtllm-commands subcommand."""

from __future__ import annotations

import argparse
import json

from rich.console import Console
from rich.table import Table

from xpyd_plan.trtllm_commands import TRTLLMCommandConfig, TRTLLMCommandGenerator


def _cmd_trtllm_commands(args: argparse.Namespace) -> None:
    """Handle the 'trtllm-commands' subcommand.

    Builds a ``TRTLLMCommandConfig`` from the parsed CLI arguments, runs
    ``TRTLLMCommandGenerator.generate()`` and emits the resulting command
    sets in one of three forms:

    * ``--output-script PATH``: write a shell script to ``PATH`` and return.
    * ``--output-format json``: dump the command sets as JSON on stdout.
    * otherwise: print a Rich table with one row per command set.

    Args:
        args: Parsed CLI namespace. ``model``, ``total_instances`` and
            ``qps`` are read directly; every other option is read with a
            fallback default so a partially populated namespace also works.

    Raises:
        ValueError: If ``--qps`` contains a non-numeric entry or yields no
            QPS level at all.
    """
    console = Console()

    # Skip blank entries so stray separators such as "1,2," or "1,,2" do not
    # crash on float(""); float() itself already tolerates whitespace.
    qps_levels = [float(tok) for tok in args.qps.split(",") if tok.strip()]
    if not qps_levels:
        raise ValueError("--qps must contain at least one numeric QPS level")

    config = TRTLLMCommandConfig(
        model=args.model,
        total_instances=args.total_instances,
        qps_levels=qps_levels,
        tp_size=getattr(args, "tp_size", 1),
        pp_size=getattr(args, "pp_size", 1),
        max_batch_size=getattr(args, "max_batch_size", 256),
        max_input_len=getattr(args, "max_input_len", 2048),
        max_output_len=getattr(args, "max_output_len", 2048),
        kv_cache_free_gpu_mem_fraction=getattr(
            args, "kv_cache_free_gpu_mem_fraction", 0.9
        ),
        dtype=getattr(args, "dtype", "float16"),
        dataset=getattr(args, "dataset", None),
        num_prompts=getattr(args, "num_prompts", 1000),
        host=getattr(args, "host", "localhost"),
        port=getattr(args, "port", 8000),
        engine_dir=getattr(args, "engine_dir", "./engines"),
    )

    gen = TRTLLMCommandGenerator(config)
    result = gen.generate()

    # Read like the other optional arguments so a namespace without the
    # attribute falls through to table/JSON output instead of raising.
    output_script = getattr(args, "output_script", None)
    if output_script:
        script = _to_shell_script(result, config)
        # The script header contains a non-ASCII em dash, so pin the encoding
        # rather than depending on the platform/locale default.
        with open(output_script, "w", encoding="utf-8") as f:
            f.write(script)
        console.print(
            f"[green]Shell script written to {output_script} "
            f"({len(result)} ratios)[/green]"
        )
        return

    output_format = getattr(args, "output_format", "table")

    if output_format == "json":
        # default=str stringifies any value json cannot serialize natively.
        print(json.dumps([cs.model_dump() for cs in result], indent=2, default=str))
        return

    # Table output
    console.print("\n[bold]TensorRT-LLM Benchmark Commands[/bold]")
    console.print(
        f"Model: {config.model} | Instances: {config.total_instances} | "
        f"Ratios: {len(result)}\n"
    )

    table = Table(title="Benchmark Runs")
    table.add_column("P:D Ratio")
    table.add_column("Prefill", justify="right")
    table.add_column("Decode", justify="right")
    table.add_column("QPS Levels")

    for cs in result:
        table.add_row(
            cs.server.ratio,
            str(cs.server.prefill_instances),
            str(cs.server.decode_instances),
            ", ".join(f"{b.qps}" for b in cs.benchmarks),
        )

    console.print(table)
    console.print(
        "\n[dim]Use --output-script to generate an executable shell script[/dim]"
    )


def _to_shell_script(
    command_sets: list,
    config: TRTLLMCommandConfig,
) -> str:
    """Assemble the complete benchmark shell script as a single string.

    The text consists of, in order: a bash shebang and strict-mode line, a
    comment header built from ``config`` (model, total instances, QPS
    levels), a blank line, the ``script_snippet`` of every entry in
    ``command_sets``, and a closing completion ``echo``. All pieces are
    joined with newlines and the result ends with a trailing newline.

    Args:
        command_sets: Objects exposing a ``script_snippet`` string.
        config: Configuration supplying ``model``, ``total_instances`` and
            ``qps_levels`` for the header comments.

    Returns:
        The script text.
    """
    qps_text = ", ".join(map(str, config.qps_levels))
    header = [
        "#!/usr/bin/env bash",
        "set -euo pipefail",
        f"# TensorRT-LLM Benchmark Script — {config.model}",
        f"# Total instances: {config.total_instances}",
        f"# QPS levels: {qps_text}",
        "",
    ]
    snippets = [entry.script_snippet for entry in command_sets]
    footer = ["echo 'All benchmarks complete!'"]
    return "\n".join([*header, *snippets, *footer]) + "\n"


def register_trtllm_commands(subparsers: argparse._SubParsersAction) -> None:
    """Attach the ``trtllm-commands`` subcommand to *subparsers*.

    Declares the three required inputs (``--model``, ``--total-instances``,
    ``--qps``), the optional engine, server and benchmark settings with their
    defaults, and the output options. ``_cmd_trtllm_commands`` is stored as
    the ``func`` default on the new parser.

    Args:
        subparsers: The sub-parser collection to add the command to.
    """
    cmd = subparsers.add_parser(
        "trtllm-commands",
        help="Generate TensorRT-LLM benchmark commands for P:D ratio exploration",
    )

    # Required inputs.
    cmd.add_argument("--model", type=str, required=True, help="HuggingFace model name")
    cmd.add_argument(
        "--total-instances",
        type=int,
        required=True,
        help="Total instances (prefill + decode)",
    )
    cmd.add_argument(
        "--qps",
        type=str,
        required=True,
        help="Comma-separated QPS levels (e.g. 1,2,4)",
    )

    # Consecutive integer options share one shape: (flag, default, help).
    # Kept in this order so the generated --help listing is unchanged.
    int_options = (
        ("--tp-size", 1, "Tensor parallel size"),
        ("--pp-size", 1, "Pipeline parallel size"),
        ("--max-batch-size", 256, "Max batch size"),
        ("--max-input-len", 2048, "Max input length"),
        ("--max-output-len", 2048, "Max output length"),
    )
    for flag, default, help_text in int_options:
        cmd.add_argument(flag, type=int, default=default, help=help_text)

    cmd.add_argument(
        "--kv-cache-free-gpu-mem-fraction",
        type=float,
        default=0.9,
        help="KV cache GPU memory fraction",
    )
    cmd.add_argument(
        "--dtype",
        type=str,
        default="float16",
        choices=["float16", "bfloat16", "float32"],
        help="Data type",
    )

    # Benchmark workload and server endpoint.
    cmd.add_argument("--dataset", type=str, default=None, help="Dataset path")
    cmd.add_argument("--num-prompts", type=int, default=1000, help="Prompts per run")
    cmd.add_argument("--host", type=str, default="localhost", help="Server host")
    cmd.add_argument("--port", type=int, default=8000, help="Server port")
    cmd.add_argument(
        "--engine-dir",
        type=str,
        default="./engines",
        help="Engine output directory",
    )

    # Output selection.
    cmd.add_argument(
        "--output-script",
        type=str,
        default=None,
        help="Write shell script",
    )
    cmd.add_argument(
        "--output-format",
        choices=["table", "json"],
        default="table",
        help="Output format",
    )

    cmd.set_defaults(func=_cmd_trtllm_commands)
Loading
Loading