From d7db0810c5f9ab21ff0bb64637f1a15c5c333bec Mon Sep 17 00:00:00 2001 From: Josh Date: Thu, 18 Jun 2026 22:46:41 -0500 Subject: [PATCH] [Example] Add Higgs-Audio-v3 TTS example for Tesla V100 (SM70) Adds a runnable real-time text-to-speech example for bosonai/higgs-audio-v3-tts-4b on V100, using the FLASH_ATTN_V100 backend and the Stage-0 FULL_DECODE_ONLY CUDA graph (low-latency) profile. Reaches RTF ~1.0 (~2.4x faster than the eager profile); the generated audio transcribes back to the input prompt. The CUDA graph path requires the SM70 decode kernel >= e64d39aa7 (this fork) and the vllm-omni talker capture fix (vllm-project/vllm-omni#4563); the README also documents the eager baseline, which needs neither. Signed-off-by: Josh --- .../multimodal/higgs_audio_v3/README.md | 35 +++++++++++ .../higgs_v100_low_latency.yaml | 47 +++++++++++++++ .../generate/multimodal/higgs_audio_v3/tts.py | 59 +++++++++++++++++++ 3 files changed, 141 insertions(+) create mode 100644 examples/generate/multimodal/higgs_audio_v3/README.md create mode 100644 examples/generate/multimodal/higgs_audio_v3/higgs_v100_low_latency.yaml create mode 100644 examples/generate/multimodal/higgs_audio_v3/tts.py diff --git a/examples/generate/multimodal/higgs_audio_v3/README.md b/examples/generate/multimodal/higgs_audio_v3/README.md new file mode 100644 index 000000000..3675d1786 --- /dev/null +++ b/examples/generate/multimodal/higgs_audio_v3/README.md @@ -0,0 +1,35 @@ +# Higgs-Audio-v3 TTS on Tesla V100 (SM70) + +Runs [`bosonai/higgs-audio-v3-tts-4b`](https://huggingface.co/bosonai/higgs-audio-v3-tts-4b) +text-to-speech on a single Tesla V100 with the `FLASH_ATTN_V100` backend and the +Stage-0 CUDA graph (low-latency) profile, reaching **real-time** generation +(RTF ~1.0 — about 2.4x faster than the eager profile). + +## Requirements + +- **1Cat-vLLM** with the SM70 decode CUDA graph kernel **>= `e64d39aa7`** + ("Stabilize SM70 Qwen MTP paths"). 
Earlier kernels cap the scalar-paged decode + workspace at the capture-time `seq_len`, so the talker CUDA graph replays a + short/stale KV span and produces incorrect audio. +- **vllm-omni** with the talker CUDA-graph-capture fix + ([vllm-project/vllm-omni#4563](https://github.com/vllm-project/vllm-omni/pull/4563)). + Without it, Stage-0 capture aborts with + `operation not permitted when stream is capturing`. + +## Run + +```bash +python examples/generate/multimodal/higgs_audio_v3/tts.py \ + --text "Hello! This is Higgs Audio version three, generating speech on a Tesla V100." \ + --deploy-config examples/generate/multimodal/higgs_audio_v3/higgs_v100_low_latency.yaml \ + --out higgs_out.wav +``` + +For the **eager baseline** (correct audio, no CUDA graph — works without the two +fixes above), set Stage-0 `enforce_eager: true` and remove the `compilation_config` block in the deploy config. + +## Notes + +- Stage 0 (talker) uses `FLASH_ATTN_V100` + `FULL_DECODE_ONLY` CUDA graph in + `float16`; Stage 1 (code2wav) stays `enforce_eager: true` in `float32`. +- Verified on a V100: the generated audio transcribes back to the input prompt. diff --git a/examples/generate/multimodal/higgs_audio_v3/higgs_v100_low_latency.yaml b/examples/generate/multimodal/higgs_audio_v3/higgs_v100_low_latency.yaml new file mode 100644 index 000000000..b9de9c712 --- /dev/null +++ b/examples/generate/multimodal/higgs_audio_v3/higgs_v100_low_latency.yaml @@ -0,0 +1,47 @@ +# Higgs-Audio-v3 low-latency (CUDA graph) deploy profile for Tesla V100 / SM70. +# +# Requires: +# * 1Cat-vLLM SM70 decode CUDA graph kernel >= e64d39aa7 +# * vllm-omni talker CUDA-graph-capture fix (vllm-project/vllm-omni#4563) +# +# For an eager baseline (no CUDA graph, no fixes needed) set Stage-0 +# `enforce_eager: true` and remove the `compilation_config` block.
+
+async_chunk: true
+
+stages:
+  - stage_id: 0  # talker (Qwen3-based) — real-time decode
+    max_num_seqs: 16
+    gpu_memory_utilization: 0.6
+    enforce_eager: false
+    dtype: float16
+    attention_backend: FLASH_ATTN_V100
+    trust_remote_code: true
+    enable_prefix_caching: true
+    compilation_config:
+      cudagraph_mode: FULL_DECODE_ONLY
+      cudagraph_capture_sizes: [1]
+      cudagraph_num_of_warmups: 1
+    max_num_batched_tokens: 4096
+    max_model_len: 8192
+    devices: "0"
+    default_sampling_params:
+      temperature: 1.0
+      top_p: 0.95
+      top_k: 50
+      max_tokens: 2048
+      seed: 42
+
+  - stage_id: 1  # code2wav (codec -> 24 kHz PCM)
+    max_num_seqs: 16
+    gpu_memory_utilization: 0.25
+    enforce_eager: true
+    dtype: float32
+    trust_remote_code: true
+    max_num_batched_tokens: 65536
+    max_model_len: 65536
+    devices: "0"
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
diff --git a/examples/generate/multimodal/higgs_audio_v3/tts.py b/examples/generate/multimodal/higgs_audio_v3/tts.py
new file mode 100644
index 000000000..1469b2ecf
--- /dev/null
+++ b/examples/generate/multimodal/higgs_audio_v3/tts.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Higgs-Audio-v3 text-to-speech on Tesla V100 (SM70).
+
+Runs ``bosonai/higgs-audio-v3-tts-4b`` with the ``FLASH_ATTN_V100`` backend and
+the Stage-0 CUDA graph (low-latency) profile, which reaches real-time generation
+on a single V100. See README.md for the kernel / vllm-omni requirements.
+
+Example:
+    python examples/generate/multimodal/higgs_audio_v3/tts.py \
+        --text "Hello from a Tesla V100." \
+        --deploy-config examples/generate/multimodal/higgs_audio_v3/higgs_v100_low_latency.yaml \
+        --out higgs_out.wav
+"""
+
+import argparse
+
+import numpy as np
+
+
+def main() -> None:
+    """Synthesize ``--text`` with Higgs-Audio-v3 and write the result as a WAV.
+
+    Parses the command line, builds the prompt token ids with the model's
+    tokenizer adapter, runs the ``Omni`` pipeline described by
+    ``--deploy-config`` and saves the generated audio to ``--out``.
+
+    The engine is closed even when generation or writing fails, so a failed
+    run does not leave the ``Omni`` instance open.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--text",
+        default="Hello! This is Higgs Audio version three, "
+                "generating speech on a Tesla V100.",
+    )
+    parser.add_argument("--model", default="bosonai/higgs-audio-v3-tts-4b")
+    parser.add_argument(
+        "--deploy-config",
+        required=True,
+        help="Stage deploy YAML (see higgs_v100_low_latency.yaml).",
+    )
+    parser.add_argument("--out", default="higgs_out.wav")
+    args = parser.parse_args()
+
+    # Imported after parse_args() so --help and argument errors exit before
+    # these packages are imported.
+    from transformers import AutoTokenizer
+
+    from vllm_omni.entrypoints.omni import Omni
+    from vllm_omni.model_executor.models.higgs_audio_v3.higgs_audio_v3_tokenizer import (
+        HiggsAudioV3TokenizerAdapter,
+    )
+
+    tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+    prompt_token_ids = HiggsAudioV3TokenizerAdapter(tok).build_prompt(args.text)
+
+    omni = Omni(model=args.model, deploy_config=args.deploy_config,
+                trust_remote_code=True)
+    try:
+        out = omni.generate([{"prompt_token_ids": prompt_token_ids}])[0]
+
+        mm = out.multimodal_output
+        # Flatten to a 1-D float32 NumPy array on the CPU for soundfile.
+        audio = mm.tensors["audio"].detach().cpu().float().numpy().reshape(-1)
+        sr = int(mm.tensors["sr"])
+        print(f"durations: {getattr(out, 'stage_durations', None)}")
+        # max(sr, 1) avoids a ZeroDivisionError if the reported rate is 0.
+        print(f"audio: {audio.size / max(sr, 1):.2f}s @ {sr} Hz")
+
+        if audio.size:
+            import soundfile as sf
+            sf.write(args.out, audio, sr)
+            print(f"wrote {args.out}")
+        else:
+            print(f"no audio generated; {args.out} not written")
+    finally:
+        # Always release the engine, including on errors above.
+        omni.close()
+
+
+if __name__ == "__main__":
+    main()