From d7db0810c5f9ab21ff0bb64637f1a15c5c333bec Mon Sep 17 00:00:00 2001
From: Josh <jajmangold@gmail.com>
Date: Thu, 18 Jun 2026 22:46:41 -0500
Subject: [PATCH] [Example] Add Higgs-Audio-v3 TTS example for Tesla V100
 (SM70)

Adds a runnable real-time text-to-speech example for
bosonai/higgs-audio-v3-tts-4b on V100, using the FLASH_ATTN_V100 backend and the
Stage-0 FULL_DECODE_ONLY CUDA graph (low-latency) profile. Reaches RTF ~1.0
(~2.4x faster than the eager profile); the generated audio transcribes back to
the input prompt.

The CUDA graph path requires the SM70 decode kernel >= e64d39aa7 (this fork) and
the vllm-omni talker capture fix (vllm-project/vllm-omni#4563); the README also
documents the eager baseline, which needs neither.

Signed-off-by: Josh <jajmangold@gmail.com>
---
 .../multimodal/higgs_audio_v3/README.md       | 35 +++++++++++
 .../higgs_v100_low_latency.yaml               | 47 +++++++++++++++
 .../generate/multimodal/higgs_audio_v3/tts.py | 59 +++++++++++++++++++
 3 files changed, 141 insertions(+)
 create mode 100644 examples/generate/multimodal/higgs_audio_v3/README.md
 create mode 100644 examples/generate/multimodal/higgs_audio_v3/higgs_v100_low_latency.yaml
 create mode 100644 examples/generate/multimodal/higgs_audio_v3/tts.py

diff --git a/examples/generate/multimodal/higgs_audio_v3/README.md b/examples/generate/multimodal/higgs_audio_v3/README.md
new file mode 100644
index 000000000..3675d1786
--- /dev/null
+++ b/examples/generate/multimodal/higgs_audio_v3/README.md
@@ -0,0 +1,35 @@
+# Higgs-Audio-v3 TTS on Tesla V100 (SM70)
+
+Runs [`bosonai/higgs-audio-v3-tts-4b`](https://huggingface.co/bosonai/higgs-audio-v3-tts-4b)
+text-to-speech on a single Tesla V100 with the `FLASH_ATTN_V100` backend and the
+Stage-0 CUDA graph (low-latency) profile, reaching **real-time** generation
+(real-time factor, RTF, ~1.0 — about 2.4x faster than the eager profile).
+
+## Requirements
+
+- **1Cat-vLLM** with the SM70 decode CUDA graph kernel **at or after commit `e64d39aa7`**
+  ("Stabilize SM70 Qwen MTP paths"). Earlier kernels cap the scalar-paged decode
+  workspace at the capture-time `seq_len`, so the talker CUDA graph replays a
+  short/stale KV span and produces incorrect audio.
+- **vllm-omni** with the talker CUDA-graph-capture fix
+  ([vllm-project/vllm-omni#4563](https://github.com/vllm-project/vllm-omni/pull/4563)).
+  Without it, Stage-0 capture aborts with
+  `operation not permitted when stream is capturing`.
+
+## Run
+
+```bash
+python examples/generate/multimodal/higgs_audio_v3/tts.py \
+    --text "Hello! This is Higgs Audio version three, generating speech on a Tesla V100." \
+    --deploy-config examples/generate/multimodal/higgs_audio_v3/higgs_v100_low_latency.yaml \
+    --out higgs_out.wav
+```
+
+For the **eager baseline** (correct audio, no CUDA graph — works without the two fixes above),
+set Stage-0 `enforce_eager: true` and remove its `compilation_config` block in the deploy config.
+
+## Notes
+
+- Stage 0 (talker) uses `FLASH_ATTN_V100` + `FULL_DECODE_ONLY` CUDA graph in
+  `float16`; Stage 1 (code2wav) stays `enforce_eager: true` in `float32`.
+- Verified on a V100: the generated audio transcribes back to the input prompt.
diff --git a/examples/generate/multimodal/higgs_audio_v3/higgs_v100_low_latency.yaml b/examples/generate/multimodal/higgs_audio_v3/higgs_v100_low_latency.yaml
new file mode 100644
index 000000000..b9de9c712
--- /dev/null
+++ b/examples/generate/multimodal/higgs_audio_v3/higgs_v100_low_latency.yaml
@@ -0,0 +1,47 @@
+# Higgs-Audio-v3 low-latency (CUDA graph) deploy profile for Tesla V100 / SM70.
+#
+# Requires:
+#   * 1Cat-vLLM SM70 decode CUDA graph kernel >= e64d39aa7
+#   * vllm-omni talker CUDA-graph-capture fix (vllm-project/vllm-omni#4563)
+#
+# For an eager baseline (no CUDA graph, no fixes needed) set Stage-0
+# `enforce_eager: true` and remove the `compilation_config` block.
+
+async_chunk: true
+
+stages:
+  - stage_id: 0                      # talker (Qwen3-based) — real-time decode
+    max_num_seqs: 16
+    gpu_memory_utilization: 0.6      # shares GPU 0 with stage 1 (0.6 + 0.25 = 0.85 total)
+    enforce_eager: false             # set to true for the eager baseline (see file header)
+    dtype: float16
+    attention_backend: FLASH_ATTN_V100
+    trust_remote_code: true
+    enable_prefix_caching: true
+    compilation_config:
+      cudagraph_mode: FULL_DECODE_ONLY
+      cudagraph_capture_sizes: [1]   # NOTE(review): only size 1 is captured while max_num_seqs is 16 — confirm how larger decode batches run
+      cudagraph_num_of_warmups: 1
+    max_num_batched_tokens: 4096
+    max_model_len: 8192
+    devices: "0"                     # same device as stage 1
+    default_sampling_params:
+      temperature: 1.0
+      top_p: 0.95
+      top_k: 50
+      max_tokens: 2048
+      seed: 42                       # fixed seed — presumably for reproducible sampling; confirm
+
+  - stage_id: 1                      # code2wav (codec -> 24 kHz PCM)
+    max_num_seqs: 16
+    gpu_memory_utilization: 0.25     # shares GPU 0 with stage 0 (0.6 + 0.25 = 0.85 total)
+    enforce_eager: true              # this stage is not CUDA-graph captured
+    dtype: float32
+    trust_remote_code: true
+    max_num_batched_tokens: 65536
+    max_model_len: 65536
+    devices: "0"                     # same device as stage 0
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
diff --git a/examples/generate/multimodal/higgs_audio_v3/tts.py b/examples/generate/multimodal/higgs_audio_v3/tts.py
new file mode 100644
index 000000000..1469b2ecf
--- /dev/null
+++ b/examples/generate/multimodal/higgs_audio_v3/tts.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Higgs-Audio-v3 text-to-speech on Tesla V100 (SM70).
+
+Runs ``bosonai/higgs-audio-v3-tts-4b`` with the ``FLASH_ATTN_V100`` backend and
+the Stage-0 CUDA graph (low-latency) profile, which reaches real-time generation
+on a single V100. See README.md for the kernel / vllm-omni requirements.
+
+Example:
+    python examples/generate/multimodal/higgs_audio_v3/tts.py \
+        --text "Hello from a Tesla V100." \
+        --deploy-config examples/generate/multimodal/higgs_audio_v3/higgs_v100_low_latency.yaml \
+        --out higgs_out.wav
+"""
+
+import argparse
+
+import numpy as np
+
+
+def main() -> None:
+    p = argparse.ArgumentParser()
+    p.add_argument("--text", default="Hello! This is Higgs Audio version three, "
+                                     "generating speech on a Tesla V100.")
+    p.add_argument("--model", default="bosonai/higgs-audio-v3-tts-4b")
+    p.add_argument("--deploy-config", required=True,
+                   help="Stage deploy YAML (see higgs_v100_low_latency.yaml).")
+    p.add_argument("--out", default="higgs_out.wav")
+    args = p.parse_args()
+
+    from transformers import AutoTokenizer
+
+    from vllm_omni.entrypoints.omni import Omni
+    from vllm_omni.model_executor.models.higgs_audio_v3.higgs_audio_v3_tokenizer import (
+        HiggsAudioV3TokenizerAdapter,
+    )
+
+    tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
+    prompt_token_ids = HiggsAudioV3TokenizerAdapter(tok).build_prompt(args.text)
+
+    omni = Omni(model=args.model, deploy_config=args.deploy_config,
+                trust_remote_code=True)
+    out = omni.generate([{"prompt_token_ids": prompt_token_ids}])[0]
+
+    mm = out.multimodal_output
+    audio = mm.tensors["audio"].detach().cpu().float().numpy().reshape(-1)
+    sr = int(mm.tensors["sr"])
+    print(f"durations: {getattr(out, 'stage_durations', None)}")
+    print(f"audio: {audio.size / max(sr, 1):.2f}s @ {sr} Hz")
+
+    if audio.size:
+        import soundfile as sf
+        sf.write(args.out, audio, sr)
+        print(f"wrote {args.out}")
+    omni.close()
+
+
+if __name__ == "__main__":
+    main()