402 changes: 402 additions & 0 deletions inference/a4/single-host-serving/tensorrt-llm/README.md

Large diffs are not rendered by default.

67 changes: 67 additions & 0 deletions inference/a4/single-host-serving/tensorrt-llm/values.yaml
@@ -0,0 +1,67 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

queue:

dwsSettings:
  maxRunDurationSeconds:

huggingface:
  secretName: hf-secret
  secretData:
    token: "hf_api_token"

volumes:
  gcsVolumes: true
  ssdMountPath: "/ssd"
  gcsMounts:
    - bucketName:
      mountPath: "/gcs"

service:
  type: ClusterIP
  ports:
    http: 8000

workload:
  model:
    name:
  gpus: 8
  image:
  framework:
  configFile: serving-args.yaml
  configPath: /workload/configs
  envs:
    - name: HF_HUB_ENABLE_HF_TRANSFER
      value: "1"
    - name: LAUNCHER_SCRIPT
      value: "/workload/launcher/launch-workload.sh"
    - name: SERVER_ARGS_FILE
      value: "/workload/configs/serving-args.yaml"
    - name: HF_HOME
      value: "/ssd"
    - name: LD_LIBRARY_PATH
      value: "/usr/local/nvidia/lib64:/usr/local/lib/"
  benchmarks:
    experiments:
      - isl: 1024
        osl: 4096
        num_requests: 1000

network:
  subnetworks[]:
  gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.5
  ncclSettings:
    - name: NCCL_DEBUG
      value: "VERSION"
24 changes: 15 additions & 9 deletions inference/a4x/single-host-serving/tensorrt-llm/README.md
@@ -129,7 +129,7 @@ export CLUSTER_REGION=<REGION_of_your_cluster>
export CLUSTER_NAME=<YOUR_GKE_CLUSTER_NAME>
export KUEUE_NAME=<YOUR_KUEUE_NAME>
export GCS_BUCKET=<your-gcs-bucket-for-logs>
export TRTLLM_VERSION=1.3.0rc5

# Set the project for gcloud commands
gcloud config set project $PROJECT_ID
@@ -199,9 +199,15 @@ kubectl create secret generic hf-secret \

This recipe supports the following models. You can easily swap between them by changing the environment variables in the next step.

Running TRT-LLM inference benchmarks on these models is tested and validated only on A4X GKE nodes, and only for specific combinations of TP, PP, EP, GPU count, input and output sequence lengths, precision, and related settings.

The example model configuration YAML files in this repo cover only a particular combination of parallelism hyperparameters and settings for benchmarking purposes. Adjust the input and output lengths in `gpu-recipes/inference/a4x/single-host-serving/tensorrt-llm/values.yaml` to match your model and its configuration.
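For instance, a longer-generation experiment could be described with a block like the following in `values.yaml` (the numbers here are illustrative, not tuned recommendations):

```yaml
workload:
  benchmarks:
    experiments:
      - isl: 1024        # input sequence length, in tokens
        osl: 4096        # output sequence length, in tokens
        num_requests: 1000
```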

| Model Name | Hugging Face ID | Configuration File | Release Name Suffix |
| :--- | :--- | :--- | :--- |
| **DeepSeek-R1 671B** | `nvidia/DeepSeek-R1-NVFP4-v2` | `deepseek-r1-nvfp4.yaml` | `deepseek-r1` |
| **Llama 3.1 405B NVFP4** | `nvidia/Llama-3.1-405B-Instruct-NVFP4` | `llama-3.1-405b.yaml` | `llama-3-1-405b-nvfp4` |
| **Llama 3.1 405B FP8** | `meta-llama/Llama-3.1-405B-Instruct-FP8` | `llama-3.1-405b.yaml` | `llama-3-1-405b-fp8` |
| **Llama 3.1 70B** | `meta-llama/Llama-3.1-70B-Instruct` | `llama-3.1-70b.yaml` | `llama-3-1-70b` |
| **Llama 3.1 8B** | `meta-llama/Llama-3.1-8B-Instruct` | `llama-3.1-8b.yaml` | `llama-3-1-8b` |
| **Qwen 3 32B** | `Qwen/Qwen3-32B` | `qwen3-32b.yaml` | `qwen3-32b` |
@@ -223,10 +229,10 @@ The recipe uses [`trtllm-bench`](https://github.com/NVIDIA/TensorRT-LLM/blob/mai
1. **Configure model-specific variables.** Choose a model from the [table above](#supported-models) and set the variables:

```bash
# Example for DeepSeek-R1 NVFP4
export HF_MODEL_ID="nvidia/DeepSeek-R1-NVFP4-v2"
export CONFIG_FILE="deepseek-r1-nvfp4.yaml"
export RELEASE_NAME="$USER-serving-deepseek-r1"
```

2. **Install the helm chart:**
@@ -258,7 +264,7 @@ The recipe uses [`trtllm-bench`](https://github.com/NVIDIA/TensorRT-LLM/blob/mai

[Back to Top](#table-of-contents)

After the model is deployed via Helm as described in the sections [above](#run-the-recipe), use the following steps to monitor the deployment and interact with the model. Replace `<deployment-name>` and `<service-name>` with the appropriate names from the model-specific deployment instructions (e.g., `$USER-serving-deepseek-r1` and `$USER-serving-deepseek-r1-svc`).


<a name="check-status"></a>
@@ -268,7 +274,7 @@ Check the status of your deployment. Replace the name if you deployed a different model.

```bash
# Example for DeepSeek-R1 671B
kubectl get deployment/$USER-serving-deepseek-r1
```

Wait until the `READY` column shows `1/1`. If it shows `0/1`, the pod is still starting up.
@@ -282,7 +288,7 @@ Wait until the `READY` column shows `1/1`. If it shows `0/1`, the pod is still starting up.
To see the logs from the TRTLLM server (useful for debugging), use the `-f` flag to follow the log stream:

```bash
kubectl logs -f deployment/$USER-serving-deepseek-r1
```

You should see logs indicating that the model is being prepared and then the throughput benchmark running, similar to this:
4 changes: 2 additions & 2 deletions inference/a4x/single-host-serving/tensorrt-llm/values.yaml
@@ -51,8 +51,8 @@ workload:
      value: "/workload/configs/serving-args.yaml"
  benchmarks:
    experiments:
      - isl: 1024
        osl: 1024
        num_requests: 1000

network:
35 changes: 35 additions & 0 deletions src/frameworks/a4/trtllm-configs/deepseek-r1-nvfp4.yaml
@@ -0,0 +1,35 @@
tp_size: 4
ep_size: 4
pp_size: 1
backend: pytorch
kv_cache_free_gpu_mem_fraction: 0.85
llm_api_args:
  cuda_graph_config:
    batch_sizes:
      - 1
      - 2
      - 4
      - 8
      - 16
      - 20
      - 24
      - 32
      - 64
      - 96
      - 128
      - 160
      - 192
      - 256
      - 320
      - 384
      - 512
    enable_padding: true
  enable_attention_dp: true
  enable_chunked_prefill: true
  kv_cache_config:
    dtype: auto
    enable_block_reuse: false
    free_gpu_memory_fraction: 0.85
  moe_config:
    backend: CUTLASS
  print_iter_log: true
4 changes: 4 additions & 0 deletions src/frameworks/a4/trtllm-configs/qwen3-235b-a22b-nvfp4.yaml
@@ -0,0 +1,4 @@
tp_size: 1
pp_size: 1
backend: pytorch
kv_cache_free_gpu_mem_fraction: 0.90
4 changes: 4 additions & 0 deletions src/frameworks/a4/trtllm-configs/qwen3-32b.yaml
@@ -0,0 +1,4 @@
tp_size: 1
pp_size: 1
backend: pytorch
kv_cache_free_gpu_mem_fraction: 0.90
4 changes: 4 additions & 0 deletions src/frameworks/a4x/trtllm-configs/llama-3-1-405b.yaml
@@ -0,0 +1,4 @@
tp_size: 4
pp_size: 1
backend: pytorch
kv_cache_free_gpu_mem_fraction: 0.90
@@ -171,6 +171,8 @@ spec:
{{- end }}
- name: NCCL_PLUGIN_PATH
  value: /usr/local/gib/lib64
- name: LD_LIBRARY_PATH
  value: /usr/local/gib/lib64:/usr/local/nvidia/lib64
{{- if $root.Values.network.gibVersion }}
- name: NCCL_INIT_SCRIPT
  value: "/usr/local/gib/scripts/set_nccl_env.sh"
@@ -180,6 +182,8 @@
  value: "{{ $root.Values.workload.model.name }}"
- name: MODEL_DOWNLOAD_DIR
  value: "/ssd/{{ $root.Values.workload.model.name }}"
- name: TRTLLM_DIR
  value: "/app/tensorrt_llm"
{{- if $root.Values.workload.envs }}
{{- toYaml .Values.workload.envs | nindent 12 }}
{{- end }}
@@ -189,6 +193,7 @@
args:
- |
#!/bin/bash
pip install pyyaml hf_transfer

if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then
  echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}"
@@ -203,30 +208,46 @@
fi

ARGS=()
EXTRA_ARGS_FILE="/tmp/extra_llm_api_args.yaml"

# Use Python to parse the main config file, extract llm_api_args,
# and generate the command-line arguments.
python -c "
import yaml
import sys

args = []
llm_api_args = {}
config_file = sys.argv[1]
extra_args_file = sys.argv[2]

try:
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)

    if 'llm_api_args' in config:
        llm_api_args = config.pop('llm_api_args')
        with open(extra_args_file, 'w') as f:
            yaml.dump(llm_api_args, f)

    for key, value in config.items():
        if value is True:
            args.append(f'--{key}')
        elif value is not False:
            args.append(f'--{key}')
            args.append(str(value))

    # Print the arguments for the shell script to capture
    print(' '.join(args))

except Exception as e:
    print(f'Error parsing config file: {e}', file=sys.stderr)
    sys.exit(1)
" "$SERVER_ARGS_FILE" "$EXTRA_ARGS_FILE" > /tmp/launcher_args.txt

# Read the generated arguments into the ARGS array
mapfile -t ARGS < <(tr ' ' '\n' < /tmp/launcher_args.txt)
rm /tmp/launcher_args.txt
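The flag-generation logic in the launcher can be sketched in isolation. This is a minimal stand-alone version operating on an already-parsed dict (the launcher loads it from YAML with `yaml.safe_load` first); `split_config` is a hypothetical name used only for this illustration:

```python
def split_config(config):
    """Split a serving config dict into CLI args and extra llm_api_args."""
    config = dict(config)  # avoid mutating the caller's dict
    llm_api_args = config.pop("llm_api_args", {})  # written to a separate YAML file by the launcher
    args = []
    for key, value in config.items():
        if value is True:
            args.append(f"--{key}")                 # boolean flag, no value
        elif value is not False:
            args.extend([f"--{key}", str(value)])   # flag with value
        # value is False: omit the flag entirely
    return args, llm_api_args

example = {
    "tp_size": 4,
    "backend": "pytorch",
    "print_iter_log": True,
    "enable_block_reuse": False,
    "llm_api_args": {"enable_attention_dp": True},
}
args, extra = split_config(example)
print(args)   # ['--tp_size', '4', '--backend', 'pytorch', '--print_iter_log']
print(extra)  # {'enable_attention_dp': True}
```

Note that `False` values drop the flag entirely rather than emitting `--key false`, so the server's own defaults apply for disabled options.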

{{ if eq $root.Values.workload.framework "trtllm" }}
{{- range $root.Values.workload.benchmarks.experiments }}
3 changes: 2 additions & 1 deletion src/launchers/trtllm-launcher.sh
@@ -85,7 +85,7 @@ parse_serving_config() {

for ((index = 0; index < ${#SERVING_CONFIG[@]}; )); do
  current_arg="${SERVING_CONFIG[$index]}"
  next_arg=${SERVING_CONFIG[$((index + 1))]:-}

  # Handle --key=value format
  if [[ "$current_arg" =~ ^--[^=]+=.+ ]]; then
@@ -180,6 +180,7 @@ run_benchmark() {

if [[ $backend == "pytorch" ]]; then
  echo "Running throughput benchmark"
  export NCCL_P2P_LEVEL=PHB
  trtllm-bench \
    --model $model_name \
    --model_path /ssd/${model_name} throughput \