diff --git a/validators/performance/nccl_aks_utils.go b/validators/performance/nccl_aks_utils.go
new file mode 100644
index 000000000..22e87190e
--- /dev/null
+++ b/validators/performance/nccl_aks_utils.go
@@ -0,0 +1,154 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+ "context"
+ "fmt"
+ "log/slog"
+
+ "github.com/NVIDIA/aicr/pkg/defaults"
+ aicrErrors "github.com/NVIDIA/aicr/pkg/errors"
+ v1 "k8s.io/api/core/v1"
+ apierrors "k8s.io/apimachinery/pkg/api/errors"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/client-go/kubernetes"
+)
+
+// Names shared between the AKS NCCL validator and its cleanup path.
+const (
+	// ncclTopoConfigMapName is the ConfigMap that carries the NCCL topology
+	// XML for AKS runs; cleanupNCCLResources deletes it afterwards.
+	ncclTopoConfigMapName = "nccl-all-reduce-topo"
+
+	// mlnxNICResource is the Kubernetes extended resource name under which
+	// the NVIDIA Network Operator device plugin exposes Mellanox InfiniBand
+	// NICs.
+	mlnxNICResource = v1.ResourceName("nvidia.com/mlnxnics")
+)
+
+// discoverAKSNodeConfig reports how many Mellanox NICs the given GPU node
+// advertises as allocatable under mlnxNICResource. Zero is a legitimate
+// answer: the Network Operator may not be deployed, yet NCCL still uses IB
+// through the OFED kernel drivers.
+func discoverAKSNodeConfig(node v1.Node) int {
+	// Indexing a missing key yields the zero Quantity, whose Value() is 0.
+	nics := node.Status.Allocatable[mlnxNICResource]
+	nicCount := nics.Value()
+	return int(nicCount)
+}
+
+// buildMLNXResourceLine renders the nvidia.com/mlnxnics YAML entry for a
+// resource requests/limits block, prefixed with indent so it lands at the
+// right depth in the template.
+//
+// It returns an empty string when count is zero or negative, so the
+// template placeholder collapses to a blank line instead of emitting a
+// zero or negative resource quantity (same graceful-degradation pattern as
+// buildEFAResourceLine).
+func buildMLNXResourceLine(count int, indent string) string {
+	// Only a positive NIC count is a meaningful resource request.
+	if count <= 0 {
+		return ""
+	}
+	return fmt.Sprintf("%snvidia.com/mlnxnics: \"%d\"", indent, count)
+}
+
+// ndv5TopoXML is the NCCL topology XML for Azure ND H100 v5 / ND H200 v5
+// VMs. Describes the PCIe Gen5 topology: 2 NUMA nodes (Intel Sapphire
+// Rapids), 4 GPU+NIC pairs per NUMA. Each GPU (class 0x030200) is paired
+// with a ConnectX-7 NIC (class 0x020700) under a PCIe bridge at 32 GT/s x16.
+//
+// Source: excalibur (azure-h100.xml) and nccl-doctor (ndv5-topo.xml) —
+// both describe the identical ND H100 v5 hardware topology.
+const ndv5TopoXML = `
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+`
+
+// createTopoConfigMap publishes ndv5TopoXML as the "topo.xml" key of the
+// ncclTopoConfigMapName ConfigMap in namespace. The AKS runtime template
+// mounts that ConfigMap at /etc/nccl so NCCL reads /etc/nccl/topo.xml
+// instead of auto-discovering the topology.
+//
+// The call has create-or-update semantics: an AlreadyExists response to
+// Create is followed by an Update with the same object. Both requests share
+// one context derived from ctx and bounded by defaults.DiagnosticTimeout.
+// Failures are returned wrapped with aicrErrors.ErrCodeInternal.
+func createTopoConfigMap(ctx context.Context, clientset kubernetes.Interface, namespace string) error {
+	slog.Info("Creating NCCL topology ConfigMap", "name", ncclTopoConfigMapName, "namespace", namespace)
+
+	opCtx, cancel := context.WithTimeout(ctx, defaults.DiagnosticTimeout)
+	defer cancel()
+
+	configMaps := clientset.CoreV1().ConfigMaps(namespace)
+	topoCM := &v1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ncclTopoConfigMapName,
+			Namespace: namespace,
+		},
+		Data: map[string]string{"topo.xml": ndv5TopoXML},
+	}
+
+	_, createErr := configMaps.Create(opCtx, topoCM, metav1.CreateOptions{})
+	switch {
+	case createErr == nil:
+		return nil
+	case !apierrors.IsAlreadyExists(createErr):
+		return aicrErrors.Wrap(aicrErrors.ErrCodeInternal, "failed to create NCCL topology ConfigMap", createErr)
+	}
+
+	// The name is already taken (a prior run may have left a stale
+	// ConfigMap behind), so overwrite it in place.
+	if _, updateErr := configMaps.Update(opCtx, topoCM, metav1.UpdateOptions{}); updateErr != nil {
+		return aicrErrors.Wrap(aicrErrors.ErrCodeInternal, "failed to update NCCL topology ConfigMap", updateErr)
+	}
+	slog.Info("Updated existing NCCL topology ConfigMap", "name", ncclTopoConfigMapName)
+	return nil
+}
+
+// deleteTopoConfigMap removes the NCCL topology ConfigMap from namespace on
+// a best-effort basis: it returns nothing and only logs the outcome. The
+// delete runs under its own background context bounded by
+// defaults.DiagnosticTimeout.
+//
+// NotFound is expected for non-AKS platforms and is logged at debug. Any
+// other failure is logged as a warning, matching how cleanupNCCLResources
+// reports a failed ComputeDomain delete.
+func deleteTopoConfigMap(clientset kubernetes.Interface, namespace string) {
+	deleteCtx, cancel := context.WithTimeout(context.Background(), defaults.DiagnosticTimeout)
+	defer cancel()
+
+	err := clientset.CoreV1().ConfigMaps(namespace).Delete(deleteCtx, ncclTopoConfigMapName, metav1.DeleteOptions{})
+	switch {
+	case err == nil:
+		slog.Info("Deleted NCCL topology ConfigMap", "name", ncclTopoConfigMapName)
+	case apierrors.IsNotFound(err):
+		slog.Debug("NCCL topology ConfigMap not present (non-AKS platform), skipping", "name", ncclTopoConfigMapName)
+	default:
+		// Cleanup failures are non-fatal: log at Warn (not Error) and let
+		// the level carry the severity instead of a "Warning:" prefix.
+		slog.Warn("failed to delete NCCL topology ConfigMap", "error", err, "name", ncclTopoConfigMapName)
+	}
+}
diff --git a/validators/performance/nccl_aks_utils_test.go b/validators/performance/nccl_aks_utils_test.go
new file mode 100644
index 000000000..aa1e423ef
--- /dev/null
+++ b/validators/performance/nccl_aks_utils_test.go
@@ -0,0 +1,186 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+ "encoding/xml"
+ "strings"
+ "testing"
+
+ v1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/api/resource"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// TestDiscoverAKSNodeConfig checks the Mellanox NIC count read from a node's
+// allocatable resources, including nodes that do not advertise the resource.
+func TestDiscoverAKSNodeConfig(t *testing.T) {
+	// GPU product label carried by the H100 fixtures.
+	h100Labels := map[string]string{
+		"nvidia.com/gpu.product": "NVIDIA-H100-80GB-HBM3",
+	}
+
+	cases := []struct {
+		name        string
+		labels      map[string]string
+		allocatable v1.ResourceList
+		want        int
+	}{
+		{
+			name:   "ND H100 v5 with 8 Mellanox NICs",
+			labels: h100Labels,
+			allocatable: v1.ResourceList{
+				v1.ResourceName("nvidia.com/gpu"):      resource.MustParse("8"),
+				v1.ResourceName("nvidia.com/mlnxnics"): resource.MustParse("8"),
+			},
+			want: 8,
+		},
+		{
+			name:   "no mlnxnics (Network Operator not deployed)",
+			labels: h100Labels,
+			allocatable: v1.ResourceList{
+				v1.ResourceName("nvidia.com/gpu"): resource.MustParse("8"),
+			},
+			want: 0,
+		},
+		{
+			name:        "empty allocatable",
+			labels:      map[string]string{},
+			allocatable: v1.ResourceList{},
+			want:        0,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			node := v1.Node{
+				ObjectMeta: metav1.ObjectMeta{Labels: tc.labels},
+				Status:     v1.NodeStatus{Allocatable: tc.allocatable},
+			}
+			if got := discoverAKSNodeConfig(node); got != tc.want {
+				t.Errorf("discoverAKSNodeConfig() = %d, want %d", got, tc.want)
+			}
+		})
+	}
+}
+
+// TestBuildMLNXResourceLine pairs a NIC count and indent with the exact YAML
+// line buildMLNXResourceLine is expected to produce.
+func TestBuildMLNXResourceLine(t *testing.T) {
+	for _, tc := range []struct {
+		name   string
+		count  int
+		indent string
+		want   string
+	}{
+		{name: "8 Mellanox NICs", count: 8, indent: " ", want: ` nvidia.com/mlnxnics: "8"`},
+		{name: "4 Mellanox NICs", count: 4, indent: " ", want: ` nvidia.com/mlnxnics: "4"`},
+		{name: "no NICs — empty string", count: 0, indent: " ", want: ""},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			got := buildMLNXResourceLine(tc.count, tc.indent)
+			if got == tc.want {
+				return
+			}
+			t.Errorf("buildMLNXResourceLine(%d) = %q, want %q", tc.count, got, tc.want)
+		})
+	}
+}
+
+// Minimal mirror of the NCCL topology XML, used to validate the ndv5TopoXML
+// constant. Only the elements and attributes the tests inspect are mapped.
+type (
+	// xmlSystem is the root <system> element with its version attribute
+	// and <cpu> children.
+	xmlSystem struct {
+		XMLName xml.Name `xml:"system"`
+		Version string   `xml:"version,attr"`
+		CPUs    []xmlCPU `xml:"cpu"`
+	}
+
+	// xmlCPU is a <cpu> element: its numaid attribute and direct <pci>
+	// children.
+	xmlCPU struct {
+		NumaID string   `xml:"numaid,attr"`
+		PCIs   []xmlPCI `xml:"pci"`
+	}
+
+	// xmlPCI is a <pci> element; nested <pci> elements are decoded
+	// recursively into Children.
+	xmlPCI struct {
+		BusID    string   `xml:"busid,attr"`
+		Class    string   `xml:"class,attr"`
+		Children []xmlPCI `xml:"pci"`
+	}
+)
+
+// TestNdv5TopoXML parses ndv5TopoXML and checks its shape: version "1",
+// 2 <cpu> elements with 4 bridges each, and 8 GPU-class plus 8 NIC-class
+// devices in total beneath those bridges.
+func TestNdv5TopoXML(t *testing.T) {
+	var topo xmlSystem
+	err := xml.Unmarshal([]byte(ndv5TopoXML), &topo)
+	if err != nil {
+		t.Fatalf("ndv5TopoXML is not valid XML: %v", err)
+	}
+
+	if got := topo.Version; got != "1" {
+		t.Errorf("system version = %q, want %q", got, "1")
+	}
+
+	if n := len(topo.CPUs); n != 2 {
+		t.Fatalf("expected 2 NUMA CPUs, got %d", n)
+	}
+
+	// Walk NUMA node -> PCIe bridge -> device, tallying devices by the
+	// prefix of their PCI class code.
+	var gpuCount, nicCount int
+	for i := range topo.CPUs {
+		numa := &topo.CPUs[i]
+		if n := len(numa.PCIs); n != 4 {
+			t.Errorf("NUMA %s: expected 4 PCIe bridges, got %d", numa.NumaID, n)
+		}
+		for j := range numa.PCIs {
+			br := &numa.PCIs[j]
+			// 0x060400 is the PCI-to-PCI bridge class.
+			if br.Class != "0x060400" {
+				t.Errorf("bridge %s: class = %q, want 0x060400", br.BusID, br.Class)
+			}
+			for _, dev := range br.Children {
+				if strings.HasPrefix(dev.Class, "0x0302") {
+					gpuCount++
+				} else if strings.HasPrefix(dev.Class, "0x0207") {
+					nicCount++
+				}
+			}
+		}
+	}
+
+	if gpuCount != 8 {
+		t.Errorf("expected 8 GPUs in topology, got %d", gpuCount)
+	}
+	if nicCount != 8 {
+		t.Errorf("expected 8 NICs in topology, got %d", nicCount)
+	}
+}
diff --git a/validators/performance/nccl_all_reduce_bw_constraint.go b/validators/performance/nccl_all_reduce_bw_constraint.go
index 93861f0b7..1c054204e 100644
--- a/validators/performance/nccl_all_reduce_bw_constraint.go
+++ b/validators/performance/nccl_all_reduce_bw_constraint.go
@@ -152,6 +152,7 @@ var supportedNCCLCombinations = map[ncclVariant]map[recipe.CriteriaServiceType][
variantDefault: {
recipe.CriteriaServiceEKS: {recipe.CriteriaAcceleratorH100},
recipe.CriteriaServiceGKE: {recipe.CriteriaAcceleratorH100},
+ recipe.CriteriaServiceAKS: {recipe.CriteriaAcceleratorH100},
recipe.CriteriaServiceAny: {recipe.CriteriaAcceleratorB200, recipe.CriteriaAcceleratorGB200},
},
variantNET: {
@@ -297,7 +298,7 @@ func runNCCLTrainJob(ctx *validators.Context, gpuConfig *gpuConfiguration,
if applyErr := applyNCCLResources(ctx, dynamicClient, gpuConfig, accelerator, service, variant); applyErr != nil {
return "", aicrErrors.Wrap(aicrErrors.ErrCodeInternal, "failed to apply NCCL resources", applyErr)
}
- defer cleanupNCCLResources(dynamicClient, gpuConfig.Namespace)
+ defer cleanupNCCLResources(dynamicClient, ctx.Clientset, gpuConfig.Namespace)
podHelper := &helper.PodLifecycle{
ClientSet: ctx.Clientset,
@@ -575,6 +576,27 @@ func applyNCCLResources(ctx *validators.Context, dynamicClient dynamic.Interface
slog.Info("Discovered GKE GPU NIC networks", "count", len(gpuNICs), "networks", gpuNICs)
}
+ // For AKS, discover Mellanox NIC count and create NCCL topology ConfigMap.
+ // mlnxnics count of 0 is valid — Network Operator may not be deployed, but
+ // NCCL still uses IB via OFED kernel drivers.
+ if service == recipe.CriteriaServiceAKS {
+ mlnxCount := discoverAKSNodeConfig(config.Nodes[0])
+ // Indentation matches the resource block position in runtime.yaml.
+ const mlnxIndent = " "
+ templateData["MLNX_RESOURCE_LIMITS"] = buildMLNXResourceLine(mlnxCount, mlnxIndent)
+ templateData["MLNX_RESOURCE_REQUESTS"] = buildMLNXResourceLine(mlnxCount, mlnxIndent)
+ templateData["TOPO_CONFIGMAP_NAME"] = ncclTopoConfigMapName
+ if mlnxCount > 0 {
+ slog.Info("Discovered AKS Mellanox NIC configuration", "mlnxnics", mlnxCount)
+ } else {
+ slog.Warn("No nvidia.com/mlnxnics found — Network Operator may not be deployed",
+ "note", "NCCL will still use IB via OFED kernel drivers")
+ }
+ if err := createTopoConfigMap(ctx.Ctx, ctx.Clientset, config.Namespace); err != nil {
+ return aicrErrors.Wrap(aicrErrors.ErrCodeInternal, "failed to create NCCL topology ConfigMap", err)
+ }
+ }
+
// For EKS, discover instance type and EFA adapter count from GPU nodes.
// EFA count of 0 is valid — NCCL falls back to TCP (slower but functional).
if service == recipe.CriteriaServiceEKS {
@@ -1134,12 +1156,10 @@ func verifyTransportFromLogs(logs string, variant ncclVariant) error {
}
}
-// cleanupNCCLResources removes the trainjob, runtime, and (if present) the
-// ComputeDomain CR using the dynamic client. Deleting the ComputeDomain
-// cascades to its auto-generated ResourceClaimTemplate via the DRA driver;
-// NotFound on the ComputeDomain is expected for the default/NET variants
-// and is logged at debug rather than error.
-func cleanupNCCLResources(dynamicClient dynamic.Interface, namespace string) {
+// cleanupNCCLResources removes the trainjob, runtime, topo ConfigMap, and
+// (if present) the ComputeDomain CR. NotFound on the ComputeDomain and topo
+// ConfigMap is expected for non-NVLS and non-AKS platforms respectively.
+func cleanupNCCLResources(dynamicClient dynamic.Interface, clientset kubernetes.Interface, namespace string) {
slog.Info("Cleaning up NCCL test resources...")
cleanupCtx, cancel := context.WithTimeout(context.Background(), defaults.DiagnosticTimeout)
@@ -1173,4 +1193,8 @@ func cleanupNCCLResources(dynamicClient dynamic.Interface, namespace string) {
default:
slog.Warn("failed to delete ComputeDomain", "error", err, "name", ncclComputeDomainName)
}
+
+ // Delete NCCL topology ConfigMap if this was AKS. NotFound is expected
+ // for non-AKS platforms.
+ deleteTopoConfigMap(clientset, namespace)
}
diff --git a/validators/performance/nccl_test.go b/validators/performance/nccl_test.go
index 597a9f146..e80b5762c 100644
--- a/validators/performance/nccl_test.go
+++ b/validators/performance/nccl_test.go
@@ -413,6 +413,12 @@ func TestPlatformWorkerScheduling(t *testing.T) {
t.Errorf("GKE tolerations count = %d, want 2", len(tols))
}
})
+ t.Run("AKS returns nil (IB auto-detected)", func(t *testing.T) {
+ ns, tols := platformWorkerScheduling(recipe.CriteriaServiceAKS, "")
+ if ns != nil || tols != nil {
+ t.Errorf("AKS service should return nil, got ns=%v tols=%v", ns, tols)
+ }
+ })
t.Run("unknown service returns nil", func(t *testing.T) {
ns, tols := platformWorkerScheduling("unknown", "")
if ns != nil || tols != nil {
@@ -476,6 +482,14 @@ func TestTemplatePath(t *testing.T) {
filename: "runtime.yaml",
expected: filepath.Join("testdata", "gb200", "any", "runtime.yaml"),
},
+ {
+ name: "aks h100 runtime default",
+ accelerator: recipe.CriteriaAcceleratorH100,
+ service: recipe.CriteriaServiceAKS,
+ variant: variantDefault,
+ filename: "runtime.yaml",
+ expected: filepath.Join("testdata", "h100", "aks", "runtime.yaml"),
+ },
{
name: "gb200 eks NET variant",
accelerator: recipe.CriteriaAcceleratorGB200,
@@ -666,6 +680,9 @@ func TestSupportedNCCLCombinations_Variants(t *testing.T) {
if accels := supportedNCCLCombinations[variantDefault][recipe.CriteriaServiceEKS]; len(accels) != 1 || accels[0] != recipe.CriteriaAcceleratorH100 {
t.Errorf("variantDefault EKS = %v, want [H100]", accels)
}
+ if accels := supportedNCCLCombinations[variantDefault][recipe.CriteriaServiceAKS]; len(accels) != 1 || accels[0] != recipe.CriteriaAcceleratorH100 {
+ t.Errorf("variantDefault AKS = %v, want [H100]", accels)
+ }
if accels := supportedNCCLCombinations[variantDefault][recipe.CriteriaServiceAny]; len(accels) != 2 {
t.Errorf("variantDefault Any count = %d, want 2 (B200, GB200)", len(accels))
}
diff --git a/validators/performance/testdata/h100/aks/ndv5-topo.xml b/validators/performance/testdata/h100/aks/ndv5-topo.xml
new file mode 100644
index 000000000..fd970cb9f
--- /dev/null
+++ b/validators/performance/testdata/h100/aks/ndv5-topo.xml
@@ -0,0 +1,48 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/validators/performance/testdata/h100/aks/runtime.yaml b/validators/performance/testdata/h100/aks/runtime.yaml
new file mode 100644
index 000000000..cd573f03a
--- /dev/null
+++ b/validators/performance/testdata/h100/aks/runtime.yaml
@@ -0,0 +1,205 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# AKS NCCL All-Reduce TrainingRuntime — InfiniBand (ND H100 v5 / ND H200 v5).
+#
+# Azure ND-series GPU VMs use InfiniBand for GPU-to-GPU communication via
+# NVIDIA ConnectX-7 NDR adapters (8× 400 Gb/s per node). IB is kernel-native
+# through the Mellanox OFED (MOFED) driver stack — no userspace sidecar
+# (unlike GKE TCPXO) and no cloud-specific NCCL plugin (unlike EKS EFA).
+#
+# NCCL auto-detects IB HCAs and routes traffic over them by default. Key
+# tuning from excalibur/nccl-doctor references:
+# - NCCL_IB_PCI_RELAXED_ORDERING=1: required for IB perf on Azure ND-series
+# - NCCL_SOCKET_IFNAME=eth0: keeps NCCL OOB control traffic off IB fabric
+# - No custom MCA btl/oob settings: Azure IB uses default OpenMPI transport
+# (unlike EKS EFA which needs btl ^openib). Excalibur only sets plm_rsh_args.
+# - IPC_LOCK capability for RDMA memory registration
+#
+# - NCCL_TOPO_FILE=/etc/nccl/topo.xml: ndv5-topo.xml mounted via ConfigMap
+# (2-NUMA 8-GPU 8-NIC PCIe topology from excalibur/nccl-doctor)
+# - nvidia.com/mlnxnics: dynamically discovered from node allocatable
+# (Network Operator device plugin; count of 0 → line omitted gracefully)
+#
+# metadata.name below must stay in sync with ncclTrainingRuntimeName in nccl_all_reduce_bw_constraint.go.
+
+apiVersion: trainer.kubeflow.org/v1alpha1
+kind: TrainingRuntime
+metadata:
+ name: nccl-all-reduce-runtime
+ namespace: ${NAMESPACE}
+ labels:
+ trainer.kubeflow.org/framework: mpi
+spec:
+ mlPolicy:
+ mpi:
+ mpiImplementation: OpenMPI
+ numProcPerNode: ${GPU_COUNT_PER_NODE}
+ runLauncherAsNode: false
+ sshAuthMountPath: /tmp/mpi-keys
+ template:
+ spec:
+ network:
+ enableDNSHostnames: true
+ publishNotReadyAddresses: true
+ replicatedJobs:
+ - name: launcher
+ replicas: 1
+ template:
+ spec:
+ template:
+ spec:
+ tolerations:
+ - operator: Exists
+ initContainers:
+ - name: fix-ssh-perms
+ image: nvcr.io/nvidia/pytorch:25.06-py3
+ command:
+ - /bin/sh
+ - -c
+ - |
+ mkdir -p /root/.ssh
+ cp /tmp/mpi-keys/id_rsa /root/.ssh/id_rsa
+ cp /tmp/mpi-keys/authorized_keys /root/.ssh/authorized_keys
+ chmod 700 /root/.ssh
+ chmod 600 /root/.ssh/id_rsa /root/.ssh/authorized_keys
+ volumeMounts:
+ - name: mpi-ssh-auth
+ mountPath: /tmp/mpi-keys
+ readOnly: true
+ - name: ssh-config
+ mountPath: /root/.ssh
+ containers:
+ - name: node
+ image: nvcr.io/nvidia/pytorch:25.06-py3
+ env:
+ - name: LD_LIBRARY_PATH
+ value: "/usr/local/nvidia/lib64:/usr/local/cuda/lib64"
+ command:
+ - /usr/local/mpi/bin/mpirun
+ args:
+ - -np
+ - "${GPU_COUNT}"
+ - --allow-run-as-root
+ - --mca
+ - plm_rsh_args
+ - -o StrictHostKeyChecking=no -o ConnectionAttempts=10
+ # Azure IB uses default OpenMPI transport — no custom btl/oob
+ # MCA settings needed (unlike EKS EFA). Excalibur confirms
+ # only plm_rsh_args is required for Azure.
+ - -x
+ - LD_LIBRARY_PATH
+ - -x
+ - NCCL_DEBUG=WARN
+ # Required for IB performance on Azure ND-series VMs.
+ # Both excalibur and nccl-doctor set this.
+ - -x
+ - NCCL_IB_PCI_RELAXED_ORDERING=1
+ # Pin NCCL OOB control socket to eth0 so it stays off IB.
+ - -x
+ - NCCL_SOCKET_IFNAME=eth0
+ # Explicit topology file for ND H100 v5 (mounted via ConfigMap).
+ - -x
+ - NCCL_TOPO_FILE=/etc/nccl/topo.xml
+ - /usr/local/bin/${TEST_TYPE}_mpi
+ - -b
+ - ${MIN_MESSAGE_SIZE}
+ - -e
+ - ${MAX_MESSAGE_SIZE}
+ - -f
+ - "2"
+ - -g
+ - "1"
+ resources:
+ limits:
+ cpu: "2"
+ memory: 128Mi
+ volumeMounts:
+ - name: ssh-config
+ mountPath: /root/.ssh
+ volumes:
+ - name: ssh-config
+ emptyDir: {}
+ - name: node
+ template:
+ spec:
+ template:
+ spec:
+ initContainers:
+ - name: fix-ssh-perms
+ image: nvcr.io/nvidia/pytorch:25.06-py3
+ command:
+ - /bin/sh
+ - -c
+ - |
+ apt-get update &&
+ apt-get install -y --no-install-recommends openssh-server &&
+ mkdir -p /var/run/sshd &&
+ chmod 0755 /var/run/sshd &&
+ mkdir -p /root/.ssh &&
+ cp /tmp/mpi-keys/authorized_keys /root/.ssh/authorized_keys &&
+ chmod 700 /root/.ssh &&
+ chmod 600 /root/.ssh/authorized_keys
+ volumeMounts:
+ - name: mpi-ssh-auth
+ mountPath: /tmp/mpi-keys
+ readOnly: true
+ - name: ssh-config
+ mountPath: /root/.ssh
+ containers:
+ - name: node
+ image: nvcr.io/nvidia/pytorch:25.06-py3
+ command: ["sh", "-c"]
+ args:
+ - |
+ apt-get update &&
+ apt-get install -y --no-install-recommends openssh-server &&
+ mkdir -p /var/run/sshd &&
+ chmod 0755 /var/run/sshd &&
+ mkdir -p /root/.ssh &&
+ cp /tmp/mpi-keys/* /root/.ssh/ &&
+ chmod 700 /root/.ssh &&
+ chmod 600 /root/.ssh/authorized_keys &&
+ /usr/sbin/sshd -De
+ resources:
+ limits:
+ nvidia.com/gpu: ${GPU_COUNT_PER_NODE}
+${MLNX_RESOURCE_LIMITS}
+ requests:
+ nvidia.com/gpu: ${GPU_COUNT_PER_NODE}
+${MLNX_RESOURCE_REQUESTS}
+ securityContext:
+ capabilities:
+ add: ["IPC_LOCK"]
+ volumeMounts:
+ - name: ssh-config
+ mountPath: /root/.ssh
+ - name: dshm
+ mountPath: /dev/shm
+ - name: nccl-topo
+ mountPath: /etc/nccl
+ readOnly: true
+ volumes:
+ - name: ssh-config
+ emptyDir: {}
+ - name: dshm
+ emptyDir:
+ medium: Memory
+ - name: nccl-topo
+ configMap:
+ name: ${TOPO_CONFIGMAP_NAME}
+ successPolicy:
+ operator: All
+ targetReplicatedJobs:
+ - launcher