diff --git a/validators/performance/nccl_aks_utils.go b/validators/performance/nccl_aks_utils.go new file mode 100644 index 000000000..22e87190e --- /dev/null +++ b/validators/performance/nccl_aks_utils.go @@ -0,0 +1,154 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "fmt" + "log/slog" + + "github.com/NVIDIA/aicr/pkg/defaults" + aicrErrors "github.com/NVIDIA/aicr/pkg/errors" + v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +// ncclTopoConfigMapName is the name used for the NCCL topology ConfigMap +// created by the AKS NCCL validator. Cleaned up in cleanupNCCLResources. +const ncclTopoConfigMapName = "nccl-all-reduce-topo" + +// mlnxNICResource is the Kubernetes extended resource name for Mellanox +// InfiniBand NICs exposed by the NVIDIA Network Operator device plugin. +const mlnxNICResource = v1.ResourceName("nvidia.com/mlnxnics") + +// discoverAKSNodeConfig reads the Mellanox NIC count from a GPU node's +// allocatable resources. A count of 0 is valid — the Network Operator may +// not be deployed, but NCCL still uses IB via OFED kernel drivers. 
+func discoverAKSNodeConfig(node v1.Node) int { + quantity := node.Status.Allocatable[mlnxNICResource] + return int(quantity.Value()) +} + +// buildMLNXResourceLine returns the YAML line for nvidia.com/mlnxnics +// resource requests/limits at the correct indentation, or an empty string +// if count is 0 (same graceful-degradation pattern as buildEFAResourceLine). +func buildMLNXResourceLine(count int, indent string) string { + if count == 0 { + return "" + } + return fmt.Sprintf("%snvidia.com/mlnxnics: \"%d\"", indent, count) +} + +// ndv5TopoXML is the NCCL topology XML for Azure ND H100 v5 / ND H200 v5 +// VMs. Describes the PCIe Gen5 topology: 2 NUMA nodes (Intel Sapphire +// Rapids), 4 GPU+NIC pairs per NUMA. Each GPU (class 0x030200) is paired +// with a ConnectX-7 NIC (class 0x020700) under a PCIe bridge at 32 GT/s x16. +// +// Source: excalibur (azure-h100.xml) and nccl-doctor (ndv5-topo.xml) — +// both describe the identical ND H100 v5 hardware topology. +const ndv5TopoXML = ` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +` + +// createTopoConfigMap creates a ConfigMap containing the ND H100 v5 NCCL +// topology XML. The ConfigMap is mounted into worker pods so NCCL reads +// the topology at /etc/nccl/topo.xml instead of auto-discovering it. +// Uses create-or-update semantics per CLAUDE.md Kubernetes patterns. 
+func createTopoConfigMap(ctx context.Context, clientset kubernetes.Interface, namespace string) error {
+	slog.Info("Creating NCCL topology ConfigMap", "name", ncclTopoConfigMapName, "namespace", namespace)
+
+	// One deadline bounds the create and, if needed, the follow-up update.
+	opCtx, stop := context.WithTimeout(ctx, defaults.DiagnosticTimeout)
+	defer stop()
+
+	configMaps := clientset.CoreV1().ConfigMaps(namespace)
+	topoCM := &v1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{Name: ncclTopoConfigMapName, Namespace: namespace},
+		Data:       map[string]string{"topo.xml": ndv5TopoXML},
+	}
+
+	// Try a plain create first; only an AlreadyExists failure falls through.
+	_, createErr := configMaps.Create(opCtx, topoCM, metav1.CreateOptions{})
+	switch {
+	case createErr == nil:
+		return nil
+	case !apierrors.IsAlreadyExists(createErr):
+		return aicrErrors.Wrap(aicrErrors.ErrCodeInternal, "failed to create NCCL topology ConfigMap", createErr)
+	}
+
+	// The ConfigMap is already there (e.g. left by an earlier run): overwrite
+	// it in place with the object built above.
+	_, updateErr := configMaps.Update(opCtx, topoCM, metav1.UpdateOptions{})
+	if updateErr != nil {
+		return aicrErrors.Wrap(aicrErrors.ErrCodeInternal, "failed to update NCCL topology ConfigMap", updateErr)
+	}
+
+	slog.Info("Updated existing NCCL topology ConfigMap", "name", ncclTopoConfigMapName)
+	return nil
+}
+
+// deleteTopoConfigMap removes the NCCL topology ConfigMap. NotFound is
+// expected for non-AKS platforms and is logged at debug.
+func deleteTopoConfigMap(clientset kubernetes.Interface, namespace string) {
+	// Uses a fresh background context, so the delete is not tied to any caller's context.
+	deleteCtx, cancel := context.WithTimeout(context.Background(), defaults.DiagnosticTimeout)
+	defer cancel()
+	err := clientset.CoreV1().ConfigMaps(namespace).Delete(deleteCtx, ncclTopoConfigMapName, metav1.DeleteOptions{})
+	switch {
+	case err == nil:
+		slog.Info("Deleted NCCL topology ConfigMap", "name", ncclTopoConfigMapName)
+	case apierrors.IsNotFound(err):
+		slog.Debug("NCCL topology ConfigMap not present (non-AKS platform), skipping", "name", ncclTopoConfigMapName)
+	default: // best-effort: the failure is only logged, nothing is returned to the caller
+		slog.Warn("failed to delete NCCL topology ConfigMap", "error", err, "name", ncclTopoConfigMapName)
+	}
+}
diff --git a/validators/performance/nccl_aks_utils_test.go b/validators/performance/nccl_aks_utils_test.go
new file mode 100644
index 000000000..aa1e423ef
--- /dev/null
+++ b/validators/performance/nccl_aks_utils_test.go
@@ -0,0 +1,186 @@
+// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package main + +import ( + "encoding/xml" + "strings" + "testing" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestDiscoverAKSNodeConfig(t *testing.T) { + tests := []struct { + name string + node v1.Node + wantMLNX int + }{ + { + name: "ND H100 v5 with 8 Mellanox NICs", + node: v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "nvidia.com/gpu.product": "NVIDIA-H100-80GB-HBM3", + }, + }, + Status: v1.NodeStatus{ + Allocatable: v1.ResourceList{ + v1.ResourceName("nvidia.com/gpu"): resource.MustParse("8"), + v1.ResourceName("nvidia.com/mlnxnics"): resource.MustParse("8"), + }, + }, + }, + wantMLNX: 8, + }, + { + name: "no mlnxnics (Network Operator not deployed)", + node: v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "nvidia.com/gpu.product": "NVIDIA-H100-80GB-HBM3", + }, + }, + Status: v1.NodeStatus{ + Allocatable: v1.ResourceList{ + v1.ResourceName("nvidia.com/gpu"): resource.MustParse("8"), + }, + }, + }, + wantMLNX: 0, + }, + { + name: "empty allocatable", + node: v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{}, + }, + Status: v1.NodeStatus{ + Allocatable: v1.ResourceList{}, + }, + }, + wantMLNX: 0, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := discoverAKSNodeConfig(tt.node) + if got != tt.wantMLNX { + t.Errorf("discoverAKSNodeConfig() = %d, want %d", got, tt.wantMLNX) + } + }) + } +} + +func TestBuildMLNXResourceLine(t *testing.T) { + tests := []struct { + name string + count int + indent string + want string + }{ + { + name: "8 Mellanox NICs", + count: 8, + indent: " ", + want: ` nvidia.com/mlnxnics: "8"`, + }, + { + name: "4 Mellanox NICs", + count: 4, + indent: " ", + want: ` nvidia.com/mlnxnics: "4"`, + }, + { + name: "no NICs — empty string", + count: 0, + indent: " ", + want: "", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got 
:= buildMLNXResourceLine(tt.count, tt.indent) + if got != tt.want { + t.Errorf("buildMLNXResourceLine(%d) = %q, want %q", tt.count, got, tt.want) + } + }) + } +} + +// xmlSystem is a minimal struct for validating the NCCL topology XML constant. +type xmlSystem struct { + XMLName xml.Name `xml:"system"` + Version string `xml:"version,attr"` + CPUs []xmlCPU `xml:"cpu"` +} + +type xmlCPU struct { + NumaID string `xml:"numaid,attr"` + PCIs []xmlPCI `xml:"pci"` +} + +type xmlPCI struct { + BusID string `xml:"busid,attr"` + Class string `xml:"class,attr"` + Children []xmlPCI `xml:"pci"` +} + +func TestNdv5TopoXML(t *testing.T) { + var sys xmlSystem + if err := xml.Unmarshal([]byte(ndv5TopoXML), &sys); err != nil { + t.Fatalf("ndv5TopoXML is not valid XML: %v", err) + } + + if sys.Version != "1" { + t.Errorf("system version = %q, want %q", sys.Version, "1") + } + + if len(sys.CPUs) != 2 { + t.Fatalf("expected 2 NUMA CPUs, got %d", len(sys.CPUs)) + } + + // Each NUMA node has 4 PCIe bridges, each with 1 GPU + 1 NIC = 8 GPUs total. 
+ totalGPUs := 0 + totalNICs := 0 + for _, cpu := range sys.CPUs { + if len(cpu.PCIs) != 4 { + t.Errorf("NUMA %s: expected 4 PCIe bridges, got %d", cpu.NumaID, len(cpu.PCIs)) + } + for _, bridge := range cpu.PCIs { + // Each bridge should be class 0x060400 (PCI-to-PCI bridge) + if bridge.Class != "0x060400" { + t.Errorf("bridge %s: class = %q, want 0x060400", bridge.BusID, bridge.Class) + } + for _, child := range bridge.Children { + switch { + case strings.HasPrefix(child.Class, "0x0302"): + totalGPUs++ + case strings.HasPrefix(child.Class, "0x0207"): + totalNICs++ + } + } + } + } + + if totalGPUs != 8 { + t.Errorf("expected 8 GPUs in topology, got %d", totalGPUs) + } + if totalNICs != 8 { + t.Errorf("expected 8 NICs in topology, got %d", totalNICs) + } +} diff --git a/validators/performance/nccl_all_reduce_bw_constraint.go b/validators/performance/nccl_all_reduce_bw_constraint.go index 93861f0b7..1c054204e 100644 --- a/validators/performance/nccl_all_reduce_bw_constraint.go +++ b/validators/performance/nccl_all_reduce_bw_constraint.go @@ -152,6 +152,7 @@ var supportedNCCLCombinations = map[ncclVariant]map[recipe.CriteriaServiceType][ variantDefault: { recipe.CriteriaServiceEKS: {recipe.CriteriaAcceleratorH100}, recipe.CriteriaServiceGKE: {recipe.CriteriaAcceleratorH100}, + recipe.CriteriaServiceAKS: {recipe.CriteriaAcceleratorH100}, recipe.CriteriaServiceAny: {recipe.CriteriaAcceleratorB200, recipe.CriteriaAcceleratorGB200}, }, variantNET: { @@ -297,7 +298,7 @@ func runNCCLTrainJob(ctx *validators.Context, gpuConfig *gpuConfiguration, if applyErr := applyNCCLResources(ctx, dynamicClient, gpuConfig, accelerator, service, variant); applyErr != nil { return "", aicrErrors.Wrap(aicrErrors.ErrCodeInternal, "failed to apply NCCL resources", applyErr) } - defer cleanupNCCLResources(dynamicClient, gpuConfig.Namespace) + defer cleanupNCCLResources(dynamicClient, ctx.Clientset, gpuConfig.Namespace) podHelper := &helper.PodLifecycle{ ClientSet: ctx.Clientset, @@ -575,6 
+576,27 @@ func applyNCCLResources(ctx *validators.Context, dynamicClient dynamic.Interface slog.Info("Discovered GKE GPU NIC networks", "count", len(gpuNICs), "networks", gpuNICs) } + // For AKS, discover Mellanox NIC count and create NCCL topology ConfigMap. + // mlnxnics count of 0 is valid — Network Operator may not be deployed, but + // NCCL still uses IB via OFED kernel drivers. + if service == recipe.CriteriaServiceAKS { + mlnxCount := discoverAKSNodeConfig(config.Nodes[0]) + // Indentation matches the resource block position in runtime.yaml. + const mlnxIndent = " " + templateData["MLNX_RESOURCE_LIMITS"] = buildMLNXResourceLine(mlnxCount, mlnxIndent) + templateData["MLNX_RESOURCE_REQUESTS"] = buildMLNXResourceLine(mlnxCount, mlnxIndent) + templateData["TOPO_CONFIGMAP_NAME"] = ncclTopoConfigMapName + if mlnxCount > 0 { + slog.Info("Discovered AKS Mellanox NIC configuration", "mlnxnics", mlnxCount) + } else { + slog.Warn("No nvidia.com/mlnxnics found — Network Operator may not be deployed", + "note", "NCCL will still use IB via OFED kernel drivers") + } + if err := createTopoConfigMap(ctx.Ctx, ctx.Clientset, config.Namespace); err != nil { + return aicrErrors.Wrap(aicrErrors.ErrCodeInternal, "failed to create NCCL topology ConfigMap", err) + } + } + // For EKS, discover instance type and EFA adapter count from GPU nodes. // EFA count of 0 is valid — NCCL falls back to TCP (slower but functional). if service == recipe.CriteriaServiceEKS { @@ -1134,12 +1156,10 @@ func verifyTransportFromLogs(logs string, variant ncclVariant) error { } } -// cleanupNCCLResources removes the trainjob, runtime, and (if present) the -// ComputeDomain CR using the dynamic client. Deleting the ComputeDomain -// cascades to its auto-generated ResourceClaimTemplate via the DRA driver; -// NotFound on the ComputeDomain is expected for the default/NET variants -// and is logged at debug rather than error. 
-func cleanupNCCLResources(dynamicClient dynamic.Interface, namespace string) { +// cleanupNCCLResources removes the trainjob, runtime, topo ConfigMap, and +// (if present) the ComputeDomain CR. NotFound on the ComputeDomain and topo +// ConfigMap is expected for non-NVLS and non-AKS platforms respectively. +func cleanupNCCLResources(dynamicClient dynamic.Interface, clientset kubernetes.Interface, namespace string) { slog.Info("Cleaning up NCCL test resources...") cleanupCtx, cancel := context.WithTimeout(context.Background(), defaults.DiagnosticTimeout) @@ -1173,4 +1193,8 @@ func cleanupNCCLResources(dynamicClient dynamic.Interface, namespace string) { default: slog.Warn("failed to delete ComputeDomain", "error", err, "name", ncclComputeDomainName) } + + // Delete NCCL topology ConfigMap if this was AKS. NotFound is expected + // for non-AKS platforms. + deleteTopoConfigMap(clientset, namespace) } diff --git a/validators/performance/nccl_test.go b/validators/performance/nccl_test.go index 597a9f146..e80b5762c 100644 --- a/validators/performance/nccl_test.go +++ b/validators/performance/nccl_test.go @@ -413,6 +413,12 @@ func TestPlatformWorkerScheduling(t *testing.T) { t.Errorf("GKE tolerations count = %d, want 2", len(tols)) } }) + t.Run("AKS returns nil (IB auto-detected)", func(t *testing.T) { + ns, tols := platformWorkerScheduling(recipe.CriteriaServiceAKS, "") + if ns != nil || tols != nil { + t.Errorf("AKS service should return nil, got ns=%v tols=%v", ns, tols) + } + }) t.Run("unknown service returns nil", func(t *testing.T) { ns, tols := platformWorkerScheduling("unknown", "") if ns != nil || tols != nil { @@ -476,6 +482,14 @@ func TestTemplatePath(t *testing.T) { filename: "runtime.yaml", expected: filepath.Join("testdata", "gb200", "any", "runtime.yaml"), }, + { + name: "aks h100 runtime default", + accelerator: recipe.CriteriaAcceleratorH100, + service: recipe.CriteriaServiceAKS, + variant: variantDefault, + filename: "runtime.yaml", + expected: 
filepath.Join("testdata", "h100", "aks", "runtime.yaml"), + }, { name: "gb200 eks NET variant", accelerator: recipe.CriteriaAcceleratorGB200, @@ -666,6 +680,9 @@ func TestSupportedNCCLCombinations_Variants(t *testing.T) { if accels := supportedNCCLCombinations[variantDefault][recipe.CriteriaServiceEKS]; len(accels) != 1 || accels[0] != recipe.CriteriaAcceleratorH100 { t.Errorf("variantDefault EKS = %v, want [H100]", accels) } + if accels := supportedNCCLCombinations[variantDefault][recipe.CriteriaServiceAKS]; len(accels) != 1 || accels[0] != recipe.CriteriaAcceleratorH100 { + t.Errorf("variantDefault AKS = %v, want [H100]", accels) + } if accels := supportedNCCLCombinations[variantDefault][recipe.CriteriaServiceAny]; len(accels) != 2 { t.Errorf("variantDefault Any count = %d, want 2 (B200, GB200)", len(accels)) } diff --git a/validators/performance/testdata/h100/aks/ndv5-topo.xml b/validators/performance/testdata/h100/aks/ndv5-topo.xml new file mode 100644 index 000000000..fd970cb9f --- /dev/null +++ b/validators/performance/testdata/h100/aks/ndv5-topo.xml @@ -0,0 +1,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/validators/performance/testdata/h100/aks/runtime.yaml b/validators/performance/testdata/h100/aks/runtime.yaml new file mode 100644 index 000000000..cd573f03a --- /dev/null +++ b/validators/performance/testdata/h100/aks/runtime.yaml @@ -0,0 +1,205 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# AKS NCCL All-Reduce TrainingRuntime — InfiniBand (ND H100 v5 / ND H200 v5). +# +# Azure ND-series GPU VMs use InfiniBand for GPU-to-GPU communication via +# NVIDIA ConnectX-7 NDR adapters (8× 400 Gb/s per node). IB is kernel-native +# through the Mellanox OFED (MOFED) driver stack — no userspace sidecar +# (unlike GKE TCPXO) and no cloud-specific NCCL plugin (unlike EKS EFA). +# +# NCCL auto-detects IB HCAs and routes traffic over them by default. Key +# tuning from excalibur/nccl-doctor references: +# - NCCL_IB_PCI_RELAXED_ORDERING=1: required for IB perf on Azure ND-series +# - NCCL_SOCKET_IFNAME=eth0: keeps NCCL OOB control traffic off IB fabric +# - No custom MCA btl/oob settings: Azure IB uses default OpenMPI transport +# (unlike EKS EFA which needs btl ^openib). Excalibur only sets plm_rsh_args. +# - IPC_LOCK capability for RDMA memory registration +# +# - NCCL_TOPO_FILE=/etc/nccl/topo.xml: ndv5-topo.xml mounted via ConfigMap +# (2-NUMA 8-GPU 8-NIC PCIe topology from excalibur/nccl-doctor) +# - nvidia.com/mlnxnics: dynamically discovered from node allocatable +# (Network Operator device plugin; count of 0 → line omitted gracefully) +# +# Must stay in sync with ncclTrainingRuntimeName in nccl_all_reduce_bw_constraint.go. 
+ +apiVersion: trainer.kubeflow.org/v1alpha1 +kind: TrainingRuntime +metadata: + name: nccl-all-reduce-runtime + namespace: ${NAMESPACE} + labels: + trainer.kubeflow.org/framework: mpi +spec: + mlPolicy: + mpi: + mpiImplementation: OpenMPI + numProcPerNode: ${GPU_COUNT_PER_NODE} + runLauncherAsNode: false + sshAuthMountPath: /tmp/mpi-keys + template: + spec: + network: + enableDNSHostnames: true + publishNotReadyAddresses: true + replicatedJobs: + - name: launcher + replicas: 1 + template: + spec: + template: + spec: + tolerations: + - operator: Exists + initContainers: + - name: fix-ssh-perms + image: nvcr.io/nvidia/pytorch:25.06-py3 + command: + - /bin/sh + - -c + - | + mkdir -p /root/.ssh + cp /tmp/mpi-keys/id_rsa /root/.ssh/id_rsa + cp /tmp/mpi-keys/authorized_keys /root/.ssh/authorized_keys + chmod 700 /root/.ssh + chmod 600 /root/.ssh/id_rsa /root/.ssh/authorized_keys + volumeMounts: + - name: mpi-ssh-auth + mountPath: /tmp/mpi-keys + readOnly: true + - name: ssh-config + mountPath: /root/.ssh + containers: + - name: node + image: nvcr.io/nvidia/pytorch:25.06-py3 + env: + - name: LD_LIBRARY_PATH + value: "/usr/local/nvidia/lib64:/usr/local/cuda/lib64" + command: + - /usr/local/mpi/bin/mpirun + args: + - -np + - "${GPU_COUNT}" + - --allow-run-as-root + - --mca + - plm_rsh_args + - -o StrictHostKeyChecking=no -o ConnectionAttempts=10 + # Azure IB uses default OpenMPI transport — no custom btl/oob + # MCA settings needed (unlike EKS EFA). Excalibur confirms + # only plm_rsh_args is required for Azure. + - -x + - LD_LIBRARY_PATH + - -x + - NCCL_DEBUG=WARN + # Required for IB performance on Azure ND-series VMs. + # Both excalibur and nccl-doctor set this. + - -x + - NCCL_IB_PCI_RELAXED_ORDERING=1 + # Pin NCCL OOB control socket to eth0 so it stays off IB. + - -x + - NCCL_SOCKET_IFNAME=eth0 + # Explicit topology file for ND H100 v5 (mounted via ConfigMap). 
+ - -x + - NCCL_TOPO_FILE=/etc/nccl/topo.xml + - /usr/local/bin/${TEST_TYPE}_mpi + - -b + - ${MIN_MESSAGE_SIZE} + - -e + - ${MAX_MESSAGE_SIZE} + - -f + - "2" + - -g + - "1" + resources: + limits: + cpu: "2" + memory: 128Mi + volumeMounts: + - name: ssh-config + mountPath: /root/.ssh + volumes: + - name: ssh-config + emptyDir: {} + - name: node + template: + spec: + template: + spec: + initContainers: + - name: fix-ssh-perms + image: nvcr.io/nvidia/pytorch:25.06-py3 + command: + - /bin/sh + - -c + - | + apt-get update && + apt-get install -y --no-install-recommends openssh-server && + mkdir -p /var/run/sshd && + chmod 0755 /var/run/sshd && + mkdir -p /root/.ssh && + cp /tmp/mpi-keys/authorized_keys /root/.ssh/authorized_keys && + chmod 700 /root/.ssh && + chmod 600 /root/.ssh/authorized_keys + volumeMounts: + - name: mpi-ssh-auth + mountPath: /tmp/mpi-keys + readOnly: true + - name: ssh-config + mountPath: /root/.ssh + containers: + - name: node + image: nvcr.io/nvidia/pytorch:25.06-py3 + command: ["sh", "-c"] + args: + - | + apt-get update && + apt-get install -y --no-install-recommends openssh-server && + mkdir -p /var/run/sshd && + chmod 0755 /var/run/sshd && + mkdir -p /root/.ssh && + cp /tmp/mpi-keys/* /root/.ssh/ && + chmod 700 /root/.ssh && + chmod 600 /root/.ssh/authorized_keys && + /usr/sbin/sshd -De + resources: + limits: + nvidia.com/gpu: ${GPU_COUNT_PER_NODE} +${MLNX_RESOURCE_LIMITS} + requests: + nvidia.com/gpu: ${GPU_COUNT_PER_NODE} +${MLNX_RESOURCE_REQUESTS} + securityContext: + capabilities: + add: ["IPC_LOCK"] + volumeMounts: + - name: ssh-config + mountPath: /root/.ssh + - name: dshm + mountPath: /dev/shm + - name: nccl-topo + mountPath: /etc/nccl + readOnly: true + volumes: + - name: ssh-config + emptyDir: {} + - name: dshm + emptyDir: + medium: Memory + - name: nccl-topo + configMap: + name: ${TOPO_CONFIGMAP_NAME} + successPolicy: + operator: All + targetReplicatedJobs: + - launcher