diff --git a/validators/performance/nccl_aks_utils.go b/validators/performance/nccl_aks_utils.go
new file mode 100644
index 000000000..22e87190e
--- /dev/null
+++ b/validators/performance/nccl_aks_utils.go
@@ -0,0 +1,154 @@
+// Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+
+	"github.com/NVIDIA/aicr/pkg/defaults"
+	aicrErrors "github.com/NVIDIA/aicr/pkg/errors"
+	v1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+)
+
+const (
+	// ncclTopoConfigMapName names the topology ConfigMap created for AKS runs; cleanupNCCLResources deletes it.
+	ncclTopoConfigMapName = "nccl-all-reduce-topo"
+
+	// mlnxNICResource is the extended resource for Mellanox InfiniBand NICs (Network Operator device plugin).
+	mlnxNICResource = v1.ResourceName("nvidia.com/mlnxnics")
+)
+
+// discoverAKSNodeConfig returns the node's allocatable nvidia.com/mlnxnics count.
+// Zero is a valid answer: without the Network Operator NCCL still uses IB via OFED.
+func discoverAKSNodeConfig(node v1.Node) int {
+	allocatable := node.Status.Allocatable
+	nics := allocatable[mlnxNICResource] // absent key yields a zero Quantity
+	return int(nics.Value())
+}
+
+// buildMLNXResourceLine renders the nvidia.com/mlnxnics requests/limits YAML
+// line, prefixed with indent. It returns "" when count is zero or negative so
+// the template omits the resource (graceful degradation, cf. buildEFAResourceLine).
+func buildMLNXResourceLine(count int, indent string) string {
+	if count <= 0 {
+		return ""
+	}
+	return fmt.Sprintf("%snvidia.com/mlnxnics: \"%d\"", indent, count)
+}
+
+// ndv5TopoXML is the NCCL topology XML for Azure ND H100 v5 / ND H200 v5
+// VMs, published by createTopoConfigMap under the "topo.xml" key. It lists
+// 2 NUMA nodes (Intel Sapphire Rapids) with 4 PCIe bridges each; every
+// bridge pairs one GPU (class 0x030200) with one ConnectX-7 NIC (class
+// 0x020700) at 32 GT/s x16. Also checked in as testdata/h100/aks/ndv5-topo.xml.
+// Source: excalibur (azure-h100.xml) and nccl-doctor (ndv5-topo.xml) —
+// both describe the identical ND H100 v5 hardware topology.
+const ndv5TopoXML = `<system version="1">
+  <cpu numaid="0" affinity="ffffffff,ffff0000,00000000" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="143">
+    <pci busid="ffff:ff:01.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0001:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0101:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+    <pci busid="ffff:ff:02.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0002:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0102:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+    <pci busid="ffff:ff:03.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0003:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0103:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+    <pci busid="ffff:ff:04.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0008:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0104:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+  </cpu>
+  <cpu numaid="1" affinity="00000000,0000ffff,ffffffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="143">
+    <pci busid="ffff:ff:05.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0009:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0105:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+    <pci busid="ffff:ff:06.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="000a:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0106:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+    <pci busid="ffff:ff:07.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="000b:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0107:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+    <pci busid="ffff:ff:08.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="000c:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0108:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+  </cpu>
+</system>`
+
+// createTopoConfigMap publishes ndv5TopoXML under the "topo.xml" key of a
+// ConfigMap named ncclTopoConfigMapName in namespace, so that worker pods
+// can mount it and NCCL reads /etc/nccl/topo.xml instead of auto-discovering.
+// It attempts Create first and falls back to Update when the object already
+// exists; both calls share one context capped at defaults.DiagnosticTimeout.
+func createTopoConfigMap(ctx context.Context, clientset kubernetes.Interface, namespace string) error {
+	slog.Info("Creating NCCL topology ConfigMap", "name", ncclTopoConfigMapName, "namespace", namespace)
+
+	opCtx, cancel := context.WithTimeout(ctx, defaults.DiagnosticTimeout)
+	defer cancel()
+
+	configMaps := clientset.CoreV1().ConfigMaps(namespace)
+	topoCM := &v1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ncclTopoConfigMapName,
+			Namespace: namespace,
+		},
+		Data: map[string]string{"topo.xml": ndv5TopoXML},
+	}
+
+	_, createErr := configMaps.Create(opCtx, topoCM, metav1.CreateOptions{})
+	switch {
+	case createErr == nil:
+		return nil
+	case !apierrors.IsAlreadyExists(createErr):
+		return aicrErrors.Wrap(aicrErrors.ErrCodeInternal, "failed to create NCCL topology ConfigMap", createErr)
+	}
+
+	// The ConfigMap is already present (e.g. left over from an earlier run),
+	// so overwrite it with the current topology instead of failing.
+	if _, updateErr := configMaps.Update(opCtx, topoCM, metav1.UpdateOptions{}); updateErr != nil {
+		return aicrErrors.Wrap(aicrErrors.ErrCodeInternal, "failed to update NCCL topology ConfigMap", updateErr)
+	}
+	slog.Info("Updated existing NCCL topology ConfigMap", "name", ncclTopoConfigMapName)
+	return nil
+}
+
+// deleteTopoConfigMap best-effort deletes the NCCL topology ConfigMap under its own
+// DiagnosticTimeout context; outcomes are only logged (NotFound, expected off AKS, at debug).
+func deleteTopoConfigMap(clientset kubernetes.Interface, namespace string) {
+	deleteCtx, cancel := context.WithTimeout(context.Background(), defaults.DiagnosticTimeout)
+	defer cancel()
+
+	err := clientset.CoreV1().ConfigMaps(namespace).Delete(deleteCtx, ncclTopoConfigMapName, metav1.DeleteOptions{})
+	switch {
+	case err == nil:
+		slog.Info("Deleted NCCL topology ConfigMap", "name", ncclTopoConfigMapName)
+	case apierrors.IsNotFound(err):
+		slog.Debug("NCCL topology ConfigMap not present (non-AKS platform), skipping", "name", ncclTopoConfigMapName)
+	default:
+		slog.Warn("failed to delete NCCL topology ConfigMap", "error", err, "name", ncclTopoConfigMapName)
+	}
+}
diff --git a/validators/performance/nccl_aks_utils_test.go b/validators/performance/nccl_aks_utils_test.go
new file mode 100644
index 000000000..aa1e423ef
--- /dev/null
+++ b/validators/performance/nccl_aks_utils_test.go
@@ -0,0 +1,186 @@
+// Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"encoding/xml"
+	"strings"
+	"testing"
+
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// TestDiscoverAKSNodeConfig verifies that discoverAKSNodeConfig returns the
+// nvidia.com/mlnxnics allocatable count of a node, and 0 when the resource is
+// missing or the allocatable list is empty.
+func TestDiscoverAKSNodeConfig(t *testing.T) {
+	cases := []struct {
+		name string
+		node v1.Node
+		want int
+	}{
+		{
+			name: "ND H100 v5 with 8 Mellanox NICs",
+			node: v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"nvidia.com/gpu.product": "NVIDIA-H100-80GB-HBM3",
+					},
+				},
+				Status: v1.NodeStatus{
+					Allocatable: v1.ResourceList{
+						"nvidia.com/gpu":      resource.MustParse("8"),
+						"nvidia.com/mlnxnics": resource.MustParse("8"),
+					},
+				},
+			},
+			want: 8,
+		},
+		{
+			name: "no mlnxnics (Network Operator not deployed)",
+			node: v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"nvidia.com/gpu.product": "NVIDIA-H100-80GB-HBM3",
+					},
+				},
+				Status: v1.NodeStatus{
+					Allocatable: v1.ResourceList{
+						"nvidia.com/gpu": resource.MustParse("8"),
+					},
+				},
+			},
+			want: 0,
+		},
+		{
+			name: "empty allocatable",
+			node: v1.Node{
+				ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{}},
+				Status:     v1.NodeStatus{Allocatable: v1.ResourceList{}},
+			},
+			want: 0,
+		},
+	}
+
+	// Each case runs as a subtest named after its description.
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := discoverAKSNodeConfig(tc.node); got != tc.want {
+				t.Errorf("discoverAKSNodeConfig() = %d, want %d", got, tc.want)
+			}
+		})
+	}
+}
+
+// TestBuildMLNXResourceLine verifies that buildMLNXResourceLine emits the
+// indented, quoted resource line for non-zero counts and "" for zero.
+func TestBuildMLNXResourceLine(t *testing.T) {
+	// Every case passes this same indent to the function under test.
+	const indent = "                      "
+
+	cases := []struct {
+		name  string
+		count int
+		want  string
+	}{
+		{
+			name:  "8 Mellanox NICs",
+			count: 8,
+			want:  indent + `nvidia.com/mlnxnics: "8"`,
+		},
+		{
+			name:  "4 Mellanox NICs",
+			count: 4,
+			want:  indent + `nvidia.com/mlnxnics: "4"`,
+		},
+		{
+			name:  "no NICs — empty string",
+			count: 0,
+			want:  "",
+		},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := buildMLNXResourceLine(tc.count, indent); got != tc.want {
+				t.Errorf("buildMLNXResourceLine(%d) = %q, want %q", tc.count, got, tc.want)
+			}
+		})
+	}
+}
+
+// xmlSystem is the minimal <system> root mapping used to validate the NCCL topology XML constant.
+type xmlSystem struct {
+	XMLName xml.Name `xml:"system"`       // root element name
+	Version string   `xml:"version,attr"` // version attribute of <system>
+	CPUs    []xmlCPU `xml:"cpu"`          // child <cpu> elements
+}
+
+type xmlCPU struct { // one <cpu> element (a NUMA node in this topology)
+	NumaID string   `xml:"numaid,attr"` // numaid attribute
+	PCIs   []xmlPCI `xml:"pci"`         // direct child <pci> elements
+}
+
+type xmlPCI struct { // one <pci> element; nests recursively via Children
+	BusID    string   `xml:"busid,attr"` // busid attribute
+	Class    string   `xml:"class,attr"` // class attribute, e.g. "0x060400"
+	Children []xmlPCI `xml:"pci"`        // nested <pci> elements
+}
+
+// TestNdv5TopoXML parses ndv5TopoXML and checks its shape: version "1", two
+// <cpu> elements with four class-0x060400 bridges each, and 8 GPUs plus 8 NICs.
+func TestNdv5TopoXML(t *testing.T) {
+	var topo xmlSystem
+	if err := xml.Unmarshal([]byte(ndv5TopoXML), &topo); err != nil {
+		t.Fatalf("ndv5TopoXML is not valid XML: %v", err)
+	}
+
+	if topo.Version != "1" {
+		t.Errorf("system version = %q, want %q", topo.Version, "1")
+	}
+	if len(topo.CPUs) != 2 {
+		t.Fatalf("expected 2 NUMA CPUs, got %d", len(topo.CPUs))
+	}
+
+	// Tally devices under every bridge by PCI class prefix:
+	// 0x0302 counts as a GPU, 0x0207 as a NIC; anything else is ignored.
+	var gpus, nics int
+	for _, numa := range topo.CPUs {
+		if n := len(numa.PCIs); n != 4 {
+			t.Errorf("NUMA %s: expected 4 PCIe bridges, got %d", numa.NumaID, n)
+		}
+		for _, bridge := range numa.PCIs {
+			// Direct children of a <cpu> must be PCI-to-PCI bridges.
+			if bridge.Class != "0x060400" {
+				t.Errorf("bridge %s: class = %q, want 0x060400", bridge.BusID, bridge.Class)
+			}
+			for _, dev := range bridge.Children {
+				if strings.HasPrefix(dev.Class, "0x0302") {
+					gpus++
+				} else if strings.HasPrefix(dev.Class, "0x0207") {
+					nics++
+				}
+			}
+		}
+	}
+
+	if gpus != 8 {
+		t.Errorf("expected 8 GPUs in topology, got %d", gpus)
+	}
+	if nics != 8 {
+		t.Errorf("expected 8 NICs in topology, got %d", nics)
+	}
+}
diff --git a/validators/performance/nccl_all_reduce_bw_constraint.go b/validators/performance/nccl_all_reduce_bw_constraint.go
index 93861f0b7..1c054204e 100644
--- a/validators/performance/nccl_all_reduce_bw_constraint.go
+++ b/validators/performance/nccl_all_reduce_bw_constraint.go
@@ -152,6 +152,7 @@ var supportedNCCLCombinations = map[ncclVariant]map[recipe.CriteriaServiceType][
 	variantDefault: {
 		recipe.CriteriaServiceEKS: {recipe.CriteriaAcceleratorH100},
 		recipe.CriteriaServiceGKE: {recipe.CriteriaAcceleratorH100},
+		recipe.CriteriaServiceAKS: {recipe.CriteriaAcceleratorH100},
 		recipe.CriteriaServiceAny: {recipe.CriteriaAcceleratorB200, recipe.CriteriaAcceleratorGB200},
 	},
 	variantNET: {
@@ -297,7 +298,7 @@ func runNCCLTrainJob(ctx *validators.Context, gpuConfig *gpuConfiguration,
 	if applyErr := applyNCCLResources(ctx, dynamicClient, gpuConfig, accelerator, service, variant); applyErr != nil {
 		return "", aicrErrors.Wrap(aicrErrors.ErrCodeInternal, "failed to apply NCCL resources", applyErr)
 	}
-	defer cleanupNCCLResources(dynamicClient, gpuConfig.Namespace)
+	defer cleanupNCCLResources(dynamicClient, ctx.Clientset, gpuConfig.Namespace)
 
 	podHelper := &helper.PodLifecycle{
 		ClientSet: ctx.Clientset,
@@ -575,6 +576,27 @@ func applyNCCLResources(ctx *validators.Context, dynamicClient dynamic.Interface
 		slog.Info("Discovered GKE GPU NIC networks", "count", len(gpuNICs), "networks", gpuNICs)
 	}
 
+	// For AKS, discover Mellanox NIC count and create NCCL topology ConfigMap.
+	// mlnxnics count of 0 is valid — Network Operator may not be deployed, but
+	// NCCL still uses IB via OFED kernel drivers.
+	if service == recipe.CriteriaServiceAKS {
+		mlnxCount := discoverAKSNodeConfig(config.Nodes[0])
+		// Indentation matches the resource block position in runtime.yaml.
+		const mlnxIndent = "                      "
+		templateData["MLNX_RESOURCE_LIMITS"] = buildMLNXResourceLine(mlnxCount, mlnxIndent)
+		templateData["MLNX_RESOURCE_REQUESTS"] = buildMLNXResourceLine(mlnxCount, mlnxIndent)
+		templateData["TOPO_CONFIGMAP_NAME"] = ncclTopoConfigMapName
+		if mlnxCount > 0 {
+			slog.Info("Discovered AKS Mellanox NIC configuration", "mlnxnics", mlnxCount)
+		} else {
+			slog.Warn("No nvidia.com/mlnxnics found — Network Operator may not be deployed",
+				"note", "NCCL will still use IB via OFED kernel drivers")
+		}
+		if err := createTopoConfigMap(ctx.Ctx, ctx.Clientset, config.Namespace); err != nil {
+			return aicrErrors.Wrap(aicrErrors.ErrCodeInternal, "failed to create NCCL topology ConfigMap", err)
+		}
+	}
+
 	// For EKS, discover instance type and EFA adapter count from GPU nodes.
 	// EFA count of 0 is valid — NCCL falls back to TCP (slower but functional).
 	if service == recipe.CriteriaServiceEKS {
@@ -1134,12 +1156,10 @@ func verifyTransportFromLogs(logs string, variant ncclVariant) error {
 	}
 }
 
-// cleanupNCCLResources removes the trainjob, runtime, and (if present) the
-// ComputeDomain CR using the dynamic client. Deleting the ComputeDomain
-// cascades to its auto-generated ResourceClaimTemplate via the DRA driver;
-// NotFound on the ComputeDomain is expected for the default/NET variants
-// and is logged at debug rather than error.
-func cleanupNCCLResources(dynamicClient dynamic.Interface, namespace string) {
+// cleanupNCCLResources removes the trainjob, runtime, topo ConfigMap, and
+// (if present) the ComputeDomain CR. NotFound on the ComputeDomain and topo
+// ConfigMap is expected for non-NVLS and non-AKS platforms respectively.
+func cleanupNCCLResources(dynamicClient dynamic.Interface, clientset kubernetes.Interface, namespace string) {
 	slog.Info("Cleaning up NCCL test resources...")
 
 	cleanupCtx, cancel := context.WithTimeout(context.Background(), defaults.DiagnosticTimeout)
@@ -1173,4 +1193,8 @@ func cleanupNCCLResources(dynamicClient dynamic.Interface, namespace string) {
 	default:
 		slog.Warn("failed to delete ComputeDomain", "error", err, "name", ncclComputeDomainName)
 	}
+
+	// Delete NCCL topology ConfigMap if this was AKS. NotFound is expected
+	// for non-AKS platforms.
+	deleteTopoConfigMap(clientset, namespace)
 }
diff --git a/validators/performance/nccl_test.go b/validators/performance/nccl_test.go
index 597a9f146..e80b5762c 100644
--- a/validators/performance/nccl_test.go
+++ b/validators/performance/nccl_test.go
@@ -413,6 +413,12 @@ func TestPlatformWorkerScheduling(t *testing.T) {
 			t.Errorf("GKE tolerations count = %d, want 2", len(tols))
 		}
 	})
+	t.Run("AKS returns nil (IB auto-detected)", func(t *testing.T) {
+		ns, tols := platformWorkerScheduling(recipe.CriteriaServiceAKS, "")
+		if ns != nil || tols != nil {
+			t.Errorf("AKS service should return nil, got ns=%v tols=%v", ns, tols)
+		}
+	})
 	t.Run("unknown service returns nil", func(t *testing.T) {
 		ns, tols := platformWorkerScheduling("unknown", "")
 		if ns != nil || tols != nil {
@@ -476,6 +482,14 @@ func TestTemplatePath(t *testing.T) {
 			filename:    "runtime.yaml",
 			expected:    filepath.Join("testdata", "gb200", "any", "runtime.yaml"),
 		},
+		{
+			name:        "aks h100 runtime default",
+			accelerator: recipe.CriteriaAcceleratorH100,
+			service:     recipe.CriteriaServiceAKS,
+			variant:     variantDefault,
+			filename:    "runtime.yaml",
+			expected:    filepath.Join("testdata", "h100", "aks", "runtime.yaml"),
+		},
 		{
 			name:        "gb200 eks NET variant",
 			accelerator: recipe.CriteriaAcceleratorGB200,
@@ -666,6 +680,9 @@ func TestSupportedNCCLCombinations_Variants(t *testing.T) {
 	if accels := supportedNCCLCombinations[variantDefault][recipe.CriteriaServiceEKS]; len(accels) != 1 || accels[0] != recipe.CriteriaAcceleratorH100 {
 		t.Errorf("variantDefault EKS = %v, want [H100]", accels)
 	}
+	if accels := supportedNCCLCombinations[variantDefault][recipe.CriteriaServiceAKS]; len(accels) != 1 || accels[0] != recipe.CriteriaAcceleratorH100 {
+		t.Errorf("variantDefault AKS = %v, want [H100]", accels)
+	}
 	if accels := supportedNCCLCombinations[variantDefault][recipe.CriteriaServiceAny]; len(accels) != 2 {
 		t.Errorf("variantDefault Any count = %d, want 2 (B200, GB200)", len(accels))
 	}
diff --git a/validators/performance/testdata/h100/aks/ndv5-topo.xml b/validators/performance/testdata/h100/aks/ndv5-topo.xml
new file mode 100644
index 000000000..fd970cb9f
--- /dev/null
+++ b/validators/performance/testdata/h100/aks/ndv5-topo.xml
@@ -0,0 +1,48 @@
+<!--
+  NCCL topology file for Azure ND H100 v5 / ND H200 v5 VMs.
+
+  Describes the PCIe Gen5 topology: 2 NUMA nodes (Intel Sapphire Rapids),
+  4 GPU+NIC pairs per NUMA. Each GPU (class 0x030200) is paired with a
+  ConnectX-7 NIC (class 0x020700) under a PCIe bridge at 32 GT/s x16.
+
+  Source: excalibur (azure-h100.xml) and nccl-doctor (ndv5-topo.xml) —
+  both describe the identical ND H100 v5 hardware topology.
+-->
+<system version="1">
+  <cpu numaid="0" affinity="ffffffff,ffff0000,00000000" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="143">
+    <pci busid="ffff:ff:01.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0001:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0101:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+    <pci busid="ffff:ff:02.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0002:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0102:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+    <pci busid="ffff:ff:03.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0003:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0103:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+    <pci busid="ffff:ff:04.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0008:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0104:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+  </cpu>
+  <cpu numaid="1" affinity="00000000,0000ffff,ffffffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="143">
+    <pci busid="ffff:ff:05.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="0009:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0105:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+    <pci busid="ffff:ff:06.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="000a:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0106:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+    <pci busid="ffff:ff:07.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="000b:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0107:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+    <pci busid="ffff:ff:08.0" class="0x060400" link_speed="32.0 GT/s PCIe" link_width="16">
+      <pci busid="000c:00:00.0" class="0x030200" link_speed="32.0 GT/s PCIe" link_width="16"/>
+      <pci busid="0108:00:00.0" class="0x020700" link_speed="32.0 GT/s PCIe" link_width="16"/>
+    </pci>
+  </cpu>
+</system>
diff --git a/validators/performance/testdata/h100/aks/runtime.yaml b/validators/performance/testdata/h100/aks/runtime.yaml
new file mode 100644
index 000000000..cd573f03a
--- /dev/null
+++ b/validators/performance/testdata/h100/aks/runtime.yaml
@@ -0,0 +1,205 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# AKS NCCL All-Reduce TrainingRuntime — InfiniBand (ND H100 v5 / ND H200 v5).
+#
+# Azure ND-series GPU VMs use InfiniBand for GPU-to-GPU communication via
+# NVIDIA ConnectX-7 NDR adapters (8× 400 Gb/s per node). IB is kernel-native
+# through the Mellanox OFED (MOFED) driver stack — no userspace sidecar
+# (unlike GKE TCPXO) and no cloud-specific NCCL plugin (unlike EKS EFA).
+#
+# NCCL auto-detects IB HCAs and routes traffic over them by default. Key
+# tuning from excalibur/nccl-doctor references:
+#   - NCCL_IB_PCI_RELAXED_ORDERING=1: required for IB perf on Azure ND-series
+#   - NCCL_SOCKET_IFNAME=eth0: keeps NCCL OOB control traffic off IB fabric
+#   - No custom MCA btl/oob settings: Azure IB uses default OpenMPI transport
+#     (unlike EKS EFA which needs btl ^openib). Excalibur only sets plm_rsh_args.
+#   - IPC_LOCK capability for RDMA memory registration
+#
+#   - NCCL_TOPO_FILE=/etc/nccl/topo.xml: ndv5-topo.xml mounted via ConfigMap
+#     (2-NUMA 8-GPU 8-NIC PCIe topology from excalibur/nccl-doctor)
+#   - nvidia.com/mlnxnics: dynamically discovered from node allocatable
+#     (Network Operator device plugin; count of 0 → line omitted gracefully)
+#
+# Must stay in sync with ncclTrainingRuntimeName in nccl_all_reduce_bw_constraint.go.
+
+apiVersion: trainer.kubeflow.org/v1alpha1
+kind: TrainingRuntime
+metadata:
+  name: nccl-all-reduce-runtime
+  namespace: ${NAMESPACE}
+  labels:
+    trainer.kubeflow.org/framework: mpi
+spec:
+  mlPolicy:
+    mpi:
+      mpiImplementation: OpenMPI
+      numProcPerNode: ${GPU_COUNT_PER_NODE}
+      runLauncherAsNode: false
+      sshAuthMountPath: /tmp/mpi-keys
+  template:
+    spec:
+      network:
+        enableDNSHostnames: true
+        publishNotReadyAddresses: true
+      replicatedJobs:
+      - name: launcher
+        replicas: 1
+        template:
+          spec:
+            template:
+              spec:
+                tolerations:
+                - operator: Exists
+                initContainers:
+                - name: fix-ssh-perms
+                  image: nvcr.io/nvidia/pytorch:25.06-py3
+                  command:
+                  - /bin/sh
+                  - -c
+                  - |
+                    mkdir -p /root/.ssh
+                    cp /tmp/mpi-keys/id_rsa /root/.ssh/id_rsa
+                    cp /tmp/mpi-keys/authorized_keys /root/.ssh/authorized_keys
+                    chmod 700 /root/.ssh
+                    chmod 600 /root/.ssh/id_rsa /root/.ssh/authorized_keys
+                  volumeMounts:
+                  - name: mpi-ssh-auth
+                    mountPath: /tmp/mpi-keys
+                    readOnly: true
+                  - name: ssh-config
+                    mountPath: /root/.ssh
+                containers:
+                - name: node
+                  image: nvcr.io/nvidia/pytorch:25.06-py3
+                  env:
+                  - name: LD_LIBRARY_PATH
+                    value: "/usr/local/nvidia/lib64:/usr/local/cuda/lib64"
+                  command:
+                  - /usr/local/mpi/bin/mpirun
+                  args:
+                  - -np
+                  - "${GPU_COUNT}"
+                  - --allow-run-as-root
+                  - --mca
+                  - plm_rsh_args
+                  - -o StrictHostKeyChecking=no -o ConnectionAttempts=10
+                  # Azure IB uses default OpenMPI transport — no custom btl/oob
+                  # MCA settings needed (unlike EKS EFA). Excalibur confirms
+                  # only plm_rsh_args is required for Azure.
+                  - -x
+                  - LD_LIBRARY_PATH
+                  - -x
+                  - NCCL_DEBUG=WARN
+                  # Required for IB performance on Azure ND-series VMs.
+                  # Both excalibur and nccl-doctor set this.
+                  - -x
+                  - NCCL_IB_PCI_RELAXED_ORDERING=1
+                  # Pin NCCL OOB control socket to eth0 so it stays off IB.
+                  - -x
+                  - NCCL_SOCKET_IFNAME=eth0
+                  # Explicit topology file for ND H100 v5 (mounted via ConfigMap).
+                  - -x
+                  - NCCL_TOPO_FILE=/etc/nccl/topo.xml
+                  - /usr/local/bin/${TEST_TYPE}_mpi
+                  - -b
+                  - ${MIN_MESSAGE_SIZE}
+                  - -e
+                  - ${MAX_MESSAGE_SIZE}
+                  - -f
+                  - "2"
+                  - -g
+                  - "1"
+                  resources:
+                    limits:
+                      cpu: "2"
+                      memory: 128Mi
+                  volumeMounts:
+                  - name: ssh-config
+                    mountPath: /root/.ssh
+                volumes:
+                - name: ssh-config
+                  emptyDir: {}
+      - name: node
+        template:
+          spec:
+            template:
+              spec:
+                initContainers:
+                - name: fix-ssh-perms
+                  image: nvcr.io/nvidia/pytorch:25.06-py3
+                  command:
+                  - /bin/sh
+                  - -c
+                  - |
+                    apt-get update &&
+                    apt-get install -y --no-install-recommends openssh-server &&
+                    mkdir -p /var/run/sshd &&
+                    chmod 0755 /var/run/sshd &&
+                    mkdir -p /root/.ssh &&
+                    cp /tmp/mpi-keys/authorized_keys /root/.ssh/authorized_keys &&
+                    chmod 700 /root/.ssh &&
+                    chmod 600 /root/.ssh/authorized_keys
+                  volumeMounts:
+                  - name: mpi-ssh-auth
+                    mountPath: /tmp/mpi-keys
+                    readOnly: true
+                  - name: ssh-config
+                    mountPath: /root/.ssh
+                containers:
+                - name: node
+                  image: nvcr.io/nvidia/pytorch:25.06-py3
+                  command: ["sh", "-c"]
+                  args:
+                  - |
+                    apt-get update &&
+                    apt-get install -y --no-install-recommends openssh-server &&
+                    mkdir -p /var/run/sshd &&
+                    chmod 0755 /var/run/sshd &&
+                    mkdir -p /root/.ssh &&
+                    cp /tmp/mpi-keys/* /root/.ssh/ &&
+                    chmod 700 /root/.ssh &&
+                    chmod 600 /root/.ssh/authorized_keys &&
+                    /usr/sbin/sshd -De
+                  resources:
+                    limits:
+                      nvidia.com/gpu: ${GPU_COUNT_PER_NODE}
+${MLNX_RESOURCE_LIMITS}
+                    requests:
+                      nvidia.com/gpu: ${GPU_COUNT_PER_NODE}
+${MLNX_RESOURCE_REQUESTS}
+                  securityContext:
+                    capabilities:
+                      add: ["IPC_LOCK"]
+                  volumeMounts:
+                  - name: ssh-config
+                    mountPath: /root/.ssh
+                  - name: dshm
+                    mountPath: /dev/shm
+                  - name: nccl-topo
+                    mountPath: /etc/nccl
+                    readOnly: true
+                volumes:
+                - name: ssh-config
+                  emptyDir: {}
+                - name: dshm
+                  emptyDir:
+                    medium: Memory
+                - name: nccl-topo
+                  configMap:
+                    name: ${TOPO_CONFIGMAP_NAME}
+      successPolicy:
+        operator: All
+        targetReplicatedJobs:
+        - launcher