NVIDIA · yuanchen8911 · Apr 30, 2026 · Apr 30, 2026
@@ -97,12 +97,30 @@ spec:
         nvidia.com/gpu: 1
       limits:
         nvidia.com/gpu: 1
-  podTemplateOverrides:
-    - targetJobs:
-        - name: node
-      spec:
-        tolerations:
-          - operator: Exists
+  # Inject AICR-standard GPU node scheduling. kubeflow-trainer v2.2.0 replaced
+  # podTemplateOverrides with the runtimePatches API (PR kubeflow/trainer#3309).
+  runtimePatches:
+    - manager: aicr.nvidia.com/demo
+      trainingRuntimeSpec:
+        template:
+          spec:
+            replicatedJobs:
+              - name: node
+                template:
+                  spec:
+                    template:
+                      spec:
+                        nodeSelector:
+                          nodeGroup: gpu-worker
+                        tolerations:
+                          - key: dedicated
+                            operator: Equal
+                            value: worker-workload
+                            effect: NoSchedule
+                          - key: dedicated
+                            operator: Equal
+                            value: worker-workload
+                            effect: NoExecute
   runtimeRef:
     name: torch-distributed
     apiGroup: trainer.kubeflow.org

@@ -99,12 +99,32 @@ spec:
         nvidia.com/gpu: 1
       limits:
         nvidia.com/gpu: 1
-  podTemplateOverrides:
-    - targetJobs:
-        - name: node
-      spec:
-        tolerations:
-          - operator: Exists
+  # Inject GKE GPU node scheduling. Matches the snapshot/bundle/validate
+  # tolerations above (`dedicated=gpu-workload:NoSchedule` plus the GKE-managed
+  # `nvidia.com/gpu=present:NoSchedule` taint). kubeflow-trainer v2.2.0 replaced
+  # podTemplateOverrides with the runtimePatches API (PR kubeflow/trainer#3309).
+  runtimePatches:
+    - manager: aicr.nvidia.com/demo
+      trainingRuntimeSpec:
+        template:
+          spec:
+            replicatedJobs:
+              - name: node
+                template:
+                  spec:
+                    template:
+                      spec:
+                        nodeSelector:
+                          nodeGroup: gpu-worker
+                        tolerations:
+                          - key: dedicated
+                            operator: Equal
+                            value: gpu-workload
+                            effect: NoSchedule
+                          - key: nvidia.com/gpu
+                            operator: Equal
+                            value: present
+                            effect: NoSchedule
   runtimeRef:
     name: torch-distributed
     apiGroup: trainer.kubeflow.org

@@ -430,7 +430,7 @@ Deployers respect the `deploymentOrder` field from the recipe to ensure componen
 ```yaml
 componentRefs:
   - name: cert-manager
-    version: v1.17.2
+    version: v1.20.2
   - name: gpu-operator
     version: v25.3.3
   - name: network-operator

@@ -333,7 +333,7 @@ spec:
     - name: cert-manager
       type: Helm
       source: https://charts.jetstack.io
-      version: v1.17.2
+      version: v1.20.2
       valuesFile: components/cert-manager/values.yaml
 
     - name: gpu-operator
@@ -615,7 +615,7 @@ Components can declare dependencies via `dependencyRefs`:
 componentRefs:
   - name: cert-manager
     type: Helm
-    version: v1.17.2
+    version: v1.20.2
 
   - name: gpu-operator
     type: Helm
@@ -997,7 +997,7 @@ curl "http://localhost:8080/v1/recipe?os=ubuntu&service=eks&accelerator=gb200&in
       "name": "cert-manager",
       "type": "Helm",
       "source": "https://charts.jetstack.io",
-      "version": "v1.17.2",
+      "version": "v1.20.2",
       "valuesFile": "components/cert-manager/values.yaml"
     },
     {

@@ -24,7 +24,7 @@ The source of truth is [`recipes/registry.yaml`](https://github.com/NVIDIA/aicr/
 | **prometheus-adapter** | Exposes custom metrics from Prometheus to the Kubernetes metrics API. Enables HPA scaling based on GPU utilization and other custom metrics. | [prometheus-adapter](https://github.com/kubernetes-sigs/prometheus-adapter) |
 | **aws-ebs-csi-driver** | CSI driver for Amazon EBS volumes. Provides persistent storage for workloads on EKS. EKS-specific. | [AWS EBS CSI Driver](https://github.com/kubernetes-sigs/aws-ebs-csi-driver) |
 | **k8s-ephemeral-storage-metrics** | Exports ephemeral storage usage metrics per pod. Useful for monitoring scratch space consumption on GPU nodes. | [k8s-ephemeral-storage-metrics](https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics) |
-| **kai-scheduler** | DRA-aware gang scheduler with hierarchical queues and topology-aware placement. Ensures distributed training jobs land on nodes with optimal interconnect topology. | [KAI Scheduler](https://github.com/NVIDIA/KAI-Scheduler) |
+| **kai-scheduler** | DRA-aware gang scheduler with hierarchical queues and topology-aware placement. Ensures distributed training jobs land on nodes with optimal interconnect topology. | [KAI Scheduler](https://github.com/kai-scheduler/KAI-Scheduler) |
 | **grove** | Pod lifecycle management for Dynamo inference platform. Installed as a standalone component. | [Grove](https://github.com/ai-dynamo/grove) |
 | **dynamo-platform** | NVIDIA Dynamo inference serving platform with bundled CRDs. Distributed inference with prefix-cache-aware routing and disaggregated prefill/decode. | [Dynamo](https://github.com/ai-dynamo/dynamo) |
 | **kgateway-crds** | Custom Resource Definitions for kgateway (Kubernetes Gateway API implementation). | [kgateway](https://github.com/kgateway-dev/kgateway) |

@@ -76,8 +76,8 @@ componentRefs:
     namespace: kai-scheduler
     chart: kai-scheduler
     type: Helm
-    source: oci://ghcr.io/nvidia/kai-scheduler
-    version: v0.13.0
+    source: oci://ghcr.io/kai-scheduler/kai-scheduler
+    version: v0.14.1
     valuesFile: components/kai-scheduler/values.yaml
     dependencyRefs:
       - gpu-operator

@@ -1895,8 +1895,8 @@ func TestBundleGolden_KaiSchedulerPresent(t *testing.T) {
 					Name:      "kai-scheduler",
 					Namespace: "kai-scheduler",
 					Chart:     "kai-scheduler",
-					Version:   "v0.13.0",
-					Source:    "oci://ghcr.io/nvidia/kai-scheduler",
+					Version:   "v0.14.1",
+					Source:    "oci://ghcr.io/kai-scheduler/kai-scheduler",
 				},
 			},
 			DeploymentOrder: []string{"kai-scheduler"},

@@ -1,3 +1,3 @@
-CHART='oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler'
+CHART='oci://ghcr.io/kai-scheduler/kai-scheduler/kai-scheduler'
 REPO=''
-VERSION='v0.13.0'
+VERSION='v0.14.1'
@@ -18,7 +18,7 @@ via its own `install.sh`:
 
 | Component | Version | Namespace | Source |
 |-----------|---------|-----------|--------|
-| kai-scheduler | v0.13.0 | kai-scheduler | kai-scheduler (oci://ghcr.io/nvidia/kai-scheduler) |
+| kai-scheduler | v0.14.1 | kai-scheduler | kai-scheduler (oci://ghcr.io/kai-scheduler/kai-scheduler) |
 
 
 

@@ -25,8 +25,11 @@ metadata:
 spec:
   mlPolicy:
     numNodes: 1
-    torch:
-      numProcPerNode: auto
+    # numProcPerNode was removed from mlPolicy.torch in kubeflow-trainer v2.2.0
+    # (kubeflow/trainer#3239); the per-node process count is now inferred from
+    # the container's nvidia.com/gpu resource limit. mlPolicy.mpi.numProcPerNode
+    # is unaffected.
+    torch: {}
   template:
     spec:
       replicatedJobs:

@@ -88,8 +88,8 @@ spec:
 
     - name: kai-scheduler
       type: Helm
-      source: oci://ghcr.io/nvidia/kai-scheduler
-      version: v0.13.0
+      source: oci://ghcr.io/kai-scheduler/kai-scheduler
+      version: v0.14.1
       valuesFile: components/kai-scheduler/values.yaml
       dependencyRefs:
         - gpu-operator
@@ -364,9 +364,9 @@ components:
     healthCheck:
       assertFile: checks/kai-scheduler/health-check.yaml
     helm:
-      defaultRepository: oci://ghcr.io/nvidia/kai-scheduler
+      defaultRepository: oci://ghcr.io/kai-scheduler/kai-scheduler
       defaultChart: kai-scheduler
-      defaultVersion: v0.13.0
+      defaultVersion: v0.14.1
       defaultNamespace: kai-scheduler
     nodeScheduling:
       system:
@@ -485,7 +485,7 @@ components:
     helm:
       defaultRepository: oci://ghcr.io/kubeflow/charts
       defaultChart: kubeflow-trainer
-      defaultVersion: 2.1.0
+      defaultVersion: 2.2.0
       defaultNamespace: kubeflow
     nodeScheduling:
       system:

@@ -47,8 +47,8 @@ import (
 )
 
 const (
-	// trainerArchiveURL is the GitHub tar.gz archive for Kubeflow Trainer v2.1.0.
-	trainerArchiveURL = "https://github.com/kubeflow/trainer/archive/refs/tags/v2.1.0.tar.gz"
+	// trainerArchiveURL is the GitHub tar.gz archive for Kubeflow Trainer v2.2.0.
+	trainerArchiveURL = "https://github.com/kubeflow/trainer/archive/refs/tags/v2.2.0.tar.gz"
 
 	// trainerKustomizePath is the path within the extracted archive to the manager overlay.
 	trainerKustomizePath = "manifests/overlays/manager"
@@ -89,7 +89,7 @@ func isTrainerInstalled(ctx context.Context, dynamicClient dynamic.Interface) (b
 	return true, nil
 }
 
-// installTrainer downloads the Kubeflow Trainer v2.1.0 archive from GitHub, builds the
+// installTrainer downloads the Kubeflow Trainer v2.2.0 archive from GitHub, builds the
 // kustomize manager overlay entirely in Go (no CLI), and applies every resource to the
 // cluster via the dynamic client.  It returns the list of resources it created so the
 // caller can defer deleteTrainer for cleanup.