From 1d77c491612129b5ea95d000268f4a24415cc4cd Mon Sep 17 00:00:00 2001
From: Yuan Chen <yuanchen97@gmail.com>
Date: Thu, 30 Apr 2026 15:23:18 -0700
Subject: [PATCH] fix(recipes): bake AICR scheduling into torch-distributed
 runtime
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pytorch demo TrainJobs in demos/cuj1-{eks,gke}.md carry per-cluster
scheduling boilerplate (a `runtimePatches` block, formerly
`podTemplateOverrides`, with a cluster-specific nodeSelector and
tolerations) so the resulting pods land on AICR's tainted GPU nodes.
Each TrainJob author has to repeat this; each demo has to be edited to
match the per-cluster vocabulary; and the override mechanism keeps
changing upstream (PodTemplateOverrides was deprecated in v2.1, replaced
by RuntimePatches in v2.2 — kubeflow/trainer#3309).

Move the per-cluster scheduling into the runtime instead. AICR's
existing `nodeScheduling.accelerated` bundler injection (already used
by gpu-operator, nfd, nodewright-customizations, kgateway) writes the
CLI flag values into the chart's values.yaml at the listed paths.
kubeflow-trainer was the only manifestFiles-using component without an
`accelerated:` block. This commit adds it and templates the
torch-distributed ClusterTrainingRuntime to consume the injected
values, mirroring nodewright-customizations/manifests/tuning.yaml.

Three coordinated changes:

1. recipes/registry.yaml — add `nodeScheduling.accelerated` block to
   the kubeflow-trainer entry. Targets top-level keys
   `acceleratedNodeSelector` and `acceleratedTolerations`.

2. recipes/components/kubeflow-trainer/manifests/
   torch-distributed-cluster-training-runtime.yaml — replace the
   static pod-spec scheduling region with Helm template directives:

       {{- $kft := index .Values "kubeflow-trainer" }}
       {{- with $kft.acceleratedNodeSelector }}
       nodeSelector:
         {{- toYaml . | nindent 20 }}
       {{- end }}
       {{- with $kft.acceleratedTolerations }}
       tolerations:
         {{- toYaml . | nindent 20 }}
       {{- end }}

   `index .Values "kubeflow-trainer"` matches the bundler's
   `manifest.RenderInput.Values` shape (values nested under
   ComponentName). The bundler renders this template at bundle time —
   the artifact in `bundle/<NNN>-kubeflow-trainer-post/templates/`
   is plain YAML with concrete values substituted.

3. demos/cuj1-eks.md and demos/cuj1-gke.md — drop the entire
   `runtimePatches` block (the v2.2 successor to
   `podTemplateOverrides`). Each demo TrainJob is now just `trainer:` +
   `runtimeRef:`.

API-version-agnostic: works on kubeflow-trainer v2.1 (PodTemplateOverrides
era) and v2.2+ (RuntimePatches era) identically, because the TrainJob
no longer overrides anything — the runtime carries the scheduling.

Validated end-to-end on a real EKS H100 cluster:
helm-upgrade kubeflow-trainer-post → CTR live with baked tolerations
+ nodeSelector → bare pytorch-mnist TrainJob admits, schedules with
the correct tolerations + nodeSelector inherited from the runtime,
trains to completion (accuracy=0.7424 in 21s).

`pkg/recipe.TestManifestHelmHooksRequired` still passes — the
`helm.sh/hook` annotations are preserved.
---
 demos/cuj1-eks.md                             | 28 +++--------------
 demos/cuj1-gke.md                             | 30 +++----------------
 ...-distributed-cluster-training-runtime.yaml | 16 ++++++++++
 recipes/registry.yaml                         | 10 +++++++
 4 files changed, 34 insertions(+), 50 deletions(-)
diff --git a/demos/cuj1-eks.md b/demos/cuj1-eks.md
index 2fd5b4e41..4db45f6de 100644
--- a/demos/cuj1-eks.md
+++ b/demos/cuj1-eks.md
@@ -97,30 +97,10 @@ spec:
         nvidia.com/gpu: 1
       limits:
         nvidia.com/gpu: 1
-  # Inject AICR-standard GPU node scheduling. kubeflow-trainer v2.2.0 replaced
-  # podTemplateOverrides with the runtimePatches API (PR kubeflow/trainer#3309).
-  runtimePatches:
-    - manager: aicr.nvidia.com/demo
-      trainingRuntimeSpec:
-        template:
-          spec:
-            replicatedJobs:
-              - name: node
-                template:
-                  spec:
-                    template:
-                      spec:
-                        nodeSelector:
-                          nodeGroup: gpu-worker
-                        tolerations:
-                          - key: dedicated
-                            operator: Equal
-                            value: worker-workload
-                            effect: NoSchedule
-                          - key: dedicated
-                            operator: Equal
-                            value: worker-workload
-                            effect: NoExecute
+  # No podTemplateOverrides / runtimePatches needed — the torch-distributed
+  # ClusterTrainingRuntime carries the cluster-aware nodeSelector and
+  # tolerations baked in at bundle time from --accelerated-node-selector /
+  # --accelerated-node-toleration flags.
   runtimeRef:
     name: torch-distributed
     apiGroup: trainer.kubeflow.org
diff --git a/demos/cuj1-gke.md b/demos/cuj1-gke.md
index 98215b7b8..a1879d27c 100644
--- a/demos/cuj1-gke.md
+++ b/demos/cuj1-gke.md
@@ -99,32 +99,10 @@ spec:
         nvidia.com/gpu: 1
       limits:
         nvidia.com/gpu: 1
-  # Inject GKE GPU node scheduling. Matches the snapshot/bundle/validate
-  # tolerations above (`dedicated=gpu-workload:NoSchedule` plus the GKE-managed
-  # `nvidia.com/gpu=present:NoSchedule` taint). kubeflow-trainer v2.2.0 replaced
-  # podTemplateOverrides with the runtimePatches API (PR kubeflow/trainer#3309).
-  runtimePatches:
-    - manager: aicr.nvidia.com/demo
-      trainingRuntimeSpec:
-        template:
-          spec:
-            replicatedJobs:
-              - name: node
-                template:
-                  spec:
-                    template:
-                      spec:
-                        nodeSelector:
-                          nodeGroup: gpu-worker
-                        tolerations:
-                          - key: dedicated
-                            operator: Equal
-                            value: gpu-workload
-                            effect: NoSchedule
-                          - key: nvidia.com/gpu
-                            operator: Equal
-                            value: present
-                            effect: NoSchedule
+  # No podTemplateOverrides / runtimePatches needed — the torch-distributed
+  # ClusterTrainingRuntime carries the cluster-aware nodeSelector and
+  # tolerations baked in at bundle time from --accelerated-node-selector /
+  # --accelerated-node-toleration flags.
   runtimeRef:
     name: torch-distributed
     apiGroup: trainer.kubeflow.org
diff --git a/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml b/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
index 012668cd0..a65bb1f01 100644
--- a/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
+++ b/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
@@ -41,6 +41,22 @@ spec:
             spec:
               template:
                 spec:
+                  # nodeSelector and tolerations are injected by the AICR bundler
+                  # from --accelerated-node-selector / --accelerated-node-toleration
+                  # flags via the registry's nodeScheduling.accelerated paths
+                  # (see recipes/registry.yaml), so a bare TrainJob needs no
+                  # podTemplateOverrides / runtimePatches. `default (dict)` keeps
+                  # the lookups below from failing with a nil-pointer error when
+                  # no "kubeflow-trainer" values are present at render time.
+                  {{- $kft := index .Values "kubeflow-trainer" | default (dict) }}
+                  {{- with $kft.acceleratedNodeSelector }}
+                  nodeSelector:
+                    {{- toYaml . | nindent 20 }}
+                  {{- end }}
+                  {{- with $kft.acceleratedTolerations }}
+                  tolerations:
+                    {{- toYaml . | nindent 20 }}
+                  {{- end }}
                   containers:
                     - name: node
                       image: pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime
diff --git a/recipes/registry.yaml b/recipes/registry.yaml
index ff94a24ba..af170a7b1 100644
--- a/recipes/registry.yaml
+++ b/recipes/registry.yaml
@@ -495,3 +495,13 @@ components:
         tolerationPaths:
           - manager.tolerations
           - jobset.controller.tolerations
+      # accelerated paths are top-level keys in this component's values,
+      # read by the torch-distributed ClusterTrainingRuntime template in
+      # components/kubeflow-trainer/manifests/. Users can then submit a
+      # bare TrainJob with no podTemplateOverrides / runtimePatches: the
+      # runtime carries the per-cluster scheduling baked in at bundle time.
+      accelerated:
+        nodeSelectorPaths:
+          - acceleratedNodeSelector
+        tolerationPaths:
+          - acceleratedTolerations