From 1d77c491612129b5ea95d000268f4a24415cc4cd Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Thu, 30 Apr 2026 15:23:18 -0700 Subject: [PATCH] fix(recipes): bake AICR scheduling into torch-distributed runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pytorch demo TrainJobs in demos/cuj1-{eks,gke}.md carry per-cluster scheduling boilerplate (`runtimePatches`, formerly `podTemplateOverrides`, with a cluster-specific nodeSelector and tolerations) so the resulting pods land on AICR's tainted GPU nodes. Each TrainJob author has to repeat this; each demo has to be edited to match the per-cluster vocabulary; and the override mechanism keeps changing upstream (PodTemplateOverrides was deprecated in v2.1, replaced by RuntimePatches in v2.2 — kubeflow/trainer#3309). Move the per-cluster scheduling into the runtime instead. AICR's existing `nodeScheduling.accelerated` bundler injection (already used by gpu-operator, nfd, nodewright-customizations, kgateway) writes the CLI flag values into the chart's values.yaml at the listed paths. kubeflow-trainer was the only manifestFiles-using component without an `accelerated:` block. This commit adds it and templates the torch-distributed ClusterTrainingRuntime to consume the injected values, mirroring nodewright-customizations/manifests/tuning.yaml. Three coordinated changes: 1. recipes/registry.yaml — add `nodeScheduling.accelerated` block to the kubeflow-trainer entry. Targets top-level keys `acceleratedNodeSelector` and `acceleratedTolerations`. 2. recipes/components/kubeflow-trainer/manifests/ torch-distributed-cluster-training-runtime.yaml — insert Helm template directives for scheduling into the pod spec: {{- $kft := index .Values "kubeflow-trainer" }} {{- with $kft.acceleratedNodeSelector }} nodeSelector: {{- toYaml . | nindent 20 }} {{- end }} {{- with $kft.acceleratedTolerations }} tolerations: {{- toYaml . 
| nindent 20 }} {{- end }} `index .Values "kubeflow-trainer"` matches the bundler's `manifest.RenderInput.Values` shape (values nested under ComponentName). The bundler renders this template at bundle time — the artifact in `bundle/-kubeflow-trainer-post/templates/` is plain YAML with concrete values substituted. 3. demos/cuj1-eks.md and demos/cuj1-gke.md — drop the entire `runtimePatches` block. The demo TrainJob is now just `trainer:` + `runtimeRef:`. API-version-agnostic: works on kubeflow-trainer v2.1 (PodTemplateOverrides era) and v2.2+ (RuntimePatches era) identically, because the TrainJob no longer overrides anything — the runtime carries the scheduling. Validated end-to-end on a real EKS H100 cluster: helm-upgrade kubeflow-trainer-post → ClusterTrainingRuntime live with baked tolerations + nodeSelector → bare pytorch-mnist TrainJob admits, schedules with the correct tolerations + nodeSelector inherited from the runtime, trains to completion (accuracy=0.7424 in 21s). `pkg/recipe.TestManifestHelmHooksRequired` still passes — the `helm.sh/hook` annotations are preserved. --- demos/cuj1-eks.md | 28 +++-------------- demos/cuj1-gke.md | 30 +++---------------- ...-distributed-cluster-training-runtime.yaml | 16 ++++++++++ recipes/registry.yaml | 10 +++++++ 4 files changed, 34 insertions(+), 50 deletions(-) diff --git a/demos/cuj1-eks.md b/demos/cuj1-eks.md index 2fd5b4e41..4db45f6de 100644 --- a/demos/cuj1-eks.md +++ b/demos/cuj1-eks.md @@ -97,30 +97,10 @@ spec: nvidia.com/gpu: 1 limits: nvidia.com/gpu: 1 - # Inject AICR-standard GPU node scheduling. kubeflow-trainer v2.2.0 replaced - # podTemplateOverrides with the runtimePatches API (PR kubeflow/trainer#3309). 
- runtimePatches: - - manager: aicr.nvidia.com/demo - trainingRuntimeSpec: - template: - spec: - replicatedJobs: - - name: node - template: - spec: - template: - spec: - nodeSelector: - nodeGroup: gpu-worker - tolerations: - - key: dedicated - operator: Equal - value: worker-workload - effect: NoSchedule - - key: dedicated - operator: Equal - value: worker-workload - effect: NoExecute + # No podTemplateOverrides / runtimePatches needed — the torch-distributed + # ClusterTrainingRuntime carries the cluster-aware nodeSelector and + # tolerations baked in at bundle time from --accelerated-node-selector / + # --accelerated-node-toleration flags. runtimeRef: name: torch-distributed apiGroup: trainer.kubeflow.org diff --git a/demos/cuj1-gke.md b/demos/cuj1-gke.md index 98215b7b8..a1879d27c 100644 --- a/demos/cuj1-gke.md +++ b/demos/cuj1-gke.md @@ -99,32 +99,10 @@ spec: nvidia.com/gpu: 1 limits: nvidia.com/gpu: 1 - # Inject GKE GPU node scheduling. Matches the snapshot/bundle/validate - # tolerations above (`dedicated=gpu-workload:NoSchedule` plus the GKE-managed - # `nvidia.com/gpu=present:NoSchedule` taint). kubeflow-trainer v2.2.0 replaced - # podTemplateOverrides with the runtimePatches API (PR kubeflow/trainer#3309). - runtimePatches: - - manager: aicr.nvidia.com/demo - trainingRuntimeSpec: - template: - spec: - replicatedJobs: - - name: node - template: - spec: - template: - spec: - nodeSelector: - nodeGroup: gpu-worker - tolerations: - - key: dedicated - operator: Equal - value: gpu-workload - effect: NoSchedule - - key: nvidia.com/gpu - operator: Equal - value: present - effect: NoSchedule + # No podTemplateOverrides / runtimePatches needed — the torch-distributed + # ClusterTrainingRuntime carries the cluster-aware nodeSelector and + # tolerations baked in at bundle time from --accelerated-node-selector / + # --accelerated-node-toleration flags. 
runtimeRef: name: torch-distributed apiGroup: trainer.kubeflow.org diff --git a/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml b/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml index 012668cd0..a65bb1f01 100644 --- a/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml +++ b/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml @@ -41,6 +41,22 @@ spec: spec: template: spec: + # nodeSelector and tolerations are injected by the AICR bundler + # from --accelerated-node-selector / --accelerated-node-toleration + # flags via the registry's nodeScheduling.accelerated paths + # (see recipes/registry.yaml). This lets users submit a bare + # TrainJob with no podTemplateOverrides / runtimePatches — the + # runtime carries the per-cluster scheduling vocabulary baked + # in at bundle time. + {{- $kft := index .Values "kubeflow-trainer" }} + {{- with $kft.acceleratedNodeSelector }} + nodeSelector: + {{- toYaml . | nindent 20 }} + {{- end }} + {{- with $kft.acceleratedTolerations }} + tolerations: + {{- toYaml . | nindent 20 }} + {{- end }} containers: - name: node image: pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime diff --git a/recipes/registry.yaml b/recipes/registry.yaml index ff94a24ba..af170a7b1 100644 --- a/recipes/registry.yaml +++ b/recipes/registry.yaml @@ -495,3 +495,13 @@ components: tolerationPaths: - manager.tolerations - jobset.controller.tolerations + # accelerated paths target top-level keys; consumed by the + # torch-distributed ClusterTrainingRuntime template in + # components/kubeflow-trainer/manifests/. Lets users submit a bare + # TrainJob with no podTemplateOverrides / runtimePatches — the + # runtime carries the per-cluster scheduling baked in at bundle time. + accelerated: + nodeSelectorPaths: + - acceleratedNodeSelector + tolerationPaths: + - acceleratedTolerations