diff --git a/demos/cuj1-eks.md b/demos/cuj1-eks.md index eda8f9481..2fd5b4e41 100644 --- a/demos/cuj1-eks.md +++ b/demos/cuj1-eks.md @@ -97,12 +97,30 @@ spec: nvidia.com/gpu: 1 limits: nvidia.com/gpu: 1 - podTemplateOverrides: - - targetJobs: - - name: node - spec: - tolerations: - - operator: Exists + # Inject AICR-standard GPU node scheduling. kubeflow-trainer v2.2.0 replaced + # podTemplateOverrides with the runtimePatches API (PR kubeflow/trainer#3309). + runtimePatches: + - manager: aicr.nvidia.com/demo + trainingRuntimeSpec: + template: + spec: + replicatedJobs: + - name: node + template: + spec: + template: + spec: + nodeSelector: + nodeGroup: gpu-worker + tolerations: + - key: dedicated + operator: Equal + value: worker-workload + effect: NoSchedule + - key: dedicated + operator: Equal + value: worker-workload + effect: NoExecute runtimeRef: name: torch-distributed apiGroup: trainer.kubeflow.org diff --git a/demos/cuj1-gke.md b/demos/cuj1-gke.md index 7673a9c34..98215b7b8 100644 --- a/demos/cuj1-gke.md +++ b/demos/cuj1-gke.md @@ -99,12 +99,32 @@ spec: nvidia.com/gpu: 1 limits: nvidia.com/gpu: 1 - podTemplateOverrides: - - targetJobs: - - name: node - spec: - tolerations: - - operator: Exists + # Inject GKE GPU node scheduling. Matches the snapshot/bundle/validate + # tolerations above (`dedicated=gpu-workload:NoSchedule` plus the GKE-managed + # `nvidia.com/gpu=present:NoSchedule` taint). kubeflow-trainer v2.2.0 replaced + # podTemplateOverrides with the runtimePatches API (PR kubeflow/trainer#3309). + runtimePatches: + - manager: aicr.nvidia.com/demo + trainingRuntimeSpec: + template: + spec: + replicatedJobs: + - name: node + template: + spec: + template: + spec: + nodeSelector: + nodeGroup: gpu-worker + tolerations: + - key: dedicated + operator: Equal + value: gpu-workload + effect: NoSchedule + - key: nvidia.com/gpu + operator: Equal + value: present + effect: NoSchedule runtimeRef: name: torch-distributed apiGroup: trainer.kubeflow.org diff --git a/docs/contributor/component.md b/docs/contributor/component.md index 4a61706e5..365d69cc6 100644 --- a/docs/contributor/component.md +++ b/docs/contributor/component.md @@ -430,7 +430,7 @@ Deployers respect the `deploymentOrder` field from the recipe to ensure componen ```yaml componentRefs: - name: cert-manager - version: v1.17.2 + version: v1.20.2 - name: gpu-operator version: v25.3.3 - name: network-operator diff --git a/docs/contributor/data.md b/docs/contributor/data.md index 7acfa3f6f..a31ae0664 100644 --- a/docs/contributor/data.md +++ b/docs/contributor/data.md @@ -333,7 +333,7 @@ spec: - name: cert-manager type: Helm source: https://charts.jetstack.io - version: v1.17.2 + version: v1.20.2 valuesFile: components/cert-manager/values.yaml - name: gpu-operator @@ -615,7 +615,7 @@ Components can declare dependencies via `dependencyRefs`: componentRefs: - name: cert-manager type: Helm - version: v1.17.2 + version: v1.20.2 - name: gpu-operator type: Helm @@ -997,7 +997,7 @@ curl "http://localhost:8080/v1/recipe?os=ubuntu&service=eks&accelerator=gb200&in "name": "cert-manager", "type": "Helm", "source": "https://charts.jetstack.io", - "version": "v1.17.2", + "version": "v1.20.2", "valuesFile": "components/cert-manager/values.yaml" }, { diff --git a/docs/user/component-catalog.md b/docs/user/component-catalog.md index 396a338bb..54035e229 100644 --- a/docs/user/component-catalog.md +++ b/docs/user/component-catalog.md @@ -24,7 +24,7 @@ The source of truth is [`recipes/registry.yaml`](https://github.com/NVIDIA/aicr/ | **prometheus-adapter** | Exposes custom metrics from Prometheus to the Kubernetes metrics API. Enables HPA scaling based on GPU utilization and other custom metrics. | [prometheus-adapter](https://github.com/kubernetes-sigs/prometheus-adapter) | | **aws-ebs-csi-driver** | CSI driver for Amazon EBS volumes. Provides persistent storage for workloads on EKS. EKS-specific. | [AWS EBS CSI Driver](https://github.com/kubernetes-sigs/aws-ebs-csi-driver) | | **k8s-ephemeral-storage-metrics** | Exports ephemeral storage usage metrics per pod. Useful for monitoring scratch space consumption on GPU nodes. | [k8s-ephemeral-storage-metrics](https://github.com/jmcgrath207/k8s-ephemeral-storage-metrics) | -| **kai-scheduler** | DRA-aware gang scheduler with hierarchical queues and topology-aware placement. Ensures distributed training jobs land on nodes with optimal interconnect topology. | [KAI Scheduler](https://github.com/NVIDIA/KAI-Scheduler) | +| **kai-scheduler** | DRA-aware gang scheduler with hierarchical queues and topology-aware placement. Ensures distributed training jobs land on nodes with optimal interconnect topology. | [KAI Scheduler](https://github.com/kai-scheduler/KAI-Scheduler) | | **grove** | Pod lifecycle management for Dynamo inference platform. Installed as a standalone component. | [Grove](https://github.com/ai-dynamo/grove) | | **dynamo-platform** | NVIDIA Dynamo inference serving platform with bundled CRDs. Distributed inference with prefix-cache-aware routing and disaggregated prefill/decode. | [Dynamo](https://github.com/ai-dynamo/dynamo) | | **kgateway-crds** | Custom Resource Definitions for kgateway (Kubernetes Gateway API implementation). | [kgateway](https://github.com/kgateway-dev/kgateway) | diff --git a/examples/recipes/aks-training.yaml b/examples/recipes/aks-training.yaml index f3325670c..2c7aebaa8 100644 --- a/examples/recipes/aks-training.yaml +++ b/examples/recipes/aks-training.yaml @@ -76,8 +76,8 @@ componentRefs: namespace: kai-scheduler chart: kai-scheduler type: Helm - source: oci://ghcr.io/nvidia/kai-scheduler - version: v0.13.0 + source: oci://ghcr.io/kai-scheduler/kai-scheduler + version: v0.14.1 valuesFile: components/kai-scheduler/values.yaml dependencyRefs: - gpu-operator diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index 48c21acd4..4e16f87e7 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -1895,8 +1895,8 @@ func TestBundleGolden_KaiSchedulerPresent(t *testing.T) { Name: "kai-scheduler", Namespace: "kai-scheduler", Chart: "kai-scheduler", - Version: "v0.13.0", - Source: "oci://ghcr.io/nvidia/kai-scheduler", + Version: "v0.14.1", + Source: "oci://ghcr.io/kai-scheduler/kai-scheduler", }, }, DeploymentOrder: []string{"kai-scheduler"}, diff --git a/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/001-kai-scheduler/upstream.env b/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/001-kai-scheduler/upstream.env index 29f65e84e..b3cad815e 100644 --- a/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/001-kai-scheduler/upstream.env +++ b/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/001-kai-scheduler/upstream.env @@ -1,3 +1,3 @@ -CHART='oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler' +CHART='oci://ghcr.io/kai-scheduler/kai-scheduler/kai-scheduler' REPO='' -VERSION='v0.13.0' +VERSION='v0.14.1' diff --git a/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/README.md b/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/README.md index 776d26421..bb144f929 100644 --- a/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/README.md +++ b/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/README.md @@ -18,7 +18,7 @@ via its own `install.sh`: | Component | Version | Namespace | Source | |-----------|---------|-----------|--------| -| kai-scheduler | v0.13.0 | kai-scheduler | kai-scheduler (oci://ghcr.io/nvidia/kai-scheduler) | +| kai-scheduler | v0.14.1 | kai-scheduler | kai-scheduler (oci://ghcr.io/kai-scheduler/kai-scheduler) | diff --git a/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml b/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml index 501c0b4f8..012668cd0 100644 --- a/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml +++ b/recipes/components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml @@ -25,8 +25,11 @@ metadata: spec: mlPolicy: numNodes: 1 - torch: - numProcPerNode: auto + # numProcPerNode was removed from mlPolicy.torch in kubeflow-trainer v2.2.0 + # (kubeflow/trainer#3239) — Torch now infers parallelism from + # the container's nvidia.com/gpu resource limit. mlPolicy.mpi.numProcPerNode + # is unaffected. + torch: {} template: spec: replicatedJobs: diff --git a/recipes/overlays/base.yaml b/recipes/overlays/base.yaml index 7a637b0ea..4acd3decc 100644 --- a/recipes/overlays/base.yaml +++ b/recipes/overlays/base.yaml @@ -88,8 +88,8 @@ spec: - name: kai-scheduler type: Helm - source: oci://ghcr.io/nvidia/kai-scheduler - version: v0.13.0 + source: oci://ghcr.io/kai-scheduler/kai-scheduler + version: v0.14.1 valuesFile: components/kai-scheduler/values.yaml dependencyRefs: - gpu-operator diff --git a/recipes/registry.yaml b/recipes/registry.yaml index d1b029d0a..ff94a24ba 100644 --- a/recipes/registry.yaml +++ b/recipes/registry.yaml @@ -364,9 +364,9 @@ components: healthCheck: assertFile: checks/kai-scheduler/health-check.yaml helm: - defaultRepository: oci://ghcr.io/nvidia/kai-scheduler + defaultRepository: oci://ghcr.io/kai-scheduler/kai-scheduler defaultChart: kai-scheduler - defaultVersion: v0.13.0 + defaultVersion: v0.14.1 defaultNamespace: kai-scheduler nodeScheduling: system: @@ -485,7 +485,7 @@ components: helm: defaultRepository: oci://ghcr.io/kubeflow/charts defaultChart: kubeflow-trainer - defaultVersion: 2.1.0 + defaultVersion: 2.2.0 defaultNamespace: kubeflow nodeScheduling: system: diff --git a/validators/performance/trainer_lifecycle.go b/validators/performance/trainer_lifecycle.go index 5497dc8d8..65caf0d1e 100644 --- a/validators/performance/trainer_lifecycle.go +++ b/validators/performance/trainer_lifecycle.go @@ -47,8 +47,8 @@ import ( ) const ( - // trainerArchiveURL is the GitHub tar.gz archive for Kubeflow Trainer v2.1.0. - trainerArchiveURL = "https://github.com/kubeflow/trainer/archive/refs/tags/v2.1.0.tar.gz" + // trainerArchiveURL is the GitHub tar.gz archive for Kubeflow Trainer v2.2.0. + trainerArchiveURL = "https://github.com/kubeflow/trainer/archive/refs/tags/v2.2.0.tar.gz" // trainerKustomizePath is the path within the extracted archive to the manager overlay. trainerKustomizePath = "manifests/overlays/manager" @@ -89,7 +89,7 @@ func isTrainerInstalled(ctx context.Context, dynamicClient dynamic.Interface) (b return true, nil } -// installTrainer downloads the Kubeflow Trainer v2.1.0 archive from GitHub, builds the +// installTrainer downloads the Kubeflow Trainer v2.2.0 archive from GitHub, builds the // kustomize manager overlay entirely in Go (no CLI), and applies every resource to the // cluster via the dynamic client. It returns the list of resources it created so the // caller can defer deleteTrainer for cleanup.