From fb835493d7558220a37dceb20bb1f168227b365b Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Wed, 29 Apr 2026 13:23:42 -0700 Subject: [PATCH] chore(recipes): bump 6 components to upstream latest (phase 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 of the version refresh tracked in #698: minor and patch bumps across registry defaults and overlay/mixin pins. No values schema changes required. aws-ebs-csi-driver 2.55.0 -> 2.59.0 cert-manager v1.17.2 -> v1.20.2 kube-prometheus-stack 82.8.0 -> 84.4.0 kueue 0.17.0 -> 0.17.1 nodewright-operator v0.14.0 -> v0.15.1 nvsentinel v1.1.0 -> v1.3.0 Excluded from this PR: - kgateway / kgateway-crds (v2.0.0 -> v2.2.3) — v2.2.3 silently drops the `inferenceExtension.enabled` value (no longer in the chart's values.yaml). v2.0.0 renders inf_ext_rbac.yaml (ClusterRole granting access to inference.networking.x-k8s.io inferencemodels/inferencepools) plus KGW_ENABLE_INFER_EXT env; v2.2.3 renders neither. AICR uses kgateway specifically for the CNCF AI Conformance "Advanced Ingress for AI/ML Inference" requirement, so a silent feature regression here would break inference bundles. Migration to v2.2.3 needs a values + RBAC rework — deferred. - aws-efa (v0.5.3 -> v0.5.26) — 23 patch releases require values cleanup including a real security-posture change (chart now defaults to privileged: true for EFA hardware access, conflicting with our hardened allowPrivilegeEscalation: false override). Deferred to a follow-up so the change can get proper EKS/security review. - kai-scheduler (v0.13.0 -> v0.14.1) — KAI-Scheduler was transferred from NVIDIA/ to kai-scheduler/ org and chart publishing moved with it. New OCI namespace is `ghcr.io/kai-scheduler/kai-scheduler` (the old `ghcr.io/nvidia/kai-scheduler` is frozen at v0.13.0). This is an OCI-source migration plus a bump — coupled changes worth their own follow-up PR rather than mixing into pure pin bumps here. 
- kubeflow-trainer (2.1.0 -> 2.2.0) — chart bump is coupled to a Go change in validators/performance/trainer_lifecycle.go (the hardcoded fallback archive URL needs to track the chart pin). The validator + chart bumps belong together in a follow-up PR to keep this PR pure config / no Go changes. Companion changes: - examples/recipes/{kind,eks-training,aks-training,eks-gb200-ubuntu-training-with-validation}.yaml: refresh the cert-manager, nodewright-operator, kube-prometheus-stack, and nvsentinel pins to match the bumped registry defaults. Matches the convention from prior bump PRs (#283, #336, #450). - examples/recipes/aks-training.yaml: also remove an orphaned `manifestFiles:` reference to components/nvsentinel/manifests/allow-intra-namespace.yaml that has been broken since #415 (the workaround source file was deleted in #309 when nvsentinel was bumped past v0.7.0, but the AKS example was added later by copying from another template and kept the now-stale reference). Bundling examples/recipes/aks-training.yaml currently fails with "file does not exist"; this fix restores it. 
Refs: #698 Closes: #716 --- examples/recipes/aks-training.yaml | 10 ++++------ .../eks-gb200-ubuntu-training-with-validation.yaml | 6 +++--- examples/recipes/eks-training.yaml | 6 +++--- examples/recipes/kind.yaml | 6 +++--- recipes/overlays/base.yaml | 8 ++++---- recipes/overlays/eks.yaml | 2 +- recipes/registry.yaml | 10 +++++----- 7 files changed, 23 insertions(+), 25 deletions(-) diff --git a/examples/recipes/aks-training.yaml b/examples/recipes/aks-training.yaml index e980f6d45..f3325670c 100644 --- a/examples/recipes/aks-training.yaml +++ b/examples/recipes/aks-training.yaml @@ -44,7 +44,7 @@ componentRefs: chart: cert-manager type: Helm source: https://charts.jetstack.io - version: v1.17.2 + version: v1.20.2 valuesFile: components/cert-manager/values.yaml - name: gpu-operator namespace: gpu-operator @@ -86,7 +86,7 @@ componentRefs: chart: kube-prometheus-stack type: Helm source: https://prometheus-community.github.io/helm-charts - version: 82.8.0 + version: 84.4.0 valuesFile: components/kube-prometheus-stack/values.yaml overrides: prometheus: @@ -119,13 +119,11 @@ componentRefs: chart: nvsentinel type: Helm source: oci://ghcr.io/nvidia - version: v0.10.0 + version: v1.3.0 valuesFile: components/nvsentinel/values.yaml dependencyRefs: - cert-manager - gpu-operator - manifestFiles: - - components/nvsentinel/manifests/allow-intra-namespace.yaml - name: prometheus-adapter namespace: monitoring chart: prometheus-adapter @@ -140,7 +138,7 @@ componentRefs: chart: skyhook-operator type: Helm source: https://helm.ngc.nvidia.com/nvidia/skyhook - version: v0.13.1 + version: v0.15.1 valuesFile: components/nodewright-operator/values.yaml deploymentOrder: - cert-manager diff --git a/examples/recipes/eks-gb200-ubuntu-training-with-validation.yaml b/examples/recipes/eks-gb200-ubuntu-training-with-validation.yaml index 2239beed5..51d8a2807 100644 --- a/examples/recipes/eks-gb200-ubuntu-training-with-validation.yaml +++ 
b/examples/recipes/eks-gb200-ubuntu-training-with-validation.yaml @@ -83,7 +83,7 @@ componentRefs: chart: cert-manager type: Helm source: https://charts.jetstack.io - version: v1.17.2 + version: v1.20.2 valuesFile: components/cert-manager/values.yaml - name: gpu-operator @@ -131,7 +131,7 @@ componentRefs: chart: nvsentinel type: Helm source: oci://ghcr.io/nvidia - version: v0.6.0 + version: v1.3.0 valuesFile: components/nvsentinel/values.yaml dependencyRefs: - cert-manager @@ -141,7 +141,7 @@ componentRefs: chart: skyhook-operator type: Helm source: https://helm.ngc.nvidia.com/nvidia/skyhook - version: 0.14.0 + version: v0.15.1 valuesFile: components/nodewright-operator/values.yaml overrides: customization: ubuntu diff --git a/examples/recipes/eks-training.yaml b/examples/recipes/eks-training.yaml index fa51c96f0..9213890d1 100644 --- a/examples/recipes/eks-training.yaml +++ b/examples/recipes/eks-training.yaml @@ -34,7 +34,7 @@ componentRefs: chart: cert-manager type: Helm source: https://charts.jetstack.io - version: v1.17.2 + version: v1.20.2 valuesFile: components/cert-manager/values.yaml - name: gpu-operator namespace: gpu-operator @@ -50,7 +50,7 @@ componentRefs: chart: nvsentinel type: Helm source: oci://ghcr.io/nvidia - version: v0.6.0 + version: v1.3.0 valuesFile: components/nvsentinel/values.yaml dependencyRefs: - cert-manager @@ -59,7 +59,7 @@ componentRefs: chart: skyhook-operator type: Helm source: https://helm.ngc.nvidia.com/nvidia/skyhook - version: 0.14.0 + version: v0.15.1 valuesFile: components/nodewright-operator/values.yaml deploymentOrder: - cert-manager diff --git a/examples/recipes/kind.yaml b/examples/recipes/kind.yaml index 84c5135e7..eded830e4 100644 --- a/examples/recipes/kind.yaml +++ b/examples/recipes/kind.yaml @@ -30,21 +30,21 @@ componentRefs: chart: cert-manager type: Helm source: https://charts.jetstack.io - version: v1.17.2 + version: v1.20.2 valuesFile: components/cert-manager/values.yaml - name: nodewright-operator namespace: 
skyhook chart: skyhook-operator type: Helm source: https://helm.ngc.nvidia.com/nvidia/skyhook - version: 0.14.0 + version: v0.15.1 valuesFile: components/nodewright-operator/values.yaml - name: kube-prometheus-stack namespace: nvidia-system chart: kube-prometheus-stack type: Helm source: https://prometheus-community.github.io/helm-charts - version: 82.8.0 + version: 84.4.0 valuesFile: components/kube-prometheus-stack/values.yaml - name: k8s-ephemeral-storage-metrics namespace: nvidia-system diff --git a/recipes/overlays/base.yaml b/recipes/overlays/base.yaml index 56dabf98a..7a637b0ea 100644 --- a/recipes/overlays/base.yaml +++ b/recipes/overlays/base.yaml @@ -34,7 +34,7 @@ spec: - name: cert-manager type: Helm source: https://charts.jetstack.io - version: v1.17.2 + version: v1.20.2 valuesFile: components/cert-manager/values.yaml - name: gpu-operator @@ -52,7 +52,7 @@ spec: - name: nvsentinel type: Helm source: oci://ghcr.io/nvidia - version: v1.1.0 + version: v1.3.0 valuesFile: components/nvsentinel/values.yaml dependencyRefs: - cert-manager @@ -61,13 +61,13 @@ spec: - name: nodewright-operator type: Helm source: https://helm.ngc.nvidia.com/nvidia/skyhook - version: v0.14.0 + version: v0.15.1 valuesFile: components/nodewright-operator/values.yaml - name: kube-prometheus-stack type: Helm source: https://prometheus-community.github.io/helm-charts - version: 82.8.0 + version: 84.4.0 valuesFile: components/kube-prometheus-stack/values.yaml - name: k8s-ephemeral-storage-metrics diff --git a/recipes/overlays/eks.yaml b/recipes/overlays/eks.yaml index 173bbe1ba..5128e52dc 100644 --- a/recipes/overlays/eks.yaml +++ b/recipes/overlays/eks.yaml @@ -38,7 +38,7 @@ spec: - name: aws-ebs-csi-driver type: Helm source: https://kubernetes-sigs.github.io/aws-ebs-csi-driver - version: 2.55.0 + version: 2.59.0 valuesFile: components/aws-ebs-csi-driver/values.yaml # Enable Prometheus persistent storage for EKS (requires EBS CSI driver) diff --git a/recipes/registry.yaml 
b/recipes/registry.yaml index bc7b73c31..d7f71d32d 100644 --- a/recipes/registry.yaml +++ b/recipes/registry.yaml @@ -145,7 +145,7 @@ components: helm: defaultRepository: https://charts.jetstack.io defaultChart: jetstack/cert-manager - defaultVersion: v1.17.2 + defaultVersion: v1.20.2 defaultNamespace: cert-manager nodeScheduling: system: @@ -228,7 +228,7 @@ components: helm: defaultRepository: https://helm.ngc.nvidia.com/nvidia defaultChart: nvidia/nvsentinel - defaultVersion: v1.1.0 + defaultVersion: v1.3.0 defaultNamespace: nvsentinel nodeScheduling: system: @@ -269,7 +269,7 @@ components: helm: defaultRepository: https://prometheus-community.github.io/helm-charts defaultChart: prometheus-community/kube-prometheus-stack - defaultVersion: 82.8.0 + defaultVersion: 84.4.0 defaultNamespace: monitoring nodeScheduling: system: @@ -320,7 +320,7 @@ components: helm: defaultRepository: https://kubernetes-sigs.github.io/aws-ebs-csi-driver defaultChart: aws-ebs-csi-driver/aws-ebs-csi-driver - defaultVersion: 2.55.0 + defaultVersion: 2.59.0 defaultNamespace: kube-system nodeScheduling: system: @@ -461,7 +461,7 @@ components: helm: defaultRepository: oci://registry.k8s.io/kueue/charts defaultChart: kueue - defaultVersion: "0.17.0" + defaultVersion: "0.17.1" defaultNamespace: kueue-system nodeScheduling: system: