From 3315615234b7923f00cfd388aa35d442a8960778 Mon Sep 17 00:00:00 2001 From: Brian Lockwood Date: Fri, 24 Apr 2026 15:27:39 -0700 Subject: [PATCH] fix(bundler): v0.12 rc1 smoke-test follow-ups --- docs/user/cli-reference.md | 4 ++-- pkg/bundler/config/config_test.go | 6 ++--- pkg/bundler/deployer/argocdhelm/argocdhelm.go | 2 ++ pkg/bundler/deployer/helm/helm.go | 2 ++ .../deployer/helm/templates/README.md.tmpl | 22 ++++++++++++++----- .../helm/templates/component-README.md.tmpl | 4 ++++ pkg/snapshotter/agent_test.go | 4 ++-- 7 files changed, 31 insertions(+), 13 deletions(-) diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index 4a4e930dd..302684d5a 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -991,7 +991,7 @@ aicr bundle -r recipe.yaml --nodes 8 -o ./bundles # Day 2 options: workload-gate and workload-selector for nodewright aicr bundle -r recipe.yaml \ - --workload-gate skyhook.io/runtime-required=true:NoSchedule \ + --workload-gate skyhook.nvidia.com/runtime-required=true:NoSchedule \ --workload-selector workload-type=training \ -o ./bundles @@ -1230,7 +1230,7 @@ aicr bundle -r recipe.yaml \ ```shell # Generate bundle with day 2 options for training workloads aicr bundle -r recipe.yaml \ - --workload-gate skyhook.io/runtime-required=true:NoSchedule \ + --workload-gate skyhook.nvidia.com/runtime-required=true:NoSchedule \ --workload-selector workload-type=training \ --workload-selector intent=training \ --accelerated-node-selector accelerator=nvidia-h100 \ diff --git a/pkg/bundler/config/config_test.go b/pkg/bundler/config/config_test.go index e3d608cb5..b3205c91c 100644 --- a/pkg/bundler/config/config_test.go +++ b/pkg/bundler/config/config_test.go @@ -831,7 +831,7 @@ func TestDeployerTypeString(t *testing.T) { func TestWorkloadGateTaintOptions(t *testing.T) { t.Run("WithWorkloadGateTaint with valid taint", func(t *testing.T) { taint := &corev1.Taint{ - Key: "skyhook.io/runtime-required", + Key: 
"skyhook.nvidia.com/runtime-required", Value: "true", Effect: corev1.TaintEffectNoSchedule, } @@ -841,8 +841,8 @@ func TestWorkloadGateTaintOptions(t *testing.T) { if got == nil { t.Fatal("WorkloadGateTaint() returned nil") } - if got.Key != "skyhook.io/runtime-required" { - t.Errorf("WorkloadGateTaint().Key = %s, want skyhook.io/runtime-required", got.Key) + if got.Key != "skyhook.nvidia.com/runtime-required" { + t.Errorf("WorkloadGateTaint().Key = %s, want skyhook.nvidia.com/runtime-required", got.Key) } if got.Value != "true" { t.Errorf("WorkloadGateTaint().Value = %s, want true", got.Value) diff --git a/pkg/bundler/deployer/argocdhelm/argocdhelm.go b/pkg/bundler/deployer/argocdhelm/argocdhelm.go index 373f5c103..768efb5d5 100644 --- a/pkg/bundler/deployer/argocdhelm/argocdhelm.go +++ b/pkg/bundler/deployer/argocdhelm/argocdhelm.go @@ -277,6 +277,8 @@ func (g *Generator) writeStaticValuesAndBuildStubs(outputDir string) ([]string, component.RemoveValueByPath(staticValues, path) component.SetValueByPath(stubs, path, val) } else { + slog.Warn("dynamic path not found in component values; introducing empty placeholder", + "component", ref.Name, "path", path) component.SetValueByPath(stubs, path, "") } } diff --git a/pkg/bundler/deployer/helm/helm.go b/pkg/bundler/deployer/helm/helm.go index 0f808d172..a71aa8ac8 100644 --- a/pkg/bundler/deployer/helm/helm.go +++ b/pkg/bundler/deployer/helm/helm.go @@ -520,6 +520,8 @@ func writeClusterValuesFile(values map[string]any, dynamicPaths []string, compon component.RemoveValueByPath(values, path) } else { val = "" + slog.Warn("dynamic path not found in component values; introducing empty placeholder", + "component", componentName, "path", path) } component.SetValueByPath(clusterValues, path, val) } diff --git a/pkg/bundler/deployer/helm/templates/README.md.tmpl b/pkg/bundler/deployer/helm/templates/README.md.tmpl index 34e33bb3e..3c3e874f4 100644 --- a/pkg/bundler/deployer/helm/templates/README.md.tmpl +++ 
b/pkg/bundler/deployer/helm/templates/README.md.tmpl @@ -80,6 +80,7 @@ helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ --version {{ .Version }} \ -n {{ .Namespace }} --create-namespace \ -f {{ .Name }}/values.yaml \ + -f {{ .Name }}/cluster-values.yaml \ --wait --timeout 10m {{ else -}} helm upgrade --install {{ .Name }} {{ .ChartName }} \ @@ -87,6 +88,7 @@ helm upgrade --install {{ .Name }} {{ .ChartName }} \ --version {{ .Version }} \ -n {{ .Namespace }} --create-namespace \ -f {{ .Name }}/values.yaml \ + -f {{ .Name }}/cluster-values.yaml \ --wait --timeout 10m {{ end -}} ``` @@ -100,19 +102,27 @@ kubectl apply -f {{ .Name }}/manifests/ ## Customization -Each Helm component has its own `values.yaml` in its directory. -Edit the file before deploying to customize component configuration: +Each Helm component has two values files in its directory: -```bash -vim gpu-operator/values.yaml -``` +- `values.yaml` — resolved configuration from the recipe. Edit to override defaults: + + ```bash + vim gpu-operator/values.yaml + ``` + +- `cluster-values.yaml` — install-time parameters. Any paths declared with + `aicr bundle --dynamic <component>:<path>` are pulled out of `values.yaml` + and placed here for you to fill in. The file is always created (empty if + no dynamic paths were declared) and passed to `helm upgrade --install` + alongside `values.yaml` by both `deploy.sh` and the per-component commands + in the "Manual Installation" section above. 
## Upgrade To upgrade a specific Helm component: ```bash -helm upgrade <component> <chart> --version <version> -n <namespace> -f <component>/values.yaml --wait --timeout 10m +helm upgrade <component> <chart> --version <version> -n <namespace> -f <component>/values.yaml -f <component>/cluster-values.yaml --wait --timeout 10m ``` ## Uninstall diff --git a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl index 79b49b379..068bfcd28 100644 --- a/pkg/bundler/deployer/helm/templates/component-README.md.tmpl +++ b/pkg/bundler/deployer/helm/templates/component-README.md.tmpl @@ -46,6 +46,7 @@ helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ --version {{ .Version }} \ -n {{ .Namespace }} --create-namespace \ -f values.yaml \ + -f cluster-values.yaml \ --wait --timeout 10m {{ else -}} helm upgrade --install {{ .Name }} {{ .ChartName }} \ @@ -53,6 +54,7 @@ helm upgrade --install {{ .Name }} {{ .ChartName }} \ --version {{ .Version }} \ -n {{ .Namespace }} --create-namespace \ -f values.yaml \ + -f cluster-values.yaml \ --wait --timeout 10m {{ end -}} ``` @@ -71,6 +73,7 @@ helm upgrade {{ .Name }} {{ .Repository }}/{{ .ChartName }} \ --version {{ .Version }} \ -n {{ .Namespace }} \ -f values.yaml \ + -f cluster-values.yaml \ --wait --timeout 10m {{ else -}} helm upgrade {{ .Name }} {{ .ChartName }} \ @@ -78,6 +81,7 @@ helm upgrade {{ .Name }} {{ .ChartName }} \ --version {{ .Version }} \ -n {{ .Namespace }} \ -f values.yaml \ + -f cluster-values.yaml \ --wait --timeout 10m {{ end -}} ``` diff --git a/pkg/snapshotter/agent_test.go b/pkg/snapshotter/agent_test.go index b818e0262..41a2218a7 100644 --- a/pkg/snapshotter/agent_test.go +++ b/pkg/snapshotter/agent_test.go @@ -141,9 +141,9 @@ func TestParseTaint(t *testing.T) { }{ { name: "taint with key, value, and effect", - taintStr: "skyhook.io/runtime-required=true:NoSchedule", + taintStr: "skyhook.nvidia.com/runtime-required=true:NoSchedule", want: &corev1.Taint{ - Key: "skyhook.io/runtime-required", + Key: "skyhook.nvidia.com/runtime-required", 
Value: "true", Effect: corev1.TaintEffectNoSchedule, },