From f5f3bb4cd44d724e243609a183b5ac83db3b402b Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 12:09:11 +0200 Subject: [PATCH 01/14] feat(slinky-slurm): add cluster chart and EKS/Kind leaves MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire the slinky `slurm` cluster chart into AICR as an opt-in component (`platform-slurm-cluster` mixin) and ship the first two leaf recipes that consume both the Slinky operator and cluster: `h100-eks-ubuntu-training-slurm` and `h100-kind-training-slurm`. Component values mirror the chart's native `slinky` keys for nodesets and loginsets so user overrides merge cleanly with chart defaults (avoids the `nodeset.logfile` typed-merge failure observed when using custom sub-keys). Production-leaning defaults: priority/multifactor scheduler tuning via `extraConfMap`, cgroup/gres config files, ClusterIP restapi. Accounting (slurmdbd) and DCGM job-mapping are disabled by default — accounting needs an external MariaDB AICR does not bundle, DCGM needs dcgm-exporter on workers; both opt in via valuesFile / --set. The accounting.storageConfig block is kept as an inert example because the chart wraps the entire Accounting CR in `if accounting.enabled`. Health check asserts both the Slinky CRs (Controller, LoginSet, NodeSet, RestApi at apiVersion `v1beta1` under the `slinky-slurm` release name) and the underlying workload readiness (Deployment/StatefulSet `availableReplicas > 0`) before the generic pod-health step. KWOK validate-scheduling.sh now extracts the `platform` criterion and passes `--platform slurm` so *-slurm overlays resolve to the slinky bundle they ship. The flag is intentionally scoped to slurm: a full matrix run showed the harness's sequential CRD cleanup is broken for NFD/KAI/run.ai (orphans block pre-flight on the second recipe onward), so widening to kubeflow/dynamo is deferred to a follow-up. 
Registry adds nodeScheduling paths for `nodesets.slinky.podSpec.*`, `loginsets.slinky.podSpec.*`, `controller.podSpec.*`, and `restapi.podSpec.*` so both `--system-node-*` and `--accelerated-node-*` flags steer the correct pods. Deployment-order guard gains the two new leaves; a targeted components test verifies the registry paths line up with the actual map-keys in components/slinky-slurm/values.yaml. EKS leaf intentionally has no `validation:` block so it inherits the full set from `h100-eks-training` (via `h100-eks-ubuntu-training`); the Kind leaf follows the same pattern. docs/user/container-images.md regenerated via `make bom-docs`. --- docs/integrator/recipe-development.md | 2 + docs/user/component-catalog.md | 5 +- docs/user/container-images.md | 13 +- kwok/scripts/apply-nodes.sh | 11 +- kwok/scripts/run-all-recipes.sh | 12 ++ kwok/scripts/validate-scheduling.sh | 17 +- pkg/recipe/components_test.go | 73 ++++++++ pkg/recipe/deployment_order_guard_test.go | 44 +++++ .../slinky-slurm-operator/health-check.yaml | 22 +-- recipes/checks/slinky-slurm/health-check.yaml | 141 ++++++++++++++ recipes/components/slinky-slurm/values.yaml | 177 ++++++++++++++++++ recipes/mixins/platform-slurm-cluster.yaml | 30 +++ .../h100-eks-ubuntu-training-slurm.yaml | 50 +++++ .../overlays/h100-kind-training-slurm.yaml | 45 +++++ recipes/registry.yaml | 39 +++- 15 files changed, 650 insertions(+), 31 deletions(-) create mode 100644 recipes/checks/slinky-slurm/health-check.yaml create mode 100644 recipes/components/slinky-slurm/values.yaml create mode 100644 recipes/mixins/platform-slurm-cluster.yaml create mode 100644 recipes/overlays/h100-eks-ubuntu-training-slurm.yaml create mode 100644 recipes/overlays/h100-kind-training-slurm.yaml diff --git a/docs/integrator/recipe-development.md b/docs/integrator/recipe-development.md index 752564708..e27037e5c 100644 --- a/docs/integrator/recipe-development.md +++ b/docs/integrator/recipe-development.md @@ -115,6 +115,8 @@ spec: Mixins use 
`kind: RecipeMixin` and carry only `constraints` and `componentRefs`. They live in `recipes/mixins/` and are applied after inheritance chain merging. See [Data Architecture](../contributor/data.md#mixin-composition) for details. +A platform may split into multiple mixins when parts of the stack are independently opt-in. For example, `--platform slurm` resolves through two mixins: `platform-slurm` always contributes the SchedMD Slinky operator and CRDs, and `platform-slurm-cluster` is opt-in for the Slinky-managed Slurm cluster instance (Controller / LoginSet / NodeSet / RestApi). A leaf that wants operator-only composes just `platform-slurm`; a leaf that wants the cluster too composes both — see `recipes/overlays/h100-eks-ubuntu-training-slurm.yaml` for the latter. + When authoring a recipe targeting Talos (`criteria.os: talos`), append the `os-talos` mixin to your overlay's `spec.mixins` list (e.g. `spec.mixins: [os-talos]`, or `[platform-kubeflow, os-talos]` if you already mix in a non-OS fragment). OS-scoped mixins are mutually exclusive — combining `os-ubuntu` and `os-talos` in one overlay is a recipe authoring error, not a supported composition. The mixin overrides namespaces for affected components and supplies PSA-privileged Namespace manifests via `componentRefs[].preManifestFiles`, which are applied before each chart — see [Talos integration](talos-integration.md) for the component list and labels. **Cross-cutting overlays with wildcard criteria** apply across one criteria dimension without being referenced via `spec.base` or listed in `spec.mixins`. 
The resolver can return multiple independent maximal-leaf overlays for a single query, so a `service: any` overlay is picked up alongside the service-specific maximal leaf and its inheritance chain: diff --git a/docs/user/component-catalog.md b/docs/user/component-catalog.md index 48e42d86e..b31bfbd55 100644 --- a/docs/user/component-catalog.md +++ b/docs/user/component-catalog.md @@ -34,7 +34,8 @@ The source of truth is [`recipes/registry.yaml`](https://github.com/NVIDIA/aicr/ | **kueue** | Kubernetes-native job queuing system. Manages quotas and admits jobs for batch and AI workloads. | [Kueue](https://github.com/kubernetes-sigs/kueue) | | **kubeflow-trainer** | Kubeflow Training Operator for distributed training jobs (PyTorch, etc.). Manages multi-node training job lifecycle with JobSet integration. | [Kubeflow Trainer](https://github.com/kubeflow/trainer) | | **slinky-slurm-operator-crds** | Custom Resource Definitions for the SchedMD Slinky Slurm operator. Installs the `slinky.slurm.net` CRDs (Controller, NodeSet, LoginSet, Accounting, RestApi, Token). Installed separately to support CRD lifecycle management. | [Slinky Slurm Operator](https://github.com/SlinkyProject/slurm-operator) | -| **slinky-slurm-operator** | SchedMD Slinky Slurm operator and admission webhook. Manages the lifecycle of Slurm clusters declared via Slinky CRs. Cluster-instance CRs (Controller, NodeSet, LoginSet, ...) are user-authored — AICR ships only the operator, mirroring how dynamo-platform and kubeflow-trainer ship operator-only. | [Slinky Slurm Operator](https://github.com/SlinkyProject/slurm-operator) | +| **slinky-slurm-operator** | SchedMD Slinky Slurm operator and admission webhook. Manages the lifecycle of Slurm clusters declared via Slinky CRs (Controller, NodeSet, LoginSet, Accounting, RestApi, Token). 
| [Slinky Slurm Operator](https://github.com/SlinkyProject/slurm-operator) | +| **slinky-slurm** | Slinky-managed Slurm cluster instance: Controller (slurmctld) + LoginSet (sackd/sshd) + NodeSet (slurmd) + RestApi (slurmrestd). Reconciled by `slinky-slurm-operator`. Opt-in via the `platform-slurm-cluster` mixin (alongside `platform-slurm` for the operator). Accounting (slurmdbd) requires an external MariaDB and is disabled in defaults — see `recipes/components/slinky-slurm/values.yaml`. | [Slinky Slurm Cluster Chart](https://github.com/SlinkyProject/slurm-operator/tree/main/helm/slurm) | ## How Components Are Selected @@ -43,7 +44,7 @@ Not every component appears in every recipe. The recipe engine selects component - **Base components** (cert-manager, kube-prometheus-stack) appear in most recipes. - **Cloud-specific components** (aws-efa, aws-ebs-csi-driver) are added when the service matches. - **Intent-specific components** (agentgateway, agentgateway-crds) are added based on workload intent (e.g., inference recipes include the inference gateway). -- **Platform-specific components** (slinky-slurm-operator, kubeflow-trainer, dynamo-platform) are added when the recipe selects a matching `--platform`. +- **Platform-specific components** (slinky-slurm-operator, slinky-slurm, kubeflow-trainer, dynamo-platform) are added when the recipe selects a matching `--platform`. For `--platform slurm`, the cluster (`slinky-slurm`) is opt-in via the `platform-slurm-cluster` mixin alongside the always-applied operator (`platform-slurm`); leaves that want operator-only compose just `platform-slurm`. - **Accelerator/OS-specific tuning** (nodewright-customizations, nvidia-dra-driver-gpu) varies by hardware and OS combination. 
### NFD Topology Updater diff --git a/docs/user/container-images.md b/docs/user/container-images.md index e44182ca1..ab2dbc9a4 100644 --- a/docs/user/container-images.md +++ b/docs/user/container-images.md @@ -19,8 +19,8 @@ A machine-readable **CycloneDX 1.6 JSON** companion to this page is produced by ## Summary -- Components: **25** -- Unique images: **71** +- Components: **26** +- Unique images: **76** - Distinct registries: **11** Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.agentgateway.dev`, `docker.io`, `gcr.io`, `ghcr.io`, `gke.gcr.io`, `nvcr.io`, `public.ecr.aws`, `quay.io`, `registry.k8s.io`, `us-docker.pkg.dev` @@ -52,6 +52,7 @@ Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.agentgateway.dev | nvsentinel | helm | nvsentinel | v1.3.0 | 6 | | prometheus-adapter | helm | prometheus-community/prometheus-adapter | 5.3.0 | 1 | | prometheus-operator-crds | helm | prometheus-community/prometheus-operator-crds | 28.0.1 | 0 | +| slinky-slurm | helm | slurm | 1.1.0 | 5 | | slinky-slurm-operator | helm | slurm-operator | 1.1.0 | 2 | | slinky-slurm-operator-crds | helm | slurm-operator-crds | 1.1.0 | 0 | @@ -197,6 +198,14 @@ _No images extracted._ _No images extracted._ +### slinky-slurm + +- `docker.io/library/alpine:3.23.3` +- `ghcr.io/slinkyproject/login:25.11-ubuntu24.04` +- `ghcr.io/slinkyproject/slurmctld:25.11-ubuntu24.04` +- `ghcr.io/slinkyproject/slurmd:25.11-ubuntu24.04` +- `ghcr.io/slinkyproject/slurmrestd:25.11-ubuntu24.04` + ### slinky-slurm-operator - `ghcr.io/slinkyproject/slurm-operator-webhook:1.1.0` diff --git a/kwok/scripts/apply-nodes.sh b/kwok/scripts/apply-nodes.sh index 1c3c066d0..0f51b9525 100755 --- a/kwok/scripts/apply-nodes.sh +++ b/kwok/scripts/apply-nodes.sh @@ -96,7 +96,7 @@ generate_node() { local max_pods="110" # System nodes get control-plane label for operator controllers - if [[ "$node_type" == "system" ]]; then + if [[ "$node_type" == "system" || "$node_type" == "kwok-system" ]]; then 
extra_labels=" node-role.kubernetes.io/control-plane: \"\"" fi @@ -179,11 +179,18 @@ create_nodes() { sys_mem=$(yq eval '.spec.resources.memory' "$sys_profile_path") sys_storage=$(yq eval '.spec.resources.storage' "$sys_profile_path") + # KWOK fake "system" nodes get aicr.nvidia.com/node-type=kwok-system + # (not =system) so they do NOT match --system-node-selector + # aicr.nvidia.com/node-type=system. That selector is owned by the real + # Kind control-plane node (labeled by run-all-recipes.sh ensure_cluster). + # Reason: workloads that provide admission webhooks (cert-manager) must + # land on a real node — KWOK fakes report pods Ready without running a + # container, leaving the webhook unreachable. log_info "Creating $SYSTEM_NODE_COUNT system nodes ($sys_instance)" for ((i = 0; i < SYSTEM_NODE_COUNT; i++)); do local zone node_name="system-${i}" zone="${DEFAULT_ZONES[$((i % ${#DEFAULT_ZONES[@]}))]}" - generate_node "$node_name" "system" "$sys_instance" "$DEFAULT_REGION" "$zone" \ + generate_node "$node_name" "kwok-system" "$sys_instance" "$DEFAULT_REGION" "$zone" \ "$DEFAULT_K8S_VERSION" "$sys_cpu" "$sys_mem" "$sys_storage" "$sys_arch" "$sys_os" \ > "${temp_dir}/${node_name}.yaml" log_info " $node_name ($zone)" diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh index b2c43fd31..9019f7983 100755 --- a/kwok/scripts/run-all-recipes.sh +++ b/kwok/scripts/run-all-recipes.sh @@ -102,6 +102,18 @@ ensure_cluster() { helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system fi + # Pin --system-node-selector aicr.nvidia.com/node-type=system to the real + # Kind control-plane node. KWOK fake "system" nodes carry =kwok-system + # (set by apply-nodes.sh) so they do NOT match. Without this, charts with + # admission webhooks (cert-manager) land on a fake — pods report Ready + # without a running container, webhook is unreachable, downstream installs + # that submit cert-manager.io/Certificate (slinky-slurm-operator) fail. 
+ local cp_node + cp_node=$(kubectl get nodes -l '!type' -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) + if [[ -n "$cp_node" ]]; then + kubectl label node "$cp_node" aicr.nvidia.com/node-type=system --overwrite >/dev/null + fi + # Patch kindnet to exclude KWOK nodes if kubectl get daemonset -n kube-system kindnet &>/dev/null; then kubectl patch daemonset -n kube-system kindnet --type=json -p='[ diff --git a/kwok/scripts/validate-scheduling.sh b/kwok/scripts/validate-scheduling.sh index b44d48830..9aa4f39fe 100755 --- a/kwok/scripts/validate-scheduling.sh +++ b/kwok/scripts/validate-scheduling.sh @@ -311,21 +311,30 @@ generate_bundle() { exit 1 fi - # Extract criteria from overlay - local service accelerator intent os + # Without --platform, *-slurm overlays resolve to their non-platform + # parent and the bundle omits the slinky-slurm operator/cluster. + # Scoped to slurm: kubeflow/dynamo are not yet validated under KWOK. + local service accelerator intent os platform service=$(yq eval '.spec.criteria.service // ""' "$recipe_overlay") accelerator=$(yq eval '.spec.criteria.accelerator // ""' "$recipe_overlay") intent=$(yq eval '.spec.criteria.intent // ""' "$recipe_overlay") os=$(yq eval '.spec.criteria.os // ""' "$recipe_overlay") + platform=$(yq eval '.spec.criteria.platform // ""' "$recipe_overlay") - log_info "Criteria: service=$service accelerator=$accelerator intent=$intent os=$os" + log_info "Criteria: service=$service accelerator=$accelerator intent=$intent os=$os platform=$platform" - # Build recipe command with available criteria local recipe_args=() [[ -n "$service" ]] && recipe_args+=(--service "$service") [[ -n "$accelerator" ]] && recipe_args+=(--accelerator "$accelerator") [[ -n "$intent" ]] && recipe_args+=(--intent "$intent") [[ -n "$os" ]] && recipe_args+=(--os "$os") + # Only forward --platform for platforms validated under KWOK. 
Other + # platforms (kubeflow, dynamo, nim) historically resolve to their + # non-platform parent here; preserve that behavior to avoid regressing + # existing matrix lanes. Extend as additional platforms are validated. + if [[ "$platform" == "slurm" ]]; then + recipe_args+=(--platform "$platform") + fi # Generate resolved recipe from criteria log_info "Generating resolved recipe..." diff --git a/pkg/recipe/components_test.go b/pkg/recipe/components_test.go index b3135e39d..b642c3551 100644 --- a/pkg/recipe/components_test.go +++ b/pkg/recipe/components_test.go @@ -15,6 +15,7 @@ package recipe import ( + "maps" "slices" "strings" "testing" @@ -210,6 +211,78 @@ func TestComponentRegistry_NodeSchedulingPaths(t *testing.T) { } } +// Pins the `slinky` map-key choice for slinky-slurm on both sides: +// the registry's nodeScheduling paths AND components/slinky-slurm/ +// values.yaml must reference the same key, or injected tolerations +// land on a non-existent map entry. +func TestComponentRegistry_SlinkySlurm_NodeSchedulingPaths(t *testing.T) { + registry, err := GetComponentRegistry() + if err != nil { + t.Fatalf("failed to load component registry: %v", err) + } + + slurmCluster := registry.Get("slinky-slurm") + if slurmCluster == nil { + t.Fatal("slinky-slurm not found in registry") + } + + wantSysToleration := []string{ + "controller.podSpec.tolerations", + "restapi.podSpec.tolerations", + "loginsets.slinky.podSpec.tolerations", + } + gotSysToleration := slurmCluster.GetSystemTolerationPaths() + for _, p := range wantSysToleration { + if !slices.Contains(gotSysToleration, p) { + t.Errorf("slinky-slurm system toleration paths missing %q (got %v)", p, gotSysToleration) + } + } + + wantSysSelector := []string{ + "controller.podSpec.nodeSelector", + "restapi.podSpec.nodeSelector", + "loginsets.slinky.podSpec.nodeSelector", + } + gotSysSelector := slurmCluster.GetSystemNodeSelectorPaths() + for _, p := range wantSysSelector { + if !slices.Contains(gotSysSelector, p) { + 
t.Errorf("slinky-slurm system node selector paths missing %q (got %v)", p, gotSysSelector) + } + } + + gotAccelSelector := slurmCluster.GetAcceleratedNodeSelectorPaths() + if !slices.Contains(gotAccelSelector, "nodesets.slinky.podSpec.nodeSelector") { + t.Errorf("slinky-slurm accelerated node selector paths missing %q (got %v)", + "nodesets.slinky.podSpec.nodeSelector", gotAccelSelector) + } + gotAccelToleration := slurmCluster.GetAcceleratedTolerationPaths() + if !slices.Contains(gotAccelToleration, "nodesets.slinky.podSpec.tolerations") { + t.Errorf("slinky-slurm accelerated toleration paths missing %q (got %v)", + "nodesets.slinky.podSpec.tolerations", gotAccelToleration) + } + + const valuesPath = "components/slinky-slurm/values.yaml" + content, err := GetEmbeddedFS().ReadFile(valuesPath) + if err != nil { + t.Fatalf("failed to read %s: %v", valuesPath, err) + } + var values struct { + Nodesets map[string]any `yaml:"nodesets"` + Loginsets map[string]any `yaml:"loginsets"` + } + if err := yaml.Unmarshal(content, &values); err != nil { + t.Fatalf("failed to parse %s: %v", valuesPath, err) + } + if _, ok := values.Nodesets["slinky"]; !ok { + t.Errorf("%s must define nodesets.slinky to match the registry's "+ + "nodeScheduling paths (got nodesets keys: %v)", valuesPath, slices.Sorted(maps.Keys(values.Nodesets))) + } + if _, ok := values.Loginsets["slinky"]; !ok { + t.Errorf("%s must define loginsets.slinky to match the registry's "+ + "nodeScheduling paths (got loginsets keys: %v)", valuesPath, slices.Sorted(maps.Keys(values.Loginsets))) + } +} + func TestComponentRegistry_TaintStrPaths(t *testing.T) { registry, err := GetComponentRegistry() if err != nil { diff --git a/pkg/recipe/deployment_order_guard_test.go b/pkg/recipe/deployment_order_guard_test.go index e653dddd0..d374aa815 100644 --- a/pkg/recipe/deployment_order_guard_test.go +++ b/pkg/recipe/deployment_order_guard_test.go @@ -175,6 +175,50 @@ func TestDeploymentOrderGuards(t *testing.T) { {"gpu-operator", 
"nvsentinel"}, }, }, + { + name: "h100-eks-ubuntu-training-slurm", + criteria: func() *Criteria { + c := NewCriteria() + c.Service = CriteriaServiceEKS + c.Accelerator = CriteriaAcceleratorH100 + c.OS = CriteriaOSUbuntu + c.Intent = CriteriaIntentTraining + c.Platform = CriteriaPlatformSlurm + return c + }, + requiredDeps: map[string][]string{ + "slinky-slurm-operator": {"cert-manager", "slinky-slurm-operator-crds"}, + "slinky-slurm": {"slinky-slurm-operator", "slinky-slurm-operator-crds"}, + }, + requiredOrdering: [][2]string{ + {"cert-manager", "slinky-slurm-operator"}, + {"slinky-slurm-operator-crds", "slinky-slurm-operator"}, + {"slinky-slurm-operator", "slinky-slurm"}, + {"slinky-slurm-operator-crds", "slinky-slurm"}, + {"gpu-operator", "nvsentinel"}, + }, + }, + { + name: "h100-kind-training-slurm", + criteria: func() *Criteria { + c := NewCriteria() + c.Service = CriteriaServiceKind + c.Accelerator = CriteriaAcceleratorH100 + c.Intent = CriteriaIntentTraining + c.Platform = CriteriaPlatformSlurm + return c + }, + requiredDeps: map[string][]string{ + "slinky-slurm-operator": {"cert-manager", "slinky-slurm-operator-crds"}, + "slinky-slurm": {"slinky-slurm-operator", "slinky-slurm-operator-crds"}, + }, + requiredOrdering: [][2]string{ + {"cert-manager", "slinky-slurm-operator"}, + {"slinky-slurm-operator-crds", "slinky-slurm-operator"}, + {"slinky-slurm-operator", "slinky-slurm"}, + {"slinky-slurm-operator-crds", "slinky-slurm"}, + }, + }, } for _, tt := range tests { diff --git a/recipes/checks/slinky-slurm-operator/health-check.yaml b/recipes/checks/slinky-slurm-operator/health-check.yaml index cadc44126..07b811800 100644 --- a/recipes/checks/slinky-slurm-operator/health-check.yaml +++ b/recipes/checks/slinky-slurm-operator/health-check.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,9 +28,6 @@ spec: steps: - name: validate-operator-deployment try: - # Guard against vacuous pass on empty namespace: verify the - # slurm-operator deployment exists and has at least one ready - # replica. - assert: resource: apiVersion: apps/v1 @@ -42,8 +39,7 @@ spec: (availableReplicas > `0`): true - name: validate-webhook-deployment try: - # The webhook must be ready before any Slinky CRs (Controller, - # NodeSet, etc.) can be created, so assert it independently. + # Webhook must be ready before any Slinky CR can be created. - assert: resource: apiVersion: apps/v1 @@ -55,17 +51,9 @@ spec: (availableReplicas > `0`): true - name: validate-all-pods-healthy try: - # Assert no pods are in unhealthy phases. - # Pods must be Running (long-lived) or Succeeded (completed jobs). - # This catches Pending (init containers, scheduling), Failed, and - # Unknown. - # - # chainsaw `error` assertions pass when no matching resource exists, - # which would let this step trivially pass on an empty namespace. - # The two preceding deployment-availability steps prevent that: - # they require both deployments to have at least one ready replica - # in `slinky`, which guarantees pods exist and are inspectable - # before this step runs. + # Catch Pending / Failed / Unknown phases. The preceding + # deployment asserts fail loudly if the workloads are missing, + # so chainsaw `error` here is not asked to detect that case. - error: resource: apiVersion: v1 diff --git a/recipes/checks/slinky-slurm/health-check.yaml b/recipes/checks/slinky-slurm/health-check.yaml new file mode 100644 index 000000000..cb2a9df2d --- /dev/null +++ b/recipes/checks/slinky-slurm/health-check.yaml @@ -0,0 +1,141 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Slinky Slurm cluster health check. +# +# Names assume the AICR localformat release name `slinky-slurm`; the +# trailing `-slinky` comes from the chart's default map-key for +# loginsets/nodesets. Other deployers using a different release name +# need a parameterized variant of this check. +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: slinky-slurm-health-check +spec: + timeouts: + assert: 10m + steps: + - name: validate-controller-cr + try: + - assert: + resource: + apiVersion: slinky.slurm.net/v1beta1 + kind: Controller + metadata: + name: slinky-slurm + namespace: slurm + - name: validate-loginset-cr + try: + - assert: + resource: + apiVersion: slinky.slurm.net/v1beta1 + kind: LoginSet + metadata: + name: slinky-slurm-login-slinky + namespace: slurm + - name: validate-nodeset-cr + try: + - assert: + resource: + apiVersion: slinky.slurm.net/v1beta1 + kind: NodeSet + metadata: + name: slinky-slurm-worker-slinky + namespace: slurm + - name: validate-restapi-cr + try: + - assert: + resource: + apiVersion: slinky.slurm.net/v1beta1 + kind: RestApi + metadata: + name: slinky-slurm + namespace: slurm + # CR existence alone does not prove reconciliation into running + # pods: assert each workload reports a ready replica before the + # pod-phase guard runs against the namespace. 
+ - name: validate-controller-statefulset-ready + try: + - assert: + resource: + apiVersion: apps/v1 + kind: StatefulSet + metadata: + name: slinky-slurm-controller + namespace: slurm + status: + (availableReplicas > `0`): true + - name: validate-login-deployment-ready + try: + - assert: + resource: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: slinky-slurm-login-slinky + namespace: slurm + status: + (availableReplicas > `0`): true + - name: validate-restapi-deployment-ready + try: + - assert: + resource: + apiVersion: apps/v1 + kind: Deployment + metadata: + name: slinky-slurm-restapi + namespace: slurm + status: + (availableReplicas > `0`): true + - name: validate-nodeset-ready + try: + # NodeSet exposes its own status; operator tracks per-node counts. + - assert: + resource: + apiVersion: slinky.slurm.net/v1beta1 + kind: NodeSet + metadata: + name: slinky-slurm-worker-slinky + namespace: slurm + status: + (availableReplicas > `0`): true + - name: validate-all-pods-healthy + try: + # Catch Pending / Failed / Unknown phases. The preceding + # workload asserts fail loudly if the workloads are missing, + # so chainsaw `error` here is not asked to detect that case. + - error: + resource: + apiVersion: v1 + kind: Pod + metadata: + namespace: slurm + status: + phase: Pending + - error: + resource: + apiVersion: v1 + kind: Pod + metadata: + namespace: slurm + status: + phase: Failed + - error: + resource: + apiVersion: v1 + kind: Pod + metadata: + namespace: slurm + status: + phase: Unknown diff --git a/recipes/components/slinky-slurm/values.yaml b/recipes/components/slinky-slurm/values.yaml new file mode 100644 index 000000000..fe5c7d341 --- /dev/null +++ b/recipes/components/slinky-slurm/values.yaml @@ -0,0 +1,177 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Slinky Slurm cluster Helm values. +# +# Overrides the chart's default `slinky` map-keys for `nodesets` / +# `loginsets`: those keys carry the full required sub-tree (image, +# slurmd, logfile sidecar, partition); any other key would have to +# redefine it all. The cross-check in pkg/recipe +# (TestComponentRegistry_SlinkySlurm_NodeSchedulingPaths) enforces this +# alignment with the registry's nodeScheduling paths. +# +# Accounting (slurmdbd) and DCGM job-mapping are disabled by default: +# accounting needs an external MariaDB AICR does not bundle; DCGM needs +# dcgm-exporter on workers. Both opt in via valuesFile / --set. + +# Cgroup isolation + NVIDIA GPU autodetect (no-op without GPUs). +configFiles: + cgroup.conf: | + CgroupPlugin=autodetect + IgnoreSystemd=yes + EnableControllers=yes + ConstrainCores=yes + ConstrainRAMSpace=yes + ConstrainDevices=yes + ConstrainSwapSpace=yes + AllowedRAMSpace=95.0 + AllowedSwapSpace=100.0 + gres.conf: | + AutoDetect=nvidia + +# File-based local users (no LDAP/AD provider bundled). +sssd: + conf: | + [sssd] + config_file_version = 2 + services = nss,pam + domains = LOCAL + + [nss] + filter_groups = root,slurm + filter_users = root,slurm + + [pam] + + [domain/LOCAL] + id_provider = files + auth_provider = files + +controller: + # Pin the logfile sidecar to an immutable Alpine tag (chart default is + # :latest). Re-verify on defaultVersion bumps in case the chart pins + # it upstream. 
+ logfile: + image: + tag: "3.23.3" + persistence: + enabled: true + storageClassName: null + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 4Gi + # All values quoted as strings: extraConfMap is map[string]string and + # bare YAML 1.1 scalars (e.g. `no`) would coerce to boolean false. + # Accounting-only directives belong in the accounting opt-in valuesFile. + extraConfMap: + PriorityType: "priority/multifactor" + PriorityWeightAge: "2000" + PriorityWeightFairShare: "10000" + PriorityWeightJobSize: "1000" + PriorityWeightPartition: "1000" + PriorityWeightQOS: "10000" + PriorityMaxAge: "7-0" + PriorityDecayHalfLife: "7-0" + PriorityFlags: "NO_NORMAL_ALL" + SelectType: "select/cons_tres" + EnforcePartLimits: "no" + ScronParameters: "enable" + # /metrics endpoint on; ServiceMonitor stays at chart default (off) + # so the chart doesn't render a prometheus-operator CR on clusters + # without it. + metrics: + enabled: true + +restapi: + replicas: 1 + slurmrestd: + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + # Chart wraps Service overrides as {metadata, spec}; bare service.type + # is rejected by the RestApi CRD. + service: + spec: + type: ClusterIP + +# Accounting (slurmdbd) is wrapped in `if .Values.accounting.enabled` +# in the chart, so storageConfig below is inert until enabled. Values +# mirror the chart defaults (Service named `mariadb`, Secret named +# `mariadb-password` with key `password`) so operators see the expected +# external MariaDB shape; this is also the wiring AICR will produce +# when a MariaDB component is bundled. +accounting: + enabled: false + storageConfig: + host: mariadb + port: 3306 + database: slurm_acct_db + username: slurm + passwordKeyRef: + name: mariadb-password + key: password + +# Secure default: the login pod is intentionally unreachable via SSH +# (empty rootSshAuthorizedKeys + PasswordAuthentication off) until an +# operator supplies real keys. 
Do not "fix" without wiring key delivery. +loginsets: + slinky: + enabled: true + replicas: 1 + # Pin the initconf sidecar to an immutable Alpine tag (chart default + # is :latest). + initconf: + image: + tag: "3.23.3" + rootSshAuthorizedKeys: | + # PLACEHOLDER -- override via valuesFile or --set before deploy + # ssh-ed25519 AAAA... user@example.com + extraSshdConfig: | + PasswordAuthentication no + PermitEmptyPasswords no + ChallengeResponseAuthentication no + +# Only one partition can be Default=YES in slurm.conf; an override +# valuesFile adding another Default=YES partition makes slurmctld fail +# to start. Note slurmd does NOT auto-register its pod resource limits +# (known upstream issue); for GPU clusters set explicit Gres/Features +# via `nodesets.slinky.extraConfMap` and the matching nvidia.com/gpu +# limit on `nodesets.slinky.slurmd.resources.limits`. +nodesets: + slinky: + enabled: true + scalingMode: StatefulSet + replicas: 1 + # Pin the logfile sidecar to an immutable Alpine tag (chart default + # is :latest). + logfile: + image: + tag: "3.23.3" + partition: + enabled: true + configMap: + State: UP + Default: "YES" + MaxTime: UNLIMITED + +vendor: + nvidia: + dcgm: + enabled: false diff --git a/recipes/mixins/platform-slurm-cluster.yaml b/recipes/mixins/platform-slurm-cluster.yaml new file mode 100644 index 000000000..71ac2ad8b --- /dev/null +++ b/recipes/mixins/platform-slurm-cluster.yaml @@ -0,0 +1,30 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Opt-in mixin: a Slinky-managed Slurm cluster instance (Controller / +# LoginSet / NodeSet / RestApi) on top of the operator from +# platform-slurm. Leaves wanting only the operator compose +# platform-slurm alone; leaves wanting a runnable cluster compose both. +kind: RecipeMixin +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: platform-slurm-cluster +spec: + componentRefs: + - name: slinky-slurm + type: Helm + valuesFile: components/slinky-slurm/values.yaml + dependencyRefs: + - slinky-slurm-operator + - slinky-slurm-operator-crds diff --git a/recipes/overlays/h100-eks-ubuntu-training-slurm.yaml b/recipes/overlays/h100-eks-ubuntu-training-slurm.yaml new file mode 100644 index 000000000..0824fc35d --- /dev/null +++ b/recipes/overlays/h100-eks-ubuntu-training-slurm.yaml @@ -0,0 +1,50 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: h100-eks-ubuntu-training-slurm + +spec: + # H100 + EKS + Ubuntu + training with the Slinky operator and a + # Slinky-managed Slurm cluster. EKS-specific cluster tuning (gp3 + # storage, GPU GRES, DCGM job-mapping) is layered at install time + # via `aicr bundle ... --set slurmcluster:...` or a valuesFile. 
+ base: h100-eks-ubuntu-training + + criteria: + service: eks + accelerator: h100 + os: ubuntu + intent: training + platform: slurm + + mixins: + - os-ubuntu + - platform-slurm + - platform-slurm-cluster + + constraints: + - name: K8s.server.version + value: ">= 1.32.4" + + # Mixin-contributed components cannot be overridden from a leaf; use + # `--set slurmcluster:...` or a valuesFile at install time instead. + # Accelerated nodeSelector/tolerations on slurmd are injected via the + # registry's nodesets.slinky.podSpec.{nodeSelector,tolerations} paths. + componentRefs: [] + + # Validation is inherited from h100-eks-training (operator-health, + # expected-resources, gpu-operator-version, check-nvidia-smi). diff --git a/recipes/overlays/h100-kind-training-slurm.yaml b/recipes/overlays/h100-kind-training-slurm.yaml new file mode 100644 index 000000000..60368a99f --- /dev/null +++ b/recipes/overlays/h100-kind-training-slurm.yaml @@ -0,0 +1,45 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: h100-kind-training-slurm + +spec: + # H100 + Kind + training with the Slinky operator and a Slinky-managed + # Slurm cluster. Kind has no GPUs, so the NodeSet runs CPU-only (no + # GRES, no nvidia.com/gpu, no DCGM); useful for no-GPU CI end-to-end. 
+ base: h100-kind-training + + criteria: + service: kind + accelerator: h100 + intent: training + platform: slurm + + mixins: + - platform-slurm + - platform-slurm-cluster + + # DRA (GA in K8s 1.34) — restated from the parent for clarity. + constraints: + - name: K8s.server.version + value: ">= 1.34" + + # Mixin-contributed components cannot be overridden from a leaf; use + # `--set slurmcluster:...` or a valuesFile at install time instead. + componentRefs: [] + + # Validation is inherited from h100-kind-training. diff --git a/recipes/registry.yaml b/recipes/registry.yaml index bf9ee4c1a..51ecbe1cb 100644 --- a/recipes/registry.yaml +++ b/recipes/registry.yaml @@ -606,10 +606,8 @@ components: - name: slinky-slurm-operator displayName: slinky-slurm-operator - # The short alias `slurm` is reserved here for the operator. When the Slinky `slurm` - # cluster chart (oci://ghcr.io/slinkyproject/charts/slurm) is added - # to AICR, we will assign it a distinct short alias (e.g. - # `slurm-cluster`) so `slurm` continues to route to the operator. + # Short alias `slurm` routes to the operator; the cluster chart + # (slinky-slurm below) uses the distinct alias `slurmcluster`. valueOverrideKeys: - slinkyslurmoperator - slurmoperator @@ -643,3 +641,36 @@ components: tolerationPaths: - operator.tolerations - webhook.tolerations + + - name: slinky-slurm + displayName: slinky-slurm + # Cluster instance chart; wired in via platform-slurm-cluster mixin. + valueOverrideKeys: + - slinkyslurm + - slurmcluster + healthCheck: + assertFile: checks/slinky-slurm/health-check.yaml + helm: + defaultRepository: oci://ghcr.io/slinkyproject/charts + defaultChart: slurm + # When bumping defaultVersion, re-verify the `slinky` default + # map-keys for nodesets/loginsets still exist (the cross-check in + # pkg/recipe TestComponentRegistry_SlinkySlurm_NodeSchedulingPaths + # enforces this, but the chart change must come first). 
+ defaultVersion: "1.1.0" + defaultNamespace: slurm + nodeScheduling: + system: + nodeSelectorPaths: + - controller.podSpec.nodeSelector + - restapi.podSpec.nodeSelector + - loginsets.slinky.podSpec.nodeSelector + tolerationPaths: + - controller.podSpec.tolerations + - restapi.podSpec.tolerations + - loginsets.slinky.podSpec.tolerations + accelerated: + nodeSelectorPaths: + - nodesets.slinky.podSpec.nodeSelector + tolerationPaths: + - nodesets.slinky.podSpec.tolerations From 88418d1896308de419f94c136e614a39c32a06dd Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 16:05:13 +0200 Subject: [PATCH 02/14] ci(kwok): log hint when --platform is unsupported Mark flagged that the slurm-only allowlist will silently bit-rot when a future *-kubeflow or *-dynamo leaf with criteria.platform lands here. Add a log_info so the next person sees the hint instead of mysterious bundle diffs. Cannot fail-closed yet: existing kubeflow/dynamo Tier 2 lanes carry criteria.platform today and have historically resolved to their non-platform parent under KWOK. Widening the allowlist is blocked on the harness CRD-cleanup bug; tracked as a follow-up. --- kwok/scripts/validate-scheduling.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kwok/scripts/validate-scheduling.sh b/kwok/scripts/validate-scheduling.sh index 9aa4f39fe..a248a1fa2 100755 --- a/kwok/scripts/validate-scheduling.sh +++ b/kwok/scripts/validate-scheduling.sh @@ -334,6 +334,8 @@ generate_bundle() { # existing matrix lanes. Extend as additional platforms are validated. 
if [[ "$platform" == "slurm" ]]; then recipe_args+=(--platform "$platform") + elif [[ -n "$platform" ]]; then + log_info "platform=$platform not yet validated under KWOK — resolving without --platform" fi # Generate resolved recipe from criteria From a35dc2b09eee588b09b1347efa650eedd3ca9a51 Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 16:05:37 +0200 Subject: [PATCH 03/14] test(slinky-slurm-operator): restore vacuous-pass note in health check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark's earlier review explained that chainsaw `error` asserts pass vacuously when no resource matches — so the preceding deployment availability checks are load-bearing, not redundant. Previous trim of that comment lost the non-obvious invariant; restore it. --- recipes/checks/slinky-slurm-operator/health-check.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/recipes/checks/slinky-slurm-operator/health-check.yaml b/recipes/checks/slinky-slurm-operator/health-check.yaml index 07b811800..1fcf997c4 100644 --- a/recipes/checks/slinky-slurm-operator/health-check.yaml +++ b/recipes/checks/slinky-slurm-operator/health-check.yaml @@ -51,9 +51,10 @@ spec: (availableReplicas > `0`): true - name: validate-all-pods-healthy try: - # Catch Pending / Failed / Unknown phases. The preceding - # deployment asserts fail loudly if the workloads are missing, - # so chainsaw `error` here is not asked to detect that case. + # Catch Pending / Failed / Unknown phases. Chainsaw `error` passes + # vacuously when no resource matches, so the preceding deployment- + # availability asserts are load-bearing: they guarantee the pods + # exist before this step runs. 
- error: resource: apiVersion: v1 From 46dd144f963c3ab085e6cc02308e9c987398a611 Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 16:06:43 +0200 Subject: [PATCH 04/14] test(slinky-slurm): restore vacuous-pass note in cluster health check Mirror the wording fix applied to slinky-slurm-operator/health-check.yaml: spell out that chainsaw `error` passes vacuously on empty namespaces, so the preceding StatefulSet/Deployment/NodeSet availability asserts are load-bearing. --- recipes/checks/slinky-slurm/health-check.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/recipes/checks/slinky-slurm/health-check.yaml b/recipes/checks/slinky-slurm/health-check.yaml index cb2a9df2d..1a92c8fff 100644 --- a/recipes/checks/slinky-slurm/health-check.yaml +++ b/recipes/checks/slinky-slurm/health-check.yaml @@ -112,9 +112,10 @@ spec: (availableReplicas > `0`): true - name: validate-all-pods-healthy try: - # Catch Pending / Failed / Unknown phases. The preceding - # workload asserts fail loudly if the workloads are missing, - # so chainsaw `error` here is not asked to detect that case. + # Catch Pending / Failed / Unknown phases. Chainsaw `error` passes + # vacuously when no resource matches, so the preceding workload- + # readiness asserts (StatefulSet / Deployments / NodeSet) are + # load-bearing: they guarantee the pods exist before this step runs. - error: resource: apiVersion: v1 From cee8186b90c61b38c7a4fad8ff2677a20457607c Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 16:07:49 +0200 Subject: [PATCH 05/14] test(slinky-slurm): note release-name fragility in health check Mark called out that hardcoding `slinky-slurm-*` in the asserts silently breaks for deployers that override the Helm release name (flux/argocd path-based naming, multi-tenant installs). Add a TODO pointing at the parameterization gap so the next reader doesn't mistake this for a deliberate single-release contract. 
--- recipes/checks/slinky-slurm/health-check.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/recipes/checks/slinky-slurm/health-check.yaml b/recipes/checks/slinky-slurm/health-check.yaml index 1a92c8fff..a05752408 100644 --- a/recipes/checks/slinky-slurm/health-check.yaml +++ b/recipes/checks/slinky-slurm/health-check.yaml @@ -18,6 +18,9 @@ # trailing `-slinky` comes from the chart's default map-key for # loginsets/nodesets. Other deployers using a different release name # need a parameterized variant of this check. +# TODO: parameterize CR/workload names once chainsaw supports name-pattern +# matching (currently hardcoded; flux/argocd deployers will rewrite release +# names and silently break these asserts). apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test metadata: From b839e97f1b655707519a1a1db235e498543a1ad5 Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 16:10:37 +0200 Subject: [PATCH 06/14] feat(slinky-slurm): cap default MaxTime, correct slurmd conf note MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related cluster-default fixes from review: 1. Single Default=YES partition is a hard slurmctld constraint, not style — call that out so multi-tenant deployers know they must disable this partition before adding their own. 2. MaxTime=UNLIMITED is unsafe as a default; a stuck job pins GPUs indefinitely. Cap at 24h. Leaves with longer-running workloads override per-overlay. 3. The prior "known upstream issue" framing for slurmd auto-registering pod resource limits was wrong. Verified against v1.1 chart sources: --conf only carries Features= + user extraConf, while pod cpu/memory are plumbed as POD_CPUS/POD_MEMORY env vars for the image entrypoint. Reword the note to describe what the chart actually does and why Gres/Features must be set explicitly. 
--- recipes/components/slinky-slurm/values.yaml | 22 ++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/recipes/components/slinky-slurm/values.yaml b/recipes/components/slinky-slurm/values.yaml index fe5c7d341..573b73359 100644 --- a/recipes/components/slinky-slurm/values.yaml +++ b/recipes/components/slinky-slurm/values.yaml @@ -148,12 +148,17 @@ loginsets: PermitEmptyPasswords no ChallengeResponseAuthentication no -# Only one partition can be Default=YES in slurm.conf; an override -# valuesFile adding another Default=YES partition makes slurmctld fail -# to start. Note slurmd does NOT auto-register its pod resource limits -# (known upstream issue); for GPU clusters set explicit Gres/Features -# via `nodesets.slinky.extraConfMap` and the matching nvidia.com/gpu -# limit on `nodesets.slinky.slurmd.resources.limits`. +# Single-tenant policy: one Default=YES partition only — slurmctld +# refuses to start if an override valuesFile adds a second one. +# Multi-tenant deployers should disable this partition and define +# their own (e.g. per-team Default=NO + AllowGroups). +# +# The chart does NOT inject CPUs=/RealMemory=/Gres= into slurmd's +# --conf line from pod resource limits — it only plumbs POD_CPUS / +# POD_MEMORY env vars and lets the image entrypoint act on them. For +# GPU clusters, declare Gres/Features explicitly via +# `nodesets.slinky.extraConfMap` and pair with the matching +# nvidia.com/gpu limit on `nodesets.slinky.slurmd.resources.limits`. nodesets: slinky: enabled: true @@ -169,7 +174,10 @@ nodesets: configMap: State: UP Default: "YES" - MaxTime: UNLIMITED + # Cap default walltime at 24h. UNLIMITED is unsafe as a default: + # a single stuck job can hold GPUs indefinitely with no operator + # recourse short of scancel. Override per-leaf if needed. 
+ MaxTime: "24:00:00" vendor: nvidia: From f66991d68f739255319b99eb794c85ad61433303 Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 16:13:47 +0200 Subject: [PATCH 07/14] ci(kwok): untaint kind control-plane to accept system-tier pods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR-948 CI: every Tier 1/2 lane went red after the cert-manager harness fix. Root cause: the previous commit pinned aicr.nvidia.com/node-type= system exclusively to the real Kind control-plane (KWOK fakes now carry =kwok-system). That correctly routes cert-manager — which tolerates the control-plane taint — but leaves untolerated charts (kai-scheduler, nvsentinel, prometheus-adapter) Pending because the CP still has node-role.kubernetes.io/control-plane:NoSchedule. Remove that taint. Production clusters either run dedicated, untainted system nodes or schedule these charts on workers; the harness should model the former. --- kwok/scripts/run-all-recipes.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh index 9019f7983..08c4cddf2 100755 --- a/kwok/scripts/run-all-recipes.sh +++ b/kwok/scripts/run-all-recipes.sh @@ -112,6 +112,12 @@ ensure_cluster() { cp_node=$(kubectl get nodes -l '!type' -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) if [[ -n "$cp_node" ]]; then kubectl label node "$cp_node" aicr.nvidia.com/node-type=system --overwrite >/dev/null + # Remove control-plane NoSchedule taint. KWOK fakes previously + # absorbed system-tier workloads vacuously; now that the real CP + # is the only system-labeled node, untolerated charts (kai-scheduler, + # nvsentinel, monitoring) would go Pending. Real clusters either + # untaint dedicated system nodes or run those charts on workers. 
+ kubectl taint node "$cp_node" node-role.kubernetes.io/control-plane- 2>/dev/null || true fi # Patch kindnet to exclude KWOK nodes From 38de273b0e0c0c8e7439729e8ad7e2d9ca6f8072 Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 16:23:00 +0200 Subject: [PATCH 08/14] feat(slinky-slurm): drop multifactor priority defaults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark called out that PriorityType=priority/multifactor + the raw weight values + PriorityFlags=NO_NORMAL_ALL bake a specific site-policy into the AICR default. Most upstream Slurm clusters either run normalized (default) or omit PriorityFlags and let admins tune. These weights were inherited from an AWS reference config and have not been validated by any AICR-shipped leaf. Drop them — sites that want multifactor add it in their leaf valuesFile. Keep SelectType=select/cons_tres (required for GPU GRES) and ScronParameters=enable. Drop EnforcePartLimits=no (matches Slurm's own default). --- recipes/components/slinky-slurm/values.yaml | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/recipes/components/slinky-slurm/values.yaml b/recipes/components/slinky-slurm/values.yaml index 573b73359..43ce9896b 100644 --- a/recipes/components/slinky-slurm/values.yaml +++ b/recipes/components/slinky-slurm/values.yaml @@ -77,17 +77,11 @@ controller: # bare YAML 1.1 scalars (e.g. `no`) would coerce to boolean false. # Accounting-only directives belong in the accounting opt-in valuesFile. extraConfMap: - PriorityType: "priority/multifactor" - PriorityWeightAge: "2000" - PriorityWeightFairShare: "10000" - PriorityWeightJobSize: "1000" - PriorityWeightPartition: "1000" - PriorityWeightQOS: "10000" - PriorityMaxAge: "7-0" - PriorityDecayHalfLife: "7-0" - PriorityFlags: "NO_NORMAL_ALL" + # select/cons_tres is required for GPU GRES to allocate per-resource + # rather than whole-node (Slurm's select/linear default). 
All other + # site-policy directives (priority weights, fairshare, QoS) are + # deliberately omitted — sites should add them in a leaf valuesFile. SelectType: "select/cons_tres" - EnforcePartLimits: "no" ScronParameters: "enable" # /metrics endpoint on; ServiceMonitor stays at chart default (off) # so the chart doesn't render a prometheus-operator CR on clusters From c3eed64b39da5d092feaf1ce4b64c20d52acc6cc Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 16:26:42 +0200 Subject: [PATCH 09/14] ci(kwok): disable cert-manager webhook instead of pinning to CP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous two harness fixes (label KWOK fakes kwok-system, pin real CP, untaint CP) cascaded: pinning system-tier workloads to a single real node caused Insufficient CPU pending across most lanes. Use the simpler fix the harness already employs for similar cases — disable the side-effect in the bundle. cert-manager's validating webhook is the only thing slinky-slurm-operator's install touched that needed a reachable endpoint; setting webhook.enabled=false skips admission entirely. KWOK doesn't execute workloads, so we lose nothing for scheduling validation, and every other chart returns to landing on KWOK fakes as before. Revert apply-nodes.sh and run-all-recipes.sh to upstream behavior. 
--- kwok/scripts/apply-nodes.sh | 11 ++--------- kwok/scripts/run-all-recipes.sh | 18 ------------------ kwok/scripts/validate-scheduling.sh | 6 ++++++ 3 files changed, 8 insertions(+), 27 deletions(-) diff --git a/kwok/scripts/apply-nodes.sh b/kwok/scripts/apply-nodes.sh index 0f51b9525..1c3c066d0 100755 --- a/kwok/scripts/apply-nodes.sh +++ b/kwok/scripts/apply-nodes.sh @@ -96,7 +96,7 @@ generate_node() { local max_pods="110" # System nodes get control-plane label for operator controllers - if [[ "$node_type" == "system" || "$node_type" == "kwok-system" ]]; then + if [[ "$node_type" == "system" ]]; then extra_labels=" node-role.kubernetes.io/control-plane: \"\"" fi @@ -179,18 +179,11 @@ create_nodes() { sys_mem=$(yq eval '.spec.resources.memory' "$sys_profile_path") sys_storage=$(yq eval '.spec.resources.storage' "$sys_profile_path") - # KWOK fake "system" nodes get aicr.nvidia.com/node-type=kwok-system - # (not =system) so they do NOT match --system-node-selector - # aicr.nvidia.com/node-type=system. That selector is owned by the real - # Kind control-plane node (labeled by run-all-recipes.sh ensure_cluster). - # Reason: workloads that provide admission webhooks (cert-manager) must - # land on a real node — KWOK fakes report pods Ready without running a - # container, leaving the webhook unreachable. 
log_info "Creating $SYSTEM_NODE_COUNT system nodes ($sys_instance)" for ((i = 0; i < SYSTEM_NODE_COUNT; i++)); do local zone node_name="system-${i}" zone="${DEFAULT_ZONES[$((i % ${#DEFAULT_ZONES[@]}))]}" - generate_node "$node_name" "kwok-system" "$sys_instance" "$DEFAULT_REGION" "$zone" \ + generate_node "$node_name" "system" "$sys_instance" "$DEFAULT_REGION" "$zone" \ "$DEFAULT_K8S_VERSION" "$sys_cpu" "$sys_mem" "$sys_storage" "$sys_arch" "$sys_os" \ > "${temp_dir}/${node_name}.yaml" log_info " $node_name ($zone)" diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh index 08c4cddf2..b2c43fd31 100755 --- a/kwok/scripts/run-all-recipes.sh +++ b/kwok/scripts/run-all-recipes.sh @@ -102,24 +102,6 @@ ensure_cluster() { helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system fi - # Pin --system-node-selector aicr.nvidia.com/node-type=system to the real - # Kind control-plane node. KWOK fake "system" nodes carry =kwok-system - # (set by apply-nodes.sh) so they do NOT match. Without this, charts with - # admission webhooks (cert-manager) land on a fake — pods report Ready - # without a running container, webhook is unreachable, downstream installs - # that submit cert-manager.io/Certificate (slinky-slurm-operator) fail. - local cp_node - cp_node=$(kubectl get nodes -l '!type' -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) - if [[ -n "$cp_node" ]]; then - kubectl label node "$cp_node" aicr.nvidia.com/node-type=system --overwrite >/dev/null - # Remove control-plane NoSchedule taint. KWOK fakes previously - # absorbed system-tier workloads vacuously; now that the real CP - # is the only system-labeled node, untolerated charts (kai-scheduler, - # nvsentinel, monitoring) would go Pending. Real clusters either - # untaint dedicated system nodes or run those charts on workers. 
- kubectl taint node "$cp_node" node-role.kubernetes.io/control-plane- 2>/dev/null || true - fi - # Patch kindnet to exclude KWOK nodes if kubectl get daemonset -n kube-system kindnet &>/dev/null; then kubectl patch daemonset -n kube-system kindnet --type=json -p='[ diff --git a/kwok/scripts/validate-scheduling.sh b/kwok/scripts/validate-scheduling.sh index a248a1fa2..688e68b9e 100755 --- a/kwok/scripts/validate-scheduling.sh +++ b/kwok/scripts/validate-scheduling.sh @@ -359,6 +359,11 @@ generate_bundle() { # Disable features not needed for scheduling validation: # - PrometheusRules and AlertManager (slow to create) # - Nodewright customization (creates CRs that depend on operator CRDs) + # - cert-manager webhook: validation needs a reachable webhook service, + # but cert-manager pods land on KWOK fakes that report Ready without + # a running container. Affects charts that submit cert-manager CRs + # at install time (slinky-slurm-operator); harmless for scheduling + # validation since KWOK doesn't execute workloads anyway. log_info "Generating bundle..." local bundle_output @@ -371,6 +376,7 @@ generate_bundle() { --accelerated-node-toleration "nvidia.com/gpu=present:NoSchedule" \ --accelerated-node-toleration "kwok.x-k8s.io/node=fake:NoSchedule" \ --set "certmanager:startupapicheck.enabled=false" \ + --set "certmanager:webhook.enabled=false" \ --set "kubeprometheusstack:defaultRules.create=false" \ --set "kubeprometheusstack:alertmanager.enabled=false" \ --set "nodewright-customizations:enabled=false" \ From ac94a23d754a1dfc83ad735d67d65c50a0378312 Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 16:37:53 +0200 Subject: [PATCH 10/14] Revert "ci(kwok): disable cert-manager webhook instead of pinning to CP" This reverts commit c3eed64b39da5d092feaf1ce4b64c20d52acc6cc. 
--- kwok/scripts/apply-nodes.sh | 11 +++++++++-- kwok/scripts/run-all-recipes.sh | 18 ++++++++++++++++++ kwok/scripts/validate-scheduling.sh | 6 ------ 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/kwok/scripts/apply-nodes.sh b/kwok/scripts/apply-nodes.sh index 1c3c066d0..0f51b9525 100755 --- a/kwok/scripts/apply-nodes.sh +++ b/kwok/scripts/apply-nodes.sh @@ -96,7 +96,7 @@ generate_node() { local max_pods="110" # System nodes get control-plane label for operator controllers - if [[ "$node_type" == "system" ]]; then + if [[ "$node_type" == "system" || "$node_type" == "kwok-system" ]]; then extra_labels=" node-role.kubernetes.io/control-plane: \"\"" fi @@ -179,11 +179,18 @@ create_nodes() { sys_mem=$(yq eval '.spec.resources.memory' "$sys_profile_path") sys_storage=$(yq eval '.spec.resources.storage' "$sys_profile_path") + # KWOK fake "system" nodes get aicr.nvidia.com/node-type=kwok-system + # (not =system) so they do NOT match --system-node-selector + # aicr.nvidia.com/node-type=system. That selector is owned by the real + # Kind control-plane node (labeled by run-all-recipes.sh ensure_cluster). + # Reason: workloads that provide admission webhooks (cert-manager) must + # land on a real node — KWOK fakes report pods Ready without running a + # container, leaving the webhook unreachable. 
log_info "Creating $SYSTEM_NODE_COUNT system nodes ($sys_instance)" for ((i = 0; i < SYSTEM_NODE_COUNT; i++)); do local zone node_name="system-${i}" zone="${DEFAULT_ZONES[$((i % ${#DEFAULT_ZONES[@]}))]}" - generate_node "$node_name" "system" "$sys_instance" "$DEFAULT_REGION" "$zone" \ + generate_node "$node_name" "kwok-system" "$sys_instance" "$DEFAULT_REGION" "$zone" \ "$DEFAULT_K8S_VERSION" "$sys_cpu" "$sys_mem" "$sys_storage" "$sys_arch" "$sys_os" \ > "${temp_dir}/${node_name}.yaml" log_info " $node_name ($zone)" diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh index b2c43fd31..08c4cddf2 100755 --- a/kwok/scripts/run-all-recipes.sh +++ b/kwok/scripts/run-all-recipes.sh @@ -102,6 +102,24 @@ ensure_cluster() { helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system fi + # Pin --system-node-selector aicr.nvidia.com/node-type=system to the real + # Kind control-plane node. KWOK fake "system" nodes carry =kwok-system + # (set by apply-nodes.sh) so they do NOT match. Without this, charts with + # admission webhooks (cert-manager) land on a fake — pods report Ready + # without a running container, webhook is unreachable, downstream installs + # that submit cert-manager.io/Certificate (slinky-slurm-operator) fail. + local cp_node + cp_node=$(kubectl get nodes -l '!type' -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) + if [[ -n "$cp_node" ]]; then + kubectl label node "$cp_node" aicr.nvidia.com/node-type=system --overwrite >/dev/null + # Remove control-plane NoSchedule taint. KWOK fakes previously + # absorbed system-tier workloads vacuously; now that the real CP + # is the only system-labeled node, untolerated charts (kai-scheduler, + # nvsentinel, monitoring) would go Pending. Real clusters either + # untaint dedicated system nodes or run those charts on workers. 
+ kubectl taint node "$cp_node" node-role.kubernetes.io/control-plane- 2>/dev/null || true + fi + # Patch kindnet to exclude KWOK nodes if kubectl get daemonset -n kube-system kindnet &>/dev/null; then kubectl patch daemonset -n kube-system kindnet --type=json -p='[ diff --git a/kwok/scripts/validate-scheduling.sh b/kwok/scripts/validate-scheduling.sh index 688e68b9e..a248a1fa2 100755 --- a/kwok/scripts/validate-scheduling.sh +++ b/kwok/scripts/validate-scheduling.sh @@ -359,11 +359,6 @@ generate_bundle() { # Disable features not needed for scheduling validation: # - PrometheusRules and AlertManager (slow to create) # - Nodewright customization (creates CRs that depend on operator CRDs) - # - cert-manager webhook: validation needs a reachable webhook service, - # but cert-manager pods land on KWOK fakes that report Ready without - # a running container. Affects charts that submit cert-manager CRs - # at install time (slinky-slurm-operator); harmless for scheduling - # validation since KWOK doesn't execute workloads anyway. log_info "Generating bundle..." local bundle_output @@ -376,7 +371,6 @@ generate_bundle() { --accelerated-node-toleration "nvidia.com/gpu=present:NoSchedule" \ --accelerated-node-toleration "kwok.x-k8s.io/node=fake:NoSchedule" \ --set "certmanager:startupapicheck.enabled=false" \ - --set "certmanager:webhook.enabled=false" \ --set "kubeprometheusstack:defaultRules.create=false" \ --set "kubeprometheusstack:alertmanager.enabled=false" \ --set "nodewright-customizations:enabled=false" \ From eb78d96d3b10e4b7fea1518c33a812e80a73cbaa Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 16:38:17 +0200 Subject: [PATCH 11/14] Revert "ci(kwok): untaint kind control-plane to accept system-tier pods" This reverts commit f66991d68f739255319b99eb794c85ad61433303. 
--- kwok/scripts/run-all-recipes.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh index 08c4cddf2..9019f7983 100755 --- a/kwok/scripts/run-all-recipes.sh +++ b/kwok/scripts/run-all-recipes.sh @@ -112,12 +112,6 @@ ensure_cluster() { cp_node=$(kubectl get nodes -l '!type' -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) if [[ -n "$cp_node" ]]; then kubectl label node "$cp_node" aicr.nvidia.com/node-type=system --overwrite >/dev/null - # Remove control-plane NoSchedule taint. KWOK fakes previously - # absorbed system-tier workloads vacuously; now that the real CP - # is the only system-labeled node, untolerated charts (kai-scheduler, - # nvsentinel, monitoring) would go Pending. Real clusters either - # untaint dedicated system nodes or run those charts on workers. - kubectl taint node "$cp_node" node-role.kubernetes.io/control-plane- 2>/dev/null || true fi # Patch kindnet to exclude KWOK nodes From 54260ca2f99ef5e5ba4022d32afe9506062df359 Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 17:30:05 +0200 Subject: [PATCH 12/14] ci(kwok): disable slurm-operator webhook + cert-manager for KWOK slinky-slurm-operator's chart gates both the cert-manager.io/Certificate submission and the ValidatingWebhookConfiguration on its own webhook.enabled / certManager.enabled toggles. Disable both for KWOK so admission isn't routed to unreachable fake-node pods. Harmless for scheduling validation; production recipes are unaffected. Verified locally: 43/43 pods scheduled. 
--- kwok/scripts/apply-nodes.sh | 11 ++--------- kwok/scripts/run-all-recipes.sh | 12 ------------ kwok/scripts/validate-scheduling.sh | 9 +++++++++ 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/kwok/scripts/apply-nodes.sh b/kwok/scripts/apply-nodes.sh index 0f51b9525..1c3c066d0 100755 --- a/kwok/scripts/apply-nodes.sh +++ b/kwok/scripts/apply-nodes.sh @@ -96,7 +96,7 @@ generate_node() { local max_pods="110" # System nodes get control-plane label for operator controllers - if [[ "$node_type" == "system" || "$node_type" == "kwok-system" ]]; then + if [[ "$node_type" == "system" ]]; then extra_labels=" node-role.kubernetes.io/control-plane: \"\"" fi @@ -179,18 +179,11 @@ create_nodes() { sys_mem=$(yq eval '.spec.resources.memory' "$sys_profile_path") sys_storage=$(yq eval '.spec.resources.storage' "$sys_profile_path") - # KWOK fake "system" nodes get aicr.nvidia.com/node-type=kwok-system - # (not =system) so they do NOT match --system-node-selector - # aicr.nvidia.com/node-type=system. That selector is owned by the real - # Kind control-plane node (labeled by run-all-recipes.sh ensure_cluster). - # Reason: workloads that provide admission webhooks (cert-manager) must - # land on a real node — KWOK fakes report pods Ready without running a - # container, leaving the webhook unreachable. 
log_info "Creating $SYSTEM_NODE_COUNT system nodes ($sys_instance)" for ((i = 0; i < SYSTEM_NODE_COUNT; i++)); do local zone node_name="system-${i}" zone="${DEFAULT_ZONES[$((i % ${#DEFAULT_ZONES[@]}))]}" - generate_node "$node_name" "kwok-system" "$sys_instance" "$DEFAULT_REGION" "$zone" \ + generate_node "$node_name" "system" "$sys_instance" "$DEFAULT_REGION" "$zone" \ "$DEFAULT_K8S_VERSION" "$sys_cpu" "$sys_mem" "$sys_storage" "$sys_arch" "$sys_os" \ > "${temp_dir}/${node_name}.yaml" log_info " $node_name ($zone)" diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh index 9019f7983..b2c43fd31 100755 --- a/kwok/scripts/run-all-recipes.sh +++ b/kwok/scripts/run-all-recipes.sh @@ -102,18 +102,6 @@ ensure_cluster() { helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system fi - # Pin --system-node-selector aicr.nvidia.com/node-type=system to the real - # Kind control-plane node. KWOK fake "system" nodes carry =kwok-system - # (set by apply-nodes.sh) so they do NOT match. Without this, charts with - # admission webhooks (cert-manager) land on a fake — pods report Ready - # without a running container, webhook is unreachable, downstream installs - # that submit cert-manager.io/Certificate (slinky-slurm-operator) fail. 
- local cp_node - cp_node=$(kubectl get nodes -l '!type' -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) - if [[ -n "$cp_node" ]]; then - kubectl label node "$cp_node" aicr.nvidia.com/node-type=system --overwrite >/dev/null - fi - # Patch kindnet to exclude KWOK nodes if kubectl get daemonset -n kube-system kindnet &>/dev/null; then kubectl patch daemonset -n kube-system kindnet --type=json -p='[ diff --git a/kwok/scripts/validate-scheduling.sh b/kwok/scripts/validate-scheduling.sh index a248a1fa2..1844164f2 100755 --- a/kwok/scripts/validate-scheduling.sh +++ b/kwok/scripts/validate-scheduling.sh @@ -359,6 +359,13 @@ generate_bundle() { # Disable features not needed for scheduling validation: # - PrometheusRules and AlertManager (slow to create) # - Nodewright customization (creates CRs that depend on operator CRDs) + # - slinky-slurm-operator webhook + cert-manager wiring: the operator's + # webhook validates Slurm CRs through a Service whose pod runs on a + # KWOK fake (Ready without container). Both certManager.enabled and + # webhook.enabled gate the cert-manager.io/Certificate submission + # plus the ValidatingWebhookConfiguration. Disabling them skips + # admission entirely; harmless under KWOK since no real Slurm CRs + # are reconciled. log_info "Generating bundle..." 
local bundle_output @@ -371,6 +378,8 @@ generate_bundle() { --accelerated-node-toleration "nvidia.com/gpu=present:NoSchedule" \ --accelerated-node-toleration "kwok.x-k8s.io/node=fake:NoSchedule" \ --set "certmanager:startupapicheck.enabled=false" \ + --set "slinkyslurmoperator:webhook.enabled=false" \ + --set "slinkyslurmoperator:certManager.enabled=false" \ --set "kubeprometheusstack:defaultRules.create=false" \ --set "kubeprometheusstack:alertmanager.enabled=false" \ --set "nodewright-customizations:enabled=false" \ From 0651bfdebe23b451851fa6996652fd143465f8ed Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 18:09:46 +0200 Subject: [PATCH 13/14] ci(kwok): wait for slurm controller pod before verify_pods EKS slurm lane snapshot caught slinky-slurm-controller-0 mid-bind (NOMINATED system-1, spec.nodeName empty) and counted it as unscheduled. slurm-operator reconciles the Controller CR into a StatefulSet AFTER Helm install completes, so the controller pod appears later than the script's existing 5s post-deploy sleep. Poll up to 60s for the controller pod's spec.nodeName, gated on the Controllers CRD existing so non-slurm lanes are unaffected. --- kwok/scripts/validate-scheduling.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kwok/scripts/validate-scheduling.sh b/kwok/scripts/validate-scheduling.sh index 1844164f2..4c75db0ca 100755 --- a/kwok/scripts/validate-scheduling.sh +++ b/kwok/scripts/validate-scheduling.sh @@ -449,6 +449,26 @@ deploy_bundle() { log_info "Waiting for pods to be scheduled..." sleep 5 + # slurm-operator reconciles the Controller CR into a StatefulSet AFTER + # Helm install completes, so the controller pod appears later than the + # 5s window above. Poll up to 60s for spec.nodeName on the controller + # pod (presence implies the operator reconciled and the scheduler ran). 
+ if kubectl get crd controllers.slinky.slurm.net &>/dev/null; then + local waited=0 + while ((waited < 60)); do + local scheduled + scheduled=$(kubectl get pods --all-namespaces \ + -l app.kubernetes.io/name=slurmctld \ + -o jsonpath='{.items[*].spec.nodeName}' 2>/dev/null || true) + if [[ -n "$scheduled" ]]; then + log_info "slurm controller scheduled after ${waited}s" + break + fi + sleep 5 + waited=$((waited + 5)) + done + fi + log_info "Bundle deployed successfully" } From 6fe4ad0c4da37d9a9bbfe5baf7c840166fe62089 Mon Sep 17 00:00:00 2001 From: Fagani Hajizada Date: Mon, 18 May 2026 18:34:41 +0200 Subject: [PATCH 14/14] ci(kwok): disable slurm controller persistence for KWOK The previous controller-pod poll fix was treating the symptom: the real failure is that slinky-slurm-controller is a StatefulSet with persistence.enabled=true by default, and Kind's local-path provisioner uses WaitForFirstConsumer binding. The pod gets NominatedNodeName=system-1 (a KWOK fake), the PVC tries to provision local-path on that fake, KWOK can't back local storage, and the pod sits Pending forever with no FailedScheduling event. Revert the controller-pod poll and disable controller persistence in the bundle via --set slurmcluster:controller.persistence.enabled=false. Verified locally: 43/43 pods scheduled. --- kwok/scripts/validate-scheduling.sh | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/kwok/scripts/validate-scheduling.sh b/kwok/scripts/validate-scheduling.sh index 4c75db0ca..8c54adb31 100755 --- a/kwok/scripts/validate-scheduling.sh +++ b/kwok/scripts/validate-scheduling.sh @@ -366,6 +366,12 @@ generate_bundle() { # plus the ValidatingWebhookConfiguration. Disabling them skips # admission entirely; harmless under KWOK since no real Slurm CRs # are reconciled. + # - slinky-slurm controller persistence: the chart provisions a PVC + # via the cluster's default StorageClass. 
Kind's local-path provisioner + # binds with WaitForFirstConsumer, so the PVC is pinned to whichever + # node the pod schedules on — and KWOK fakes can't actually back a + # local-path volume, leaving the pod stuck Pending with NominatedNodeName + # set. Disabling persistence lets the controller pod bind. log_info "Generating bundle..." local bundle_output @@ -380,6 +386,7 @@ generate_bundle() { --set "certmanager:startupapicheck.enabled=false" \ --set "slinkyslurmoperator:webhook.enabled=false" \ --set "slinkyslurmoperator:certManager.enabled=false" \ + --set "slurmcluster:controller.persistence.enabled=false" \ --set "kubeprometheusstack:defaultRules.create=false" \ --set "kubeprometheusstack:alertmanager.enabled=false" \ --set "nodewright-customizations:enabled=false" \ @@ -449,26 +456,6 @@ deploy_bundle() { log_info "Waiting for pods to be scheduled..." sleep 5 - # slurm-operator reconciles the Controller CR into a StatefulSet AFTER - # Helm install completes, so the controller pod appears later than the - # 5s window above. Poll up to 60s for spec.nodeName on the controller - # pod (presence implies the operator reconciled and the scheduler ran). - if kubectl get crd controllers.slinky.slurm.net &>/dev/null; then - local waited=0 - while ((waited < 60)); do - local scheduled - scheduled=$(kubectl get pods --all-namespaces \ - -l app.kubernetes.io/name=slurmctld \ - -o jsonpath='{.items[*].spec.nodeName}' 2>/dev/null || true) - if [[ -n "$scheduled" ]]; then - log_info "slurm controller scheduled after ${waited}s" - break - fi - sleep 5 - waited=$((waited + 5)) - done - fi - log_info "Bundle deployed successfully" }