diff --git a/.github/actions/README.md b/.github/actions/README.md
index cef2fd6ca..15710df7d 100644
--- a/.github/actions/README.md
+++ b/.github/actions/README.md
@@ -4,6 +4,13 @@ This directory contains a modular, reusable GitHub Actions architecture optimize
 
 ## Composite Actions
 
+### Script Conventions
+
+Composite action helper scripts in this directory are intentionally portable
+across checkout modes: keep them mode `0644` and invoke them as
+`bash path/to/script.sh` from workflows or `action.yml` files. Do not rely on
+executable bits or `./script.sh` invocation.
+
 ### Core CI/CD Actions
 
 #### `security-scan/`
@@ -50,7 +57,8 @@ This action runs `tools/setup-tools --skip-go --skip-docker` in auto mode, which
 **When to use**: When you need version values in workflow steps
 **Outputs**:
 - `go`, `goreleaser`, `ko`, `crane`, `golangci_lint`, `yamllint`, `addlicense`
-- `grype`, `kubectl`, `kind`, `ctlptl`, `tilt`, `helm`
+- `grype`, `kubectl`, `kind`, `nvkind`, `ctlptl`, `tilt`, `helm`
+- `kind_node_image`, `h100_kind_node_image`
 
 **Example**:
 ```yaml
diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml
index 7a973ae21..6bbd6a0ab 100644
--- a/.github/actions/aicr-build/action.yml
+++ b/.github/actions/aicr-build/action.yml
@@ -13,9 +13,17 @@
 # limitations under the License.
 
 name: 'AICR Build'
-description: 'Builds the aicr validator image (via Dockerfile) and CLI binary, and loads the image into kind.'
+description: 'Builds the aicr CLI and optional snapshot/validator images, and loads requested images into kind.'
 
 inputs:
+  build_cli:
+    description: 'Build and stage the standalone aicr CLI binary at the repository root'
+    required: false
+    default: 'true'
+  build_snapshot_agent:
+    description: 'Build the CUDA-based snapshot agent image and load it into kind'
+    required: false
+    default: 'true'
   build_validators:
     description: 'Deprecated: use validator_phases instead. Ignored when validator_phases is set.'
     required: false
@@ -28,86 +36,34 @@ inputs:
 runs:
   using: 'composite'
   steps:
-
-    - name: Install ko
+    - name: Build standalone aicr CLI binary
+      if: inputs.build_cli == 'true'
       shell: bash
-      run: |
-        KO_VERSION=$(yq eval '.build_tools.ko' .settings.yaml)
-        GOFLAGS= go install "github.com/google/ko@${KO_VERSION}"
+      env:
+        GOFLAGS: -mod=vendor
+      run: bash "${{ github.action_path }}/build-cli.sh"
 
-    - name: Build snapshot agent image and load into kind
+    - name: Build snapshot agent CLI binary
+      if: inputs.build_cli != 'true' && inputs.build_snapshot_agent == 'true'
       shell: bash
       env:
         GOFLAGS: -mod=vendor
-      run: |
-        # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection).
-        # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed.
-        # GPU test workflows use --image=ko.local:smoke-test for aicr snapshot.
-        CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
-        docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE'
-        FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04
-        COPY dist/aicr /usr/local/bin/aicr
-        ENTRYPOINT ["/usr/local/bin/aicr"]
-        DOCKERFILE
+      run: bash "${{ github.action_path }}/build-cli.sh"
 
-        # Load onto all nodes. The snapshot agent requests nvidia.com/gpu but
-        # does not set a node selector, so it can land on any GPU-capable node
-        # including the control-plane (e.g., T4 smoke test).
-        #
-        # Timeout is intentionally generous (900s per attempt). H100 self-hosted
-        # runners transfer images over a shared Docker-in-Docker bridge; large
-        # CUDA base images (~250MB compressed) combined with I/O contention from
-        # parallel GPU operator pods regularly exceed the previous 600s limit.
-        timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
-          echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..."
-          timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
-        }
+    - name: Build snapshot agent image and load into kind
+      if: inputs.build_snapshot_agent == 'true'
+      shell: bash
+      run: bash "${{ github.action_path }}/build-snapshot-agent.sh"
 
     - name: Build validator images and load into kind
       if: "!(inputs.validator_phases == 'none' || (inputs.validator_phases == '' && inputs.build_validators == 'false'))"
       shell: bash
       env:
         GOFLAGS: -mod=vendor
-      run: |
-        # Determine which validator phases to build.
-        # validator_phases takes precedence; build_validators is a deprecated fallback.
-        if [[ -n "${{ inputs.validator_phases }}" ]]; then
-          if [[ "${{ inputs.validator_phases }}" == "none" ]]; then
-            echo "Skipping validator builds (validator_phases=none)"
-            exit 0
-          fi
-          PHASES="${{ inputs.validator_phases }}"
-        else
-          # Default: build all phases (backwards compatible)
-          PHASES="deployment,performance,conformance"
-        fi
-
-        # Compile only the requested validator binaries.
-        mkdir -p dist/validator
-        for phase in ${PHASES//,/ }; do
-          echo "Building validator binary: ${phase}"
-          CGO_ENABLED=0 go build -trimpath -o "dist/validator/${phase}" "./validators/${phase}"
-        done
-
-        for phase in ${PHASES//,/ }; do
-          mkdir -p "validators/${phase}/testdata"
-          docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . <<DOCKERFILE
-        FROM gcr.io/distroless/static-debian12:nonroot
-        COPY dist/validator/${phase} /${phase}
-        COPY validators/${phase}/testdata /app/testdata
-        WORKDIR /app
-        USER nonroot
-        ENTRYPOINT ["/${phase}"]
-        DOCKERFILE
-          # Validator images are small (~30MB distroless), but share the same
-          # Docker-in-Docker bridge as the smoke-test load above. 600s per
-          # attempt accommodates I/O queuing behind concurrent image pulls.
-          timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || {
-            echo "::warning::kind load attempt 1 failed for ko.local/aicr-validators/${phase}:latest, retrying..."
-            timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}"
-          }
-        done
+        VALIDATOR_PHASES: ${{ inputs.validator_phases }}
+      run: bash "${{ github.action_path }}/build-validator-images.sh"
 
     - name: Stage aicr binary at repo root
+      if: inputs.build_cli == 'true'
       shell: bash
-      run: cp dist/aicr ./aicr
+      run: bash "${{ github.action_path }}/stage-cli.sh"
diff --git a/.github/actions/aicr-build/build-cli.sh b/.github/actions/aicr-build/build-cli.sh
new file mode 100644
index 000000000..c87428241
--- /dev/null
+++ b/.github/actions/aicr-build/build-cli.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Build the aicr CLI with cgo disabled into dist/aicr; paths are relative to the working directory.
+mkdir -p dist
+CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
diff --git a/.github/actions/aicr-build/build-snapshot-agent.sh b/.github/actions/aicr-build/build-snapshot-agent.sh
new file mode 100644
index 000000000..a48d99615
--- /dev/null
+++ b/.github/actions/aicr-build/build-snapshot-agent.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+if ! command -v yq >/dev/null 2>&1; then
+  echo "::error::yq is required to read testing.snapshot_agent_cuda_image from .settings.yaml"
+  exit 1
+fi
+
+SNAPSHOT_AGENT_CUDA_IMAGE="$(yq eval '.testing.snapshot_agent_cuda_image // ""' .settings.yaml)"
+if [[ -z "${SNAPSHOT_AGENT_CUDA_IMAGE}" || "${SNAPSHOT_AGENT_CUDA_IMAGE}" == "null" ]]; then
+  echo "::error::testing.snapshot_agent_cuda_image must be set in .settings.yaml"
+  exit 1
+fi
+
+if [[ ! -f dist/aicr ]]; then
+  echo "::error::dist/aicr not found; build the AICR CLI before building the snapshot agent image"
+  exit 1
+fi
+: "${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"  # reject unset/empty before the slow build, not after it
+
+# Build the agent image on cuda:base (~250MB, vs ~1.8GB for cuda:runtime); only nvidia-smi is needed, for GPU detection.
+timeout 900s docker build \
+  --build-arg SNAPSHOT_AGENT_CUDA_IMAGE="${SNAPSHOT_AGENT_CUDA_IMAGE}" \
+  -t ko.local:smoke-test -f - . <<'DOCKERFILE'
+ARG SNAPSHOT_AGENT_CUDA_IMAGE
+FROM ${SNAPSHOT_AGENT_CUDA_IMAGE}
+COPY dist/aicr /usr/local/bin/aicr
+ENTRYPOINT ["/usr/local/bin/aicr"]
+DOCKERFILE
+
+# Load onto all nodes. The snapshot agent requests nvidia.com/gpu but does not
+# set a node selector, so it can land on any GPU-capable node including the
+# control-plane in the L40G smoke test.
+timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
+  echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..."
+  timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
+}
diff --git a/.github/actions/aicr-build/build-validator-images.sh b/.github/actions/aicr-build/build-validator-images.sh
new file mode 100644
index 000000000..e308fba4e
--- /dev/null
+++ b/.github/actions/aicr-build/build-validator-images.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+VALIDATOR_PHASES="${VALIDATOR_PHASES:-}"
+if [[ -n "${VALIDATOR_PHASES}" ]]; then
+  if [[ "${VALIDATOR_PHASES}" == "none" ]]; then
+    echo "Skipping validator builds (validator_phases=none)"
+    exit 0
+  fi
+  PHASES="${VALIDATOR_PHASES}"
+else
+  # Default: build all phases (backwards compatible).
+  PHASES="deployment,performance,conformance"
+fi
+
+: "${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+
+mkdir -p dist/validator
+for phase in ${PHASES//,/ }; do
+  if ! [[ "${phase}" =~ ^[a-z][a-z0-9_-]*$ ]]; then
+    echo "::error::invalid validator phase '${phase}'; expected ^[a-z][a-z0-9_-]*$"
+    exit 1
+  fi
+  echo "Building validator binary: ${phase}"
+  CGO_ENABLED=0 go build -trimpath -o "dist/validator/${phase}" "./validators/${phase}"
+done
+
+for phase in ${PHASES//,/ }; do
+  if [[ ! -d "validators/${phase}/testdata" ]]; then
+    echo "::error::validators/${phase}/testdata is missing"
+    exit 1
+  fi
+  docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . <<DOCKERFILE
+FROM gcr.io/distroless/static-debian12:nonroot
+COPY dist/validator/${phase} /${phase}
+COPY validators/${phase}/testdata /app/testdata
+WORKDIR /app
+USER nonroot
+ENTRYPOINT ["/${phase}"]
+DOCKERFILE
+  timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || {
+    echo "::warning::kind load attempt 1 failed for ko.local/aicr-validators/${phase}:latest, retrying..."
+    timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}"
+  }
+done
diff --git a/.github/actions/aicr-build/stage-cli.sh b/.github/actions/aicr-build/stage-cli.sh
new file mode 100644
index 000000000..c5b737a4d
--- /dev/null
+++ b/.github/actions/aicr-build/stage-cli.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Copy the built binary from dist/ to ./aicr in the current working directory.
+cp dist/aicr ./aicr
diff --git a/.github/actions/check-control-plane-health/action.yml b/.github/actions/check-control-plane-health/action.yml
new file mode 100644
index 000000000..c9002f217
--- /dev/null
+++ b/.github/actions/check-control-plane-health/action.yml
@@ -0,0 +1,80 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'Check Control Plane Health'
+description: 'Fails if Kind control-plane static pods are missing, unready, or unstable.'
+
+inputs:
+  cluster_name:
+    description: 'Kind cluster name'
+    required: true
+  namespace:
+    description: 'Namespace that contains the control-plane pods'
+    required: false
+    default: 'kube-system'
+  components:
+    description: 'Space-separated component label values to check'
+    required: false
+    default: 'kube-apiserver kube-controller-manager kube-scheduler etcd'
+  wait_timeout:
+    description: 'Timeout for each component readiness wait'
+    required: false
+    default: '60s'
+  max_restarts:
+    description: 'Compatibility input; with stability_window=0s, fail if historical restartCount exceeds this ceiling'
+    required: false
+    default: '1'
+  stability_window:
+    description: 'Optional duration to watch for new control-plane restarts after pods are Ready'
+    required: false
+    default: '0s'
+  stability_probe_interval:
+    description: 'Interval for active API server probes during the stability window'
+    required: false
+    default: '10s'
+  stability_probe_failure_threshold:
+    description: 'Consecutive active stability probe failures allowed before failing'
+    required: false
+    default: '2'
+  lease_components:
+    description: 'Space-separated leader election lease names to check for freshness'
+    required: false
+    default: 'kube-controller-manager kube-scheduler'
+  lease_stale_timeout:
+    description: 'Maximum allowed leader election lease age at the end of a stability window'
+    required: false
+    default: '120s'
+  runtime_diagnostics:
+    description: 'Collect expensive kind node runtime diagnostics such as docker stats, crictl, and journalctl on failure'
+    required: false
+    default: 'false'
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Check control-plane pods
+      shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+        NAMESPACE: ${{ inputs.namespace }}
+        COMPONENTS: ${{ inputs.components }}
+        WAIT_TIMEOUT: ${{ inputs.wait_timeout }}
+        MAX_RESTARTS: ${{ inputs.max_restarts }}
+        STABILITY_WINDOW: ${{ inputs.stability_window }}
+        STABILITY_PROBE_INTERVAL: ${{ inputs.stability_probe_interval }}
+        STABILITY_PROBE_FAILURE_THRESHOLD: ${{ inputs.stability_probe_failure_threshold }}
+        LEASE_COMPONENTS: ${{ inputs.lease_components }}
+        LEASE_STALE_TIMEOUT: ${{ inputs.lease_stale_timeout }}
+        RUNTIME_DIAGNOSTICS: ${{ inputs.runtime_diagnostics }}
+      run: bash "${{ github.action_path }}/check-control-plane-health.sh"
diff --git a/.github/actions/check-control-plane-health/check-control-plane-health.sh b/.github/actions/check-control-plane-health/check-control-plane-health.sh
new file mode 100644
index 000000000..c6214083f
--- /dev/null
+++ b/.github/actions/check-control-plane-health/check-control-plane-health.sh
@@ -0,0 +1,526 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+validate_duration_input() {
+  # Exit the script unless $2 is <digits> followed by s, m, or h; $1 names the input in the error.
+  local name="$1" value="$2"
+  if [[ "${value}" =~ ^[0-9]+[smh]$ ]]; then
+    return 0
+  fi
+  echo "::error::${name} must be a duration like 60s, 2m, or 1h; got '${value}'"
+  exit 1
+}
+
+duration_seconds() {
+  # Print the duration in $1 (<digits><s|m|h>) as whole seconds on stdout.
+  local duration="$1" digits="${1%[smh]}" unit="${1: -1}" value factor
+
+  # 10# forces base 10 so zero-padded input such as 08s is not read as octal.
+  value=$((10#${digits}))
+  case "${unit}" in
+    s) factor=1 ;;
+    m) factor=60 ;;
+    h) factor=3600 ;;
+    *)
+      echo "::error::unsupported duration unit in '${duration}'" >&2
+      exit 1
+      ;;
+  esac
+
+  echo $((value * factor))
+}
+
+MAX_RESTARTS="${MAX_RESTARTS:-}"
+MAX_RESTARTS="${MAX_RESTARTS#"${MAX_RESTARTS%%[![:space:]]*}"}"  # strip leading whitespace
+MAX_RESTARTS="${MAX_RESTARTS%"${MAX_RESTARTS##*[![:space:]]}"}"  # strip trailing whitespace
+MAX_RESTARTS_LIMIT=""  # stays empty when max_restarts is unset or blank
+if [[ -n "${MAX_RESTARTS}" ]]; then
+  if ! [[ "${MAX_RESTARTS}" =~ ^[0-9]+$ ]]; then
+    echo "::error::max_restarts must be a non-negative integer, got '${MAX_RESTARTS}'"
+    exit 1
+  fi
+  MAX_RESTARTS_LIMIT="$((10#${MAX_RESTARTS}))"  # 10# forces base 10 so 08 is not treated as octal
+fi
+
+WAIT_TIMEOUT="${WAIT_TIMEOUT#"${WAIT_TIMEOUT%%[![:space:]]*}"}"  # strip leading whitespace
+WAIT_TIMEOUT="${WAIT_TIMEOUT%"${WAIT_TIMEOUT##*[![:space:]]}"}"  # strip trailing whitespace
+validate_duration_input wait_timeout "${WAIT_TIMEOUT}"
+
+STABILITY_WINDOW="${STABILITY_WINDOW#"${STABILITY_WINDOW%%[![:space:]]*}"}"  # strip leading whitespace
+STABILITY_WINDOW="${STABILITY_WINDOW%"${STABILITY_WINDOW##*[![:space:]]}"}"  # strip trailing whitespace
+if [[ -z "${STABILITY_WINDOW}" ]]; then  # a blank input means no stability window
+  STABILITY_WINDOW="0s"
+fi
+validate_duration_input stability_window "${STABILITY_WINDOW}"
+if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then  # canonicalize any zero duration (0m, 00s, ...) to "0s"
+  STABILITY_WINDOW="0s"
+fi
+STABILITY_WINDOW_SECONDS="$(duration_seconds "${STABILITY_WINDOW}")"
+if [[ -n "${MAX_RESTARTS_LIMIT}" ]] && [[ "${STABILITY_WINDOW}" != "0s" ]] && (( MAX_RESTARTS_LIMIT != 1 )); then  # only when max_restarts differs from 1
+  echo "::warning::max_restarts is diagnostic context when stability_window is non-zero; new restarts during the stability window remain the hard failure gate"
+fi
+
+STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL:-10s}"
+STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL#"${STABILITY_PROBE_INTERVAL%%[![:space:]]*}"}"  # strip leading whitespace
+STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL%"${STABILITY_PROBE_INTERVAL##*[![:space:]]}"}"  # strip trailing whitespace
+validate_duration_input stability_probe_interval "${STABILITY_PROBE_INTERVAL}"
+STABILITY_PROBE_INTERVAL_SECONDS="$(duration_seconds "${STABILITY_PROBE_INTERVAL}")"
+if (( STABILITY_PROBE_INTERVAL_SECONDS <= 0 )); then  # the format check alone still admits 0s
+  echo "::error::stability_probe_interval must be greater than 0, got '${STABILITY_PROBE_INTERVAL}'"
+  exit 1
+fi
+STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD:-2}"
+STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD#"${STABILITY_PROBE_FAILURE_THRESHOLD%%[![:space:]]*}"}"
+STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD%"${STABILITY_PROBE_FAILURE_THRESHOLD##*[![:space:]]}"}"
+if ! [[ "${STABILITY_PROBE_FAILURE_THRESHOLD}" =~ ^[0-9]+$ ]]; then
+  echo "::error::stability_probe_failure_threshold must be a positive integer, got '${STABILITY_PROBE_FAILURE_THRESHOLD}'"
+  exit 1
+fi
+if (( STABILITY_PROBE_FAILURE_THRESHOLD <= 0 )); then
+  echo "::error::stability_probe_failure_threshold must be greater than 0, got '${STABILITY_PROBE_FAILURE_THRESHOLD}'"
+  exit 1
+fi
+
+LEASE_COMPONENTS="${LEASE_COMPONENTS:-kube-controller-manager kube-scheduler}"
+LEASE_COMPONENTS="${LEASE_COMPONENTS#"${LEASE_COMPONENTS%%[![:space:]]*}"}"  # strip leading whitespace
+LEASE_COMPONENTS="${LEASE_COMPONENTS%"${LEASE_COMPONENTS##*[![:space:]]}"}"  # strip trailing whitespace
+
+LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT:-120s}"
+LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT#"${LEASE_STALE_TIMEOUT%%[![:space:]]*}"}"  # strip leading whitespace
+LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT%"${LEASE_STALE_TIMEOUT##*[![:space:]]}"}"  # strip trailing whitespace
+validate_duration_input lease_stale_timeout "${LEASE_STALE_TIMEOUT}"
+LEASE_STALE_TIMEOUT_SECONDS="$(duration_seconds "${LEASE_STALE_TIMEOUT}")"
+if (( LEASE_STALE_TIMEOUT_SECONDS <= 0 )); then  # the format check alone still admits 0s
+  echo "::error::lease_stale_timeout must be greater than 0, got '${LEASE_STALE_TIMEOUT}'"
+  exit 1
+fi
+
+RUNTIME_DIAGNOSTICS="${RUNTIME_DIAGNOSTICS:-false}"
+RUNTIME_DIAGNOSTICS="${RUNTIME_DIAGNOSTICS#"${RUNTIME_DIAGNOSTICS%%[![:space:]]*}"}"  # strip leading whitespace
+RUNTIME_DIAGNOSTICS="${RUNTIME_DIAGNOSTICS%"${RUNTIME_DIAGNOSTICS##*[![:space:]]}"}"  # strip trailing whitespace
+case "${RUNTIME_DIAGNOSTICS}" in
+  true|false) ;;  # only the exact lowercase literals are accepted
+  *)
+    echo "::error::runtime_diagnostics must be true or false, got '${RUNTIME_DIAGNOSTICS}'"
+    exit 1
+    ;;
+esac
+
+kubectl_kind() {  # kubectl against the kind-<cluster> context: 30s overall cap, 10s per API request
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+docker_timeout() {  # run a docker subcommand with a 30s overall cap
+  timeout 30s docker "$@"
+}
+
+RESTART_COUNT_ATTEMPTS=3  # kubectl read attempts per component in restart_total
+RESTART_COUNT_RETRY_SLEEP_SECONDS=2  # pause between those attempts
+declare -A INITIAL_RESTARTS=()  # presumably component -> baseline restart count, filled in later (TODO confirm)
+
+kubectl_kind get --raw='/readyz' || true  # informational probe; a failure here does not abort the script
+
+wait_ready() {
+  local component="$1"
+  local selector="component=${component}"
+
+  if ! timeout "${WAIT_TIMEOUT}" kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \
+    wait --for=condition=Ready pod -l "${selector}" --timeout="${WAIT_TIMEOUT}"; then
+    return 1
+  fi
+}
+
+restart_total() {
+  # Print the summed restartCount of every container in pods labelled component=$1.
+  # The read is retried up to RESTART_COUNT_ATTEMPTS times; if no count is ever
+  # returned, component diagnostics go to stderr and the script exits 1.
+  local component="$1" selector="component=$1"
+  local restart_counts count attempt total=0
+
+  for ((attempt = 1; attempt <= RESTART_COUNT_ATTEMPTS; attempt++)); do
+    if restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \
+      -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then
+      [[ -n "${restart_counts}" ]] && break
+      echo "::warning::no container statuses found for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2
+    else
+      echo "::warning::failed to read restart counts for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2
+    fi
+
+    # Pause between attempts, but not after the final one.
+    if (( attempt < RESTART_COUNT_ATTEMPTS )); then
+      sleep "${RESTART_COUNT_RETRY_SLEEP_SECONDS}"
+    fi
+  done
+
+  # Still nothing after every attempt: report, dump diagnostics, and abort.
+  if [[ -z "${restart_counts}" ]]; then
+    echo "::error::no container statuses found for ${component} pods after ${RESTART_COUNT_ATTEMPTS} attempts" >&2
+    dump_component_diagnostics "${component}" >&2
+    exit 1
+  fi
+
+  # One count per line; blank lines are skipped.
+  while IFS= read -r count; do
+    [[ -z "${count}" ]] && continue
+    total=$((total + count))
+  done <<< "${restart_counts}"
+  echo "${total}"
+}
+
+report_restart_baseline() {
+  # Log the pre-existing restart count ($2) of component $1; a non-zero count is logged as a warning.
+  local name="$1" restarts="$2"
+
+  if (( restarts <= 0 )); then
+    echo "${name} restartCount=${restarts}"
+    return
+  fi
+  if [[ "${STABILITY_WINDOW}" == "0s" && -n "${MAX_RESTARTS_LIMIT}" ]]; then
+    echo "::warning::${name} has historical restartCount=${restarts}; max_restarts=${MAX_RESTARTS_LIMIT} will be enforced because stability_window=0s"
+  else
+    echo "::warning::${name} has historical restartCount=${restarts}; checking current readiness and stability window only"
+  fi
+}
+
+dump_control_plane_summary() {  # best-effort: list tier=control-plane pods and their restart counts; kubectl failures are ignored
+  echo "=== Control-plane pod restart summary ==="
+  kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane -o wide || true
+  kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane \
+    -o jsonpath='{range .items[*]}{.metadata.name}{" restartCount="}{range .status.containerStatuses[*]}{.restartCount}{" "}{end}{"\n"}{end}' || true
+}
+
+require_readyz() {
+  local reason="$1"
+
+  if ! kubectl_kind get --raw='/readyz'; then
+    echo "::error::kube-apiserver /readyz failed ${reason}"
+    dump_all_control_plane_runtime_diagnostics
+    exit 1
+  fi
+}
+
+probe_control_plane_api() {
+  local reason="$1"
+  local component
+  local lease_summary
+
+  if ! kubectl_kind get --raw='/readyz' >/dev/null; then
+    echo "::error::kube-apiserver /readyz probe failed ${reason}"
+    return 1
+  fi
+
+  for component in ${LEASE_COMPONENTS}; do
+    if ! lease_summary=$(kubectl_kind -n "${NAMESPACE}" get lease "${component}" \
+      -o jsonpath='{.metadata.name}{" holder="}{.spec.holderIdentity}{" renewTime="}{.spec.renewTime}{"\n"}' 2>/dev/null); then
+      echo "::error::failed to read leader election lease ${component} ${reason}"
+      return 1
+    fi
+    echo "${lease_summary}"
+  done
+}
+
+lease_renew_epoch() {
+  # Print the timestamp in $1 as Unix epoch seconds (uses date -d, a GNU
+  # extension); parse errors are silenced and surface as a non-zero status.
+  date -u -d "$1" +%s 2>/dev/null
+}
+
+verify_leader_lease_freshness() {
+  local component
+  local now_epoch
+  local renew_time
+  local renew_epoch
+  local lease_age
+
+  [[ -z "${LEASE_COMPONENTS}" ]] && return
+
+  now_epoch="$(date -u +%s)"
+  echo "Checking leader election lease freshness (max age ${LEASE_STALE_TIMEOUT})..."
+  for component in ${LEASE_COMPONENTS}; do
+    if ! renew_time=$(kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o jsonpath='{.spec.renewTime}' 2>/dev/null); then
+      echo "::error::failed to read leader election lease ${component}"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+    if [[ -z "${renew_time}" ]]; then
+      echo "::error::leader election lease ${component} has empty spec.renewTime"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+    if ! renew_epoch="$(lease_renew_epoch "${renew_time}")"; then
+      echo "::error::failed to parse leader election lease ${component} renewTime '${renew_time}'"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+    lease_age=$((now_epoch - renew_epoch))
+    if (( lease_age < 0 )); then
+      lease_age=0
+    fi
+    echo "${component} lease renewTime=${renew_time} age=${lease_age}s"
+    if (( lease_age > LEASE_STALE_TIMEOUT_SECONDS )); then
+      echo "::error::leader election lease ${component} is stale: age=${lease_age}s exceeds ${LEASE_STALE_TIMEOUT}"
+      dump_all_control_plane_runtime_diagnostics
+      exit 1
+    fi
+  done
+}
+
+observe_stability_window() {
+  # Probe the control plane once per STABILITY_PROBE_INTERVAL_SECONDS until
+  # STABILITY_WINDOW_SECONDS have been slept, then check lease freshness. Exits 1 as soon
+  # as STABILITY_PROBE_FAILURE_THRESHOLD probes fail in a row. $1 labels the log output.
+  local label="$1"
+  local elapsed=0 probe=0 consecutive_failures=0 total_failures=0 remaining nap
+
+  echo "Observing control-plane stability for ${STABILITY_WINDOW} (${label}); probing every ${STABILITY_PROBE_INTERVAL}, failing after ${STABILITY_PROBE_FAILURE_THRESHOLD} consecutive probe failure(s)..."
+  while (( elapsed < STABILITY_WINDOW_SECONDS )); do
+    # Sleep a full interval, or only what is left of the window on the last pass.
+    remaining=$((STABILITY_WINDOW_SECONDS - elapsed))
+    nap=$((STABILITY_PROBE_INTERVAL_SECONDS < remaining ? STABILITY_PROBE_INTERVAL_SECONDS : remaining))
+    if (( nap > 0 )); then
+      sleep "${nap}"
+      elapsed=$((elapsed + nap))
+    fi
+
+    probe=$((probe + 1))
+    echo "=== Control-plane stability probe ${probe} (${elapsed}/${STABILITY_WINDOW_SECONDS}s, ${label}) ==="
+    # A success resets the streak; only consecutive failures count toward the threshold.
+    if ! probe_control_plane_api "during ${label} stability probe ${probe}"; then
+      total_failures=$((total_failures + 1))
+      consecutive_failures=$((consecutive_failures + 1))
+      echo "::warning::control-plane stability probe ${probe} failed (${consecutive_failures} consecutive, ${total_failures} total)"
+      if (( consecutive_failures >= STABILITY_PROBE_FAILURE_THRESHOLD )); then
+        echo "::error::control-plane had ${consecutive_failures} consecutive failed stability probes during ${label}"
+        dump_all_control_plane_runtime_diagnostics
+        exit 1
+      fi
+    else
+      consecutive_failures=0
+    fi
+  done
+
+  # Isolated failures below the threshold are surfaced but do not fail the window.
+  if (( total_failures > 0 )); then
+    echo "::warning::control-plane had ${total_failures} transient failed stability probe(s) during ${label}; final health checks must still pass"
+  fi
+  # The window always ends with the lease freshness gate.
+  verify_leader_lease_freshness
+}
+
+dump_api_server_health() {
+  local endpoint
+  # Print the raw response of each API server health endpoint; failures are ignored.
+  for endpoint in '/livez?verbose' '/readyz?verbose' '/healthz'; do
+    echo "=== kube-apiserver ${endpoint} ==="
+    kubectl_kind get --raw="${endpoint}" || true
+  done
+}
+
+dump_kind_node_runtime_summary() {
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+  # Best-effort node snapshot (docker stats/inspect, memory/disk/process listing, crictl); only warns if the node container cannot be inspected.
+  if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then
+    echo "::warning::cannot collect node runtime summary: kind node container ${node} not found"
+    return
+  fi
+
+  echo "=== ${node} docker stats ==="
+  docker_timeout stats --no-stream \
+    --format 'table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}' \
+    "${node}" || true
+
+  echo "=== ${node} docker inspect state ==="
+  docker_timeout inspect \
+    --format 'status={{.State.Status}} running={{.State.Running}} oomKilled={{.State.OOMKilled}} pid={{.State.Pid}} started={{.State.StartedAt}} finished={{.State.FinishedAt}}' \
+    "${node}" || true
+
+  echo "=== ${node} node pressure snapshot ==="
+  docker_timeout exec "${node}" sh -c '
+    date
+    uptime || true
+    free -h || true
+    df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
+    echo "--- top cpu/memory processes ---"
+    ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true
+  ' || true
+
+  echo "=== ${node} CRI pod/container summary ==="
+  docker_timeout exec "${node}" crictl pods || true
+  docker_timeout exec "${node}" crictl ps -a || true
+  docker_timeout exec "${node}" crictl stats || true
+}
+
+dump_static_pod_runtime_diagnostics() {
+  # Best-effort dump for static pod $1 from inside the kind control-plane node container:
+  # manifest, CRI containers (inspect/logs for at most 8), and filtered kubelet/containerd journals.
+  local component="$1" node="${KIND_CLUSTER_NAME}-control-plane"
+  local container_ids container_id count=0
+
+  if ! docker_timeout inspect "${node}" >/dev/null 2>&1; then
+    echo "::warning::cannot collect ${component} runtime diagnostics: kind node container ${node} not found"
+    return
+  fi
+
+  echo "=== ${node} ${component} static pod manifest ==="
+  # The component is passed as a positional argument so it is never re-parsed as shell code in the node.
+  docker_timeout exec "${node}" sh -c 'sed -n "1,220p" "/etc/kubernetes/manifests/$1.yaml"' sh "${component}" || true
+
+  echo "=== ${node} ${component} CRI containers ==="
+  docker_timeout exec "${node}" crictl ps -a --name "${component}" || true
+
+  container_ids=$(docker_timeout exec "${node}" crictl ps -a --name "${component}" -q 2>/dev/null || true)
+  for container_id in ${container_ids}; do
+    count=$((count + 1))
+    if (( count > 8 )); then
+      echo "Skipping remaining ${component} CRI containers after first 8 entries."
+      break
+    fi
+
+    echo "=== ${node} crictl inspect ${component} ${container_id} ==="
+    docker_timeout exec "${node}" crictl inspect "${container_id}" || true
+    echo "=== ${node} crictl logs ${component} ${container_id} ==="
+    docker_timeout exec "${node}" crictl logs --tail=200 "${container_id}" || true
+  done
+
+  echo "=== ${node} kubelet journal (${component}) ==="
+  docker_timeout exec "${node}" journalctl -u kubelet --since '45 minutes ago' --no-pager 2>/dev/null \
+    | grep -Ei "${component}|static pod|mirror pod|probe|liveness|readiness|startup|back-off|backoff|container|failed|error|oom|killed" \
+    | tail -200 || true
+
+  echo "=== ${node} containerd journal (${component}) ==="
+  docker_timeout exec "${node}" journalctl -u containerd --since '45 minutes ago' --no-pager 2>/dev/null \
+    | grep -Ei "${component}|container|task|shim|deadline|failed|error|oom|killed" \
+    | tail -200 || true
+}
+
+dump_all_control_plane_runtime_diagnostics() {
+  local component
+  # The API-level summary and health endpoints are always printed; node-level output is opt-in.
+  dump_control_plane_summary
+  dump_api_server_health
+  if [[ "${RUNTIME_DIAGNOSTICS}" == "true" ]]; then
+    dump_kind_node_runtime_summary
+    for component in ${COMPONENTS}; do
+      dump_static_pod_runtime_diagnostics "${component}"
+      kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o yaml 2>/dev/null || true
+    done
+    return
+  fi
+  echo "Skipping kind node runtime diagnostics. Set runtime_diagnostics=true to collect docker stats, crictl, and journalctl on failure."
+}
+
+dump_component_diagnostics() {  # $1: component name; dumps its pods, events and logs, then the shared control-plane diagnostics.
+  local component="$1"
+  local selector="component=${component}"
+  local pods
+  local pod
+  # Every kubectl call here is best effort (`|| true`), so a failing call does not stop the dump.
+  dump_control_plane_summary
+  kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o wide || true
+  kubectl_kind -n "${NAMESPACE}" describe pod -l "${selector}" || true
+  kubectl_kind -n "${NAMESPACE}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+  # Current and previous-instance logs (last 100 lines each) for every matching pod.
+  pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name 2>/dev/null || true)
+  while IFS= read -r pod; do
+    [[ -z "${pod}" ]] && continue
+    echo "=== ${pod} logs ==="
+    kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --tail=100 2>/dev/null || true
+    echo "=== ${pod} previous logs ==="
+    kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --previous --tail=100 2>/dev/null || true
+  done <<< "${pods}"
+  # NOTE(review): the summary above and the lease lookup below can also run inside the next call, so output may repeat.
+  dump_all_control_plane_runtime_diagnostics
+  kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o yaml 2>/dev/null || true
+}
+
+check_component() {  # $1: component name. Exits 1 unless its pods exist and become Ready; then records a restart baseline.
+  local component="$1"
+  local selector="component=${component}"
+  local pods
+  local initial_restarts
+  # A failed listing and an empty listing are reported separately so the error names the actual cause.
+  if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then
+    echo "::error::failed to list ${component} pods in ${NAMESPACE} with selector ${selector}"
+    kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+    exit 1
+  fi
+  if [[ -z "${pods}" ]]; then
+    echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}"
+    kubectl_kind -n "${NAMESPACE}" get pods -o wide || true
+    exit 1
+  fi
+  # Not Ready in time: dump diagnostics and the raw /readyz response before failing.
+  if ! wait_ready "${component}"; then
+    echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}"
+    dump_component_diagnostics "${component}"
+    kubectl_kind get --raw='/readyz' || true
+    exit 1
+  fi
+  initial_restarts=$(restart_total "${component}")
+  report_restart_baseline "${component}" "${initial_restarts}"
+  INITIAL_RESTARTS["${component}"]="${initial_restarts}"  # baseline read later by verify_stability_window
+}
+
+verify_stability_window() {  # Exits 1 if a component restarts or turns unready while STABILITY_WINDOW is observed.
+  local component
+  local initial_restarts
+  local final_restarts
+  # Window disabled ("0s"): compare the recorded baseline with MAX_RESTARTS_LIMIT (when set), then check lease freshness.
+  if [[ "${STABILITY_WINDOW}" == "0s" ]]; then
+    if [[ -n "${MAX_RESTARTS_LIMIT}" ]]; then
+      for component in ${COMPONENTS}; do
+        final_restarts="${INITIAL_RESTARTS[${component}]:-0}"
+        if (( final_restarts > MAX_RESTARTS_LIMIT )); then
+          echo "::error::${component} restartCount=${final_restarts} exceeds max_restarts=${MAX_RESTARTS_LIMIT}"
+          dump_component_diagnostics "${component}"
+          exit 1
+        fi
+      done
+    fi
+    verify_leader_lease_freshness
+    return
+  fi
+  # Window enabled: observe it, then require Ready pods and an unchanged restart total for each component.
+  observe_stability_window "primary"
+  for component in ${COMPONENTS}; do
+    initial_restarts="${INITIAL_RESTARTS[${component}]:-}"
+    if [[ -z "${initial_restarts}" ]]; then
+      echo "::error::missing initial restart count for ${component}"
+      exit 1
+    fi
+    if ! wait_ready "${component}"; then
+      echo "::error::${component} pods became unready during ${STABILITY_WINDOW}"
+      dump_component_diagnostics "${component}"
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+    final_restarts=$(restart_total "${component}")
+    if (( final_restarts > initial_restarts )); then
+      echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}"
+      dump_component_diagnostics "${component}"
+      kubectl_kind get --raw='/readyz' || true
+      exit 1
+    fi
+    INITIAL_RESTARTS["${component}"]="${final_restarts}"  # store the latest total as the new baseline
+  done
+}
+
+for component in ${COMPONENTS}; do  # unquoted: word-splits into one component name per iteration
+  check_component "${component}"  # exits 1 on the first component that is missing or not Ready
+done
+verify_stability_window  # runs only after every component passed its initial check
+require_readyz "after stability window"
diff --git a/.github/actions/gpu-chainsaw-health/action.yml b/.github/actions/gpu-chainsaw-health/action.yml
new file mode 100644
index 000000000..2b13ef645
--- /dev/null
+++ b/.github/actions/gpu-chainsaw-health/action.yml
@@ -0,0 +1,54 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'GPU Chainsaw Health'
+description: 'Run Chainsaw runtime health checks for a GPU Kind test cluster.'
+
+inputs:
+  cluster_name:
+    description: 'Kind cluster name'
+    required: true
+  chainsaw_path:
+    description: 'Path to the Chainsaw health-check directory'
+    required: true
+  chainsaw_version:
+    description: 'Chainsaw version'
+    required: true
+  chainsaw_sha256:
+    description: 'Chainsaw SHA256 checksum for linux/amd64'
+    required: true
+  test_timeout:
+    description: 'Outer timeout for the Chainsaw test command'
+    required: false
+    default: '15m'
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Install chainsaw
+      uses: ./.github/actions/setup-build-tools
+      with:
+        install_chainsaw: 'true'
+        chainsaw_version: ${{ inputs.chainsaw_version }}
+        chainsaw_sha256: ${{ inputs.chainsaw_sha256 }}
+
+    - name: Run chainsaw health checks
+      shell: bash
+      # Inputs reach the shell only through env vars: expanding an input
+      # expression inside `run:` would let its value be parsed as shell code.
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+        CHAINSAW_TEST_TIMEOUT: ${{ inputs.test_timeout }}
+        CHAINSAW_PATH: ${{ inputs.chainsaw_path }}
+      run: bash "${{ github.workspace }}/.github/scripts/gpu-chainsaw-health.sh" "${CHAINSAW_PATH}"
diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml
index b9bc3060f..8e81aea5a 100644
--- a/.github/actions/gpu-cluster-setup/action.yml
+++ b/.github/actions/gpu-cluster-setup/action.yml
@@ -15,18 +15,90 @@
 name: 'GPU Cluster Setup'
 description: 'Creates a GPU-enabled kind cluster using nvkind with CDI-mode GPU passthrough.'
 
+inputs:  # each input is handed to a helper script as an upper-case env var by the steps below
+  kind_node_image:  # empty: the warm-up step is skipped and the create script adds no --image flag
+    description: 'Kind node image for nvkind cluster creation'
+    required: false
+    default: ''
+  min_gpu_count:
+    description: 'Minimum visible GPU count required before cluster setup'
+    required: true
+  gpu_model_pattern:
+    description: 'Optional grep-compatible GPU model pattern required for visible GPUs'
+    required: false
+    default: ''
+  min_free_disk_gb:  # used by the preflight, capacity-check and image warm-up steps
+    description: 'Minimum free disk space on / required before cluster setup'
+    required: false
+    default: '20'
+  min_available_memory_gb:  # used by the preflight and capacity-check steps
+    description: 'Minimum available system memory required before cluster setup'
+    required: false
+    default: '8'
+  cluster_create_timeout:  # create-gpu-kind-cluster.sh requires the form <integer><s|m|h>
+    description: 'Timeout for nvkind cluster create'
+    required: false
+    default: '900s'
+  control_plane_resource_patches:  # create-gpu-kind-cluster.sh accepts exactly 'true' or 'false'
+    description: 'Apply kubeadm patches that raise control-plane static pod resource requests'
+    required: false
+    default: 'false'
+  control_plane_leader_election_tuning:  # create-gpu-kind-cluster.sh accepts exactly 'true' or 'false'
+    description: 'Increase kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes'
+    required: false
+    default: 'false'
+  leader_election_lease_duration:  # format-checked as <integer><s|m|h> even when tuning is off
+    description: 'Leader election lease duration when control_plane_leader_election_tuning is true'
+    required: false
+    default: '300s'
+  leader_election_renew_deadline:  # format-checked as <integer><s|m|h> even when tuning is off
+    description: 'Leader election renew deadline when control_plane_leader_election_tuning is true'
+    required: false
+    default: '240s'
+  leader_election_retry_period:  # format-checked as <integer><s|m|h> even when tuning is off
+    description: 'Leader election retry period when control_plane_leader_election_tuning is true'
+    required: false
+    default: '10s'
+  api_server_cpu_request:  # CPU requests: whole cores or millicores, e.g. 1 or 500m
+    description: 'kube-apiserver CPU request when control_plane_resource_patches is true'
+    required: false
+    default: '1000m'
+  api_server_memory_request:  # memory requests: integer with an optional unit suffix, e.g. 256Mi
+    description: 'kube-apiserver memory request when control_plane_resource_patches is true'
+    required: false
+    default: '1Gi'
+  controller_manager_cpu_request:
+    description: 'kube-controller-manager CPU request when control_plane_resource_patches is true'
+    required: false
+    default: '1000m'
+  controller_manager_memory_request:
+    description: 'kube-controller-manager memory request when control_plane_resource_patches is true'
+    required: false
+    default: '512Mi'
+  scheduler_cpu_request:
+    description: 'kube-scheduler CPU request when control_plane_resource_patches is true'
+    required: false
+    default: '500m'
+  scheduler_memory_request:
+    description: 'kube-scheduler memory request when control_plane_resource_patches is true'
+    required: false
+    default: '256Mi'
+  etcd_cpu_request:
+    description: 'etcd CPU request when control_plane_resource_patches is true'
+    required: false
+    default: '1000m'
+  etcd_memory_request:
+    description: 'etcd memory request when control_plane_resource_patches is true'
+    required: false
+    default: '1Gi'
+
 runs:
   using: 'composite'
   steps:
 
     - name: Validate environment
       shell: bash
-      run: |
-        if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then
-          echo "::error::KIND_CLUSTER_NAME environment variable must be set by the calling workflow"
-          exit 1
-        fi
-
+      run: bash "${{ github.action_path }}/validate-env.sh"  # run via bash, so the script needs no executable bit
     - name: Load versions
       id: versions
       uses: ./.github/actions/load-versions
@@ -52,40 +124,61 @@ runs:
 
     - name: Install nvkind
       shell: bash
-      run: |
-        go install github.com/NVIDIA/nvkind/cmd/nvkind@latest
-        nvkind --help
-
-    - name: Verify host GPU
+      env:
+        NVKIND_VERSION: ${{ steps.versions.outputs.nvkind }}  # version comes from the load-versions step, replacing @latest
+      run: bash "${{ github.action_path }}/install-nvkind.sh"
+    - name: Runner preflight  # replaces the former "Verify host GPU" step
       shell: bash
-      run: nvidia-smi -L
-
+      env:
+        GPU_MODEL_PATTERN: ${{ inputs.gpu_model_pattern }}
+        MIN_GPU_COUNT: ${{ inputs.min_gpu_count }}
+        MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
+        MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }}
+      run: bash "${{ github.action_path }}/runner-preflight.sh"
     - name: Configure NVIDIA Container Toolkit for kind
       shell: bash
-      run: |
-        sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
-        sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place
-        sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place
-        sudo systemctl restart docker
-
+      run: bash "${{ github.action_path }}/configure-nvidia-container-toolkit.sh"
     - name: Validate Docker GPU access
       shell: bash
-      run: docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L
-
+      run: bash "${{ github.action_path }}/validate-docker-gpu-access.sh"
     - name: Increase inotify limits
       shell: bash
-      run: |
-        sudo sysctl -w fs.inotify.max_user_watches=524288
-        sudo sysctl -w fs.inotify.max_user_instances=1024
-
+      run: bash "${{ github.action_path }}/increase-inotify-limits.sh"
+    - name: Delete stale kind cluster
+      shell: bash
+      run: bash "${{ github.action_path }}/delete-stale-kind-cluster.sh"
+    - name: Check runner capacity  # checks disk and memory thresholds again, after the stale-cluster cleanup
+      shell: bash
+      env:
+        MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
+        MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }}
+      run: bash "${{ github.action_path }}/check-runner-capacity.sh"
+    - name: Warm kind node image
+      if: ${{ inputs.kind_node_image != '' }}  # skipped when no explicit node image was requested
+      shell: bash
+      env:
+        KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
+        MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }}
+      run: bash "${{ github.action_path }}/warm-kind-node-image.sh"
     - name: Create GPU-enabled kind cluster
       shell: bash
-      run: |
-        nvkind cluster create --name="${KIND_CLUSTER_NAME}" || echo "::warning::nvkind cluster create returned non-zero (umount errors are expected with CDI mode)"
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide
-
+      env:  # tuning inputs are passed as env vars; create-gpu-kind-cluster.sh validates them before use
+        KIND_NODE_IMAGE: ${{ inputs.kind_node_image }}
+        CLUSTER_CREATE_TIMEOUT: ${{ inputs.cluster_create_timeout }}
+        CONTROL_PLANE_RESOURCE_PATCHES: ${{ inputs.control_plane_resource_patches }}
+        CONTROL_PLANE_LEADER_ELECTION_TUNING: ${{ inputs.control_plane_leader_election_tuning }}
+        LEADER_ELECTION_LEASE_DURATION: ${{ inputs.leader_election_lease_duration }}
+        LEADER_ELECTION_RENEW_DEADLINE: ${{ inputs.leader_election_renew_deadline }}
+        LEADER_ELECTION_RETRY_PERIOD: ${{ inputs.leader_election_retry_period }}
+        API_SERVER_CPU_REQUEST: ${{ inputs.api_server_cpu_request }}
+        API_SERVER_MEMORY_REQUEST: ${{ inputs.api_server_memory_request }}
+        CONTROLLER_MANAGER_CPU_REQUEST: ${{ inputs.controller_manager_cpu_request }}
+        CONTROLLER_MANAGER_MEMORY_REQUEST: ${{ inputs.controller_manager_memory_request }}
+        SCHEDULER_CPU_REQUEST: ${{ inputs.scheduler_cpu_request }}
+        SCHEDULER_MEMORY_REQUEST: ${{ inputs.scheduler_memory_request }}
+        ETCD_CPU_REQUEST: ${{ inputs.etcd_cpu_request }}
+        ETCD_MEMORY_REQUEST: ${{ inputs.etcd_memory_request }}
+      run: bash "${{ github.action_path }}/create-gpu-kind-cluster.sh"
     - name: Print GPUs (nvkind)
       shell: bash
       run: nvkind cluster print-gpus --name="${KIND_CLUSTER_NAME}"
diff --git a/.github/actions/gpu-cluster-setup/check-runner-capacity.sh b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh
new file mode 100644
index 000000000..ff6c3168e
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Fails the step unless / has at least MIN_FREE_DISK_GB GiB free and the host
+# reports at least MIN_AVAILABLE_MEMORY_GB GiB of available memory.
+
+# Bash arithmetic treats an empty variable as 0 and a leading-zero number as
+# octal, so a malformed threshold would silently weaken the checks below.
+require_whole_number() {
+  local name="$1"
+  local value="$2"
+
+  if ! [[ "${value}" =~ ^(0|[1-9][0-9]*)$ ]]; then
+    echo "::error::${name} must be a non-negative integer without leading zeros, got '${value}'"
+    exit 1
+  fi
+}
+
+require_whole_number MIN_FREE_DISK_GB "${MIN_FREE_DISK_GB:-}"
+require_whole_number MIN_AVAILABLE_MEMORY_GB "${MIN_AVAILABLE_MEMORY_GB:-}"
+
+# The disk comparison is done in bytes; the GiB figure is only for messages.
+free_disk_bytes=$(df -B1 --output=avail / | tail -1 | tr -dc '0-9')
+min_free_disk_bytes=$((MIN_FREE_DISK_GB * 1024 * 1024 * 1024))
+free_disk_gib=$((free_disk_bytes / 1024 / 1024 / 1024))
+if (( free_disk_bytes < min_free_disk_bytes )); then
+  echo "::error::free disk on / is ${free_disk_bytes} bytes (${free_disk_gib}GiB), need at least ${min_free_disk_bytes} bytes (${MIN_FREE_DISK_GB}GiB)"
+  exit 1
+fi
+
+# Field 7 of the "Mem:" row of `free -g` (the "available" column in procps-ng).
+available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}')
+if (( available_memory_gb < MIN_AVAILABLE_MEMORY_GB )); then
+  echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB"
+  exit 1
+fi
+
+echo "Runner capacity is sufficient: disk=${free_disk_gib}GiB (${free_disk_bytes} bytes) memory=${available_memory_gb}GiB"
diff --git a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
new file mode 100644
index 000000000..84635a988
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
+sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place
+sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place
+set +e
+timeout 120s sudo systemctl restart docker
+restart_status=$?
+set -e
+if (( restart_status != 0 )); then
+  echo "::error::Docker restart failed after NVIDIA runtime configuration"
+  sudo systemctl status docker --no-pager || true
+  sudo journalctl -u docker --since "10 minutes ago" --no-pager || true
+  exit "${restart_status}"
+fi
+
+for attempt in $(seq 1 30); do
+  if systemctl is-active --quiet docker && timeout 5s docker info >/dev/null 2>&1; then
+    echo "Docker is healthy after NVIDIA runtime configuration."
+    exit 0
+  fi
+  echo "Waiting for Docker to become healthy... (${attempt}/30)"
+  sleep 2
+done
+
+echo "::error::Docker did not become healthy after NVIDIA runtime configuration"
+sudo systemctl status docker --no-pager || true
+exit 1
diff --git a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
new file mode 100644
index 000000000..19ef485cb
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
@@ -0,0 +1,517 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+validate_duration_input() {
+  local input_name="$1"
+  local input_value="$2"
+
+  if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then
+    echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'"
+    exit 1
+  fi
+}
+
+validate_cpu_quantity_input() {
+  local input_name="$1"
+  local input_value="$2"
+
+  if ! [[ "${input_value}" =~ ^([0-9]+m|[0-9]+)$ ]]; then
+    echo "::error::${input_name} must be a CPU quantity like 500m, 1000m, or 1; got '${input_value}'"
+    exit 1
+  fi
+}
+
+validate_memory_quantity_input() {  # $1: input name for the message, $2: value; exits 1 when the value is malformed.
+  local input_name="$1"
+  local input_value="$2"
+  # Quantity suffixes are case-sensitive: Ki/Mi/Gi/Ti/Pi/Ei or k/M/G/T/P/E (plus m); 1K, 1g or 1t are invalid.
+  if ! [[ "${input_value}" =~ ^[0-9]+([EPTGM]i?|Ki|[km])?$ ]]; then
+    echo "::error::${input_name} must be a memory quantity like 256Mi, 1Gi, or 1024; got '${input_value}'"
+    exit 1
+  fi
+}
+
+validate_bool_input() {
+  local input_name="$1"
+  local input_value="$2"
+
+  case "${input_value}" in
+    true|false) ;;
+    *)
+      echo "::error::${input_name} must be true or false, got '${input_value}'"
+      exit 1
+      ;;
+  esac
+}
+
+kubectl_kind() {  # kubectl against the kind cluster's context: 10s request timeout, 30s hard cap via timeout(1).
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+kubectl_kind_wait() {  # Same wrapper sized for long waits: 300s request timeout, 330s hard cap.
+  timeout 330s kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+docker_timeout() {
+  # $1 is the timeout(1) duration; every remaining argument is passed to docker unchanged.
+  local duration="$1"
+  timeout "${duration}" docker "${@:2}"
+}
+
+validate_generated_control_plane_config() {  # Checks the files in patch_dir and config_template; exits 1 on the first problem.
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+    for patch_file in "${patch_dir}"/*.yaml; do
+      if ! grep -Fxq 'apiVersion: v1' "${patch_file}" ||
+        ! grep -Fxq 'kind: Pod' "${patch_file}" ||
+        ! grep -Eq '^[[:space:]]+resources:$' "${patch_file}"; then
+        echo "::error::rendered static pod patch ${patch_file} is missing expected top-level YAML"
+        sed 's/^/  /' "${patch_file}" || true
+        exit 1
+      fi
+    done
+    # Match the mount target itself: a bare "extraMounts:" line also appears in the worker template.
+    if ! grep -Fq 'containerPath: /patches' "${config_template}" ||
+      ! grep -Fq 'directory: /patches' "${config_template}"; then
+      echo "::error::rendered kind config is missing control-plane patch mounts"
+      sed 's/^/  /' "${config_template}" || true
+      exit 1
+    fi
+  fi
+  # Leader election: both kubeadm API versions and every tuned value must appear in the rendered config.
+  if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+    for expected in \
+      'apiVersion: kubeadm.k8s.io/v1beta3' \
+      'apiVersion: kubeadm.k8s.io/v1beta4' \
+      "leader-elect-lease-duration: \"${LEADER_ELECTION_LEASE_DURATION}\"" \
+      "leader-elect-renew-deadline: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \
+      "leader-elect-retry-period: \"${LEADER_ELECTION_RETRY_PERIOD}\"" \
+      "value: \"${LEADER_ELECTION_LEASE_DURATION}\"" \
+      "value: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \
+      "value: \"${LEADER_ELECTION_RETRY_PERIOD}\""; do
+      if ! grep -Fq "${expected}" "${config_template}"; then
+        echo "::error::rendered kind config is missing expected leader election setting: ${expected}"
+        sed 's/^/  /' "${config_template}" || true
+        exit 1
+      fi
+    done
+  fi
+}
+
+validate_duration_input cluster_create_timeout "${CLUSTER_CREATE_TIMEOUT}"  # durations are checked even when tuning is off
+validate_duration_input leader_election_lease_duration "${LEADER_ELECTION_LEASE_DURATION}"
+validate_duration_input leader_election_renew_deadline "${LEADER_ELECTION_RENEW_DEADLINE}"
+validate_duration_input leader_election_retry_period "${LEADER_ELECTION_RETRY_PERIOD}"
+# Arguments for `nvkind cluster create`; --image is added only when an image was supplied.
+CREATE_ARGS=(--name="${KIND_CLUSTER_NAME}")
+if [[ -n "${KIND_NODE_IMAGE}" ]]; then
+  echo "Using kind node image: ${KIND_NODE_IMAGE}"
+  CREATE_ARGS+=(--image="${KIND_NODE_IMAGE}")
+fi
+# Both toggles default to false when unset or empty and must then be exactly true or false.
+CONTROL_PLANE_RESOURCE_PATCHES="${CONTROL_PLANE_RESOURCE_PATCHES:-false}"
+CONTROL_PLANE_LEADER_ELECTION_TUNING="${CONTROL_PLANE_LEADER_ELECTION_TUNING:-false}"
+validate_bool_input control_plane_resource_patches "${CONTROL_PLANE_RESOURCE_PATCHES}"
+validate_bool_input control_plane_leader_election_tuning "${CONTROL_PLANE_LEADER_ELECTION_TUNING}"
+
+if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+  patch_dir="$(mktemp -d)"  # receives the static pod patch files; mounted into the control-plane node at /patches
+  config_template="$(mktemp)"  # config template assembled piecewise below and passed via --config-template
+  cleanup_generated_config() {
+    [[ -n "${patch_dir:-}" ]] && rm -rf "${patch_dir}"
+    [[ -n "${config_template:-}" ]] && rm -f "${config_template}"
+  }
+  trap cleanup_generated_config EXIT  # remove both temporary paths when the script exits
+
+  # Keep YAML heredocs at column 0; indentation is literal content.
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+  validate_cpu_quantity_input api_server_cpu_request "${API_SERVER_CPU_REQUEST}"
+  validate_memory_quantity_input api_server_memory_request "${API_SERVER_MEMORY_REQUEST}"
+  validate_cpu_quantity_input controller_manager_cpu_request "${CONTROLLER_MANAGER_CPU_REQUEST}"
+  validate_memory_quantity_input controller_manager_memory_request "${CONTROLLER_MANAGER_MEMORY_REQUEST}"
+  validate_cpu_quantity_input scheduler_cpu_request "${SCHEDULER_CPU_REQUEST}"
+  validate_memory_quantity_input scheduler_memory_request "${SCHEDULER_MEMORY_REQUEST}"
+  validate_cpu_quantity_input etcd_cpu_request "${ETCD_CPU_REQUEST}"
+  validate_memory_quantity_input etcd_memory_request "${ETCD_MEMORY_REQUEST}"
+  # One patch file per static pod; the "<target>+strategic.yaml" names follow kubeadm's patch-file naming scheme.
+  cat > "${patch_dir}/kube-apiserver+strategic.yaml" <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: kube-apiserver
+  namespace: kube-system
+spec:
+  containers:
+  - name: kube-apiserver
+    resources:
+      requests:
+        cpu: ${API_SERVER_CPU_REQUEST}
+        memory: ${API_SERVER_MEMORY_REQUEST}
+EOF
+
+  cat > "${patch_dir}/kube-controller-manager+strategic.yaml" <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: kube-controller-manager
+  namespace: kube-system
+spec:
+  containers:
+  - name: kube-controller-manager
+    resources:
+      requests:
+        cpu: ${CONTROLLER_MANAGER_CPU_REQUEST}
+        memory: ${CONTROLLER_MANAGER_MEMORY_REQUEST}
+EOF
+
+  cat > "${patch_dir}/kube-scheduler+strategic.yaml" <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: kube-scheduler
+  namespace: kube-system
+spec:
+  containers:
+  - name: kube-scheduler
+    resources:
+      requests:
+        cpu: ${SCHEDULER_CPU_REQUEST}
+        memory: ${SCHEDULER_MEMORY_REQUEST}
+EOF
+
+  cat > "${patch_dir}/etcd+strategic.yaml" <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: etcd
+  namespace: kube-system
+spec:
+  containers:
+  - name: etcd
+    resources:
+      requests:
+        cpu: ${ETCD_CPU_REQUEST}
+        memory: ${ETCD_MEMORY_REQUEST}
+EOF
+  fi
+  # The quoted 'EOF' heredocs keep {{ ... }} and $ literal; only the unquoted ones expand shell variables.
+  cat > "${config_template}" <<'EOF'
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+{{- if hasKey $ "name" }}
+name: {{ $.name }}
+{{- end }}
+nodes:
+- role: control-plane
+  {{- if hasKey $ "image" }}
+  image: {{ $.image }}
+  {{- end }}
+EOF
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+  cat >> "${config_template}" <<EOF
+  extraMounts:
+  - hostPath: ${patch_dir}
+    containerPath: /patches
+EOF
+  fi
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then  # NOTE(review): same test as the enclosing if, so always true here
+  cat >> "${config_template}" <<'EOF'
+  kubeadmConfigPatches:
+EOF
+  fi
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+  cat >> "${config_template}" <<'EOF'
+  - |
+    kind: InitConfiguration
+    patches:
+      directory: /patches
+EOF
+  fi
+  if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+  # kind v0.31 renders kubeadm v1beta3. Keep a v1beta4 patch too so
+  # this remains valid when a future kind image switches API versions.
+  cat >> "${config_template}" <<EOF
+  - |
+    kind: ClusterConfiguration
+    apiVersion: kubeadm.k8s.io/v1beta3
+    controllerManager:
+      extraArgs:
+        leader-elect-lease-duration: "${LEADER_ELECTION_LEASE_DURATION}"
+        leader-elect-renew-deadline: "${LEADER_ELECTION_RENEW_DEADLINE}"
+        leader-elect-retry-period: "${LEADER_ELECTION_RETRY_PERIOD}"
+    scheduler:
+      extraArgs:
+        leader-elect-lease-duration: "${LEADER_ELECTION_LEASE_DURATION}"
+        leader-elect-renew-deadline: "${LEADER_ELECTION_RENEW_DEADLINE}"
+        leader-elect-retry-period: "${LEADER_ELECTION_RETRY_PERIOD}"
+  - |
+    kind: ClusterConfiguration
+    apiVersion: kubeadm.k8s.io/v1beta4
+    controllerManager:
+      extraArgs:
+      - name: leader-elect-lease-duration
+        value: "${LEADER_ELECTION_LEASE_DURATION}"
+      - name: leader-elect-renew-deadline
+        value: "${LEADER_ELECTION_RENEW_DEADLINE}"
+      - name: leader-elect-retry-period
+        value: "${LEADER_ELECTION_RETRY_PERIOD}"
+    scheduler:
+      extraArgs:
+      - name: leader-elect-lease-duration
+        value: "${LEADER_ELECTION_LEASE_DURATION}"
+      - name: leader-elect-renew-deadline
+        value: "${LEADER_ELECTION_RENEW_DEADLINE}"
+      - name: leader-elect-retry-period
+        value: "${LEADER_ELECTION_RETRY_PERIOD}"
+EOF
+  fi
+  cat >> "${config_template}" <<'EOF'
+{{- range $.workers }}
+- role: worker
+  {{- if hasKey $ "image" }}
+  image: {{ $.image }}
+  {{- end }}
+
+  {{- if hasKey . "devices" }}
+  {{- $devices := .devices }}
+  {{- if not (kindIs "slice" $devices) }}
+    {{- $devices = list .devices }}
+  {{- end }}
+  extraMounts:
+    # We inject all NVIDIA GPUs using the nvidia-container-runtime.
+    # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set
+    # in `/etc/nvidia-container-runtime/config.toml`
+    {{- range $d := $devices }}
+    - hostPath: /dev/null
+      containerPath: /var/run/nvidia-container-devices/{{ $d }}
+    {{- end }}
+  {{- end }}
+{{- end }}
+EOF
+  if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+    echo "Applying control-plane static pod resource patches from ${patch_dir}:"
+    for patch_file in "${patch_dir}"/*.yaml; do
+      echo "--- ${patch_file}"
+      sed 's/^/  /' "${patch_file}"
+    done
+  fi
+  if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+    echo "Increasing kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes:"
+    echo "  lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
+    echo "  renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
+    echo "  retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
+  fi
+  validate_generated_control_plane_config  # exits 1 if the rendered files lack the expected content
+  CREATE_ARGS+=(--config-template="${config_template}")
+fi
+
+set +e
+timeout "${CLUSTER_CREATE_TIMEOUT}" nvkind cluster create "${CREATE_ARGS[@]}"
+create_status=$?
+set -e
+case "${create_status}" in
+  0) ;;
+  124)
+    echo "::warning::nvkind cluster create timed out after ${CLUSTER_CREATE_TIMEOUT}; continuing only if post-create checks pass"
+    ;;
+  *)
+    echo "::warning::nvkind cluster create returned status ${create_status}; continuing only if post-create checks pass"
+    ;;
+esac
+
+kubectl_kind_wait wait --for=condition=Ready nodes --all --timeout=300s  # hard gate: every node must report Ready
+kubectl_kind cluster-info
+kubectl_kind get nodes -o wide
+kubectl_kind describe nodes | \
+  grep -E "^(Name:|Capacity:|Allocatable:|Allocated resources:|  cpu|  memory|  nvidia.com/gpu)" || true
+# The two listings below are informational; `|| true` keeps a failed docker/kubectl call from aborting setup.
+echo "=== Kind node container resources ==="
+docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+  --format '{{.Names}}' | sort | while read -r node_container; do
+    [[ -z "${node_container}" ]] && continue
+    docker_timeout 30s inspect "${node_container}" \
+      --format '{{.Name}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}'
+  done || true
+
+echo "=== Control-plane resource requests/limits ==="
+kubectl_kind -n kube-system \
+  get pods -l tier=control-plane -o json | jq -r '
+    .items[] as $pod |
+    $pod.metadata.name,
+    ($pod.spec.containers[] |
+      "  " + .name +
+      " requests=" + ((.resources.requests // {}) | tostring) +
+      " limits=" + ((.resources.limits // {}) | tostring))
+  ' || true
+
+normalize_cpu_request() {
+  # Print whole-core millicpu quantities (e.g. "2000m") as plain cores ("2");
+  # every other value is printed unchanged.
+  local quantity="$1"
+  if [[ "${quantity}" =~ ^([0-9]+)000m$ ]]; then
+    quantity="${BASH_REMATCH[1]}"
+  fi
+  echo "${quantity}"
+}
+
+control_plane_request() {
+  # Print the resource request named by $2 (e.g. cpu, memory) from the first
+  # container of the first kube-system pod labelled component=$1.
+  local selector="component=$1"
+  local field=".items[0].spec.containers[0].resources.requests.$2"
+
+  kubectl_kind -n kube-system get pod -l "${selector}" -o "jsonpath={${field}}"
+}
+
+assert_control_plane_request() {
+  # Exit 1 with an ::error:: unless component $1 requests exactly $3 of resource $2; a failed lookup is reported the same way.
+  local component="$1" resource="$2" expected="$3" actual
+  if ! actual="$(control_plane_request "${component}" "${resource}")"; then
+    echo "::error::failed to read ${component} ${resource} request"
+    exit 1
+  fi
+  if [[ "${resource}" == "cpu" ]]; then  # "2000m" and "2" must compare equal
+    expected="$(normalize_cpu_request "${expected}")"
+    actual="$(normalize_cpu_request "${actual}")"
+  fi
+  if [[ "${actual}" != "${expected}" ]]; then
+    echo "::error::${component} ${resource} request is '${actual}', expected '${expected}'"
+    exit 1
+  fi
+  echo "${component} ${resource} request verified: ${actual}"
+}
+
+control_plane_command_args() {
+  # Print, one per line, command + args of the first container of the first
+  # kube-system pod labelled component=$1, as returned by kubectl.
+  local selector="component=$1"
+  kubectl_kind -n kube-system get pod -l "${selector}" -o json |
+    jq -r '.items[0].spec.containers[0] | ((.command // []) + (.args // []))[]?'
+}
+
+static_pod_manifest_contains_arg() {
+  # Succeed when the static pod manifest for component $1 on the control-plane
+  # node contains the fixed string "- $2".
+  local manifest="/etc/kubernetes/manifests/$1.yaml"
+  local needle="- $2"
+  docker_timeout 30s exec "${KIND_CLUSTER_NAME}-control-plane" grep -Fq -- "${needle}" "${manifest}"
+}
+
+running_static_pod_container_contains_arg() {
+  local component="$1"
+  local expected="$2"
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+  local container_ids
+  local container_id
+  local inspect_output
+  # Succeeds when a running CRI container matching ${component} on the control-plane node carries ${expected}; returns 1 otherwise.
+  if ! container_ids="$(docker_timeout 30s exec "${node}" crictl ps --name "${component}" -q 2>/dev/null)"; then
+    return 1
+  fi
+  [[ -z "${container_ids}" ]] && return 1
+  # Prefer an exact element match in the runtime spec's process args; otherwise (no exact match, or unparsable output) fall back to a fixed-string substring search.
+  for container_id in ${container_ids}; do
+    inspect_output="$(docker_timeout 30s exec "${node}" crictl inspect "${container_id}" 2>/dev/null || true)"
+    if jq -e --arg expected "${expected}" '
+      ([.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?] | index($expected)) != null
+    ' >/dev/null 2>&1 <<< "${inspect_output}" || grep -Fq -- "${expected}" <<< "${inspect_output}"; then
+      return 0
+    fi
+  done
+  return 1
+}
+
+dump_running_static_pod_container_args() {
+  # Best-effort diagnostic: print the runtime args of every running CRI container
+  # matching component $1 on the control-plane node.
+  local component="$1" node="${KIND_CLUSTER_NAME}-control-plane"
+  local ids id
+  echo "Running ${component} CRI container args:"
+  ids="$(docker_timeout 30s exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)"
+  if [[ -z "${ids}" ]]; then
+    echo "(no running ${component} CRI containers found)"
+    return
+  fi
+  # Unquoted on purpose: one container id per whitespace-separated word.
+  for id in ${ids}; do
+    echo "--- ${id} ---"
+    docker_timeout 30s exec "${node}" crictl inspect "${id}" 2>/dev/null | jq -r '
+      [.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?][]?
+    ' || true
+  done
+}
+
+dump_static_pod_manifest() {
+  # Best-effort diagnostic: print the static pod manifest of component $1 (first 220 lines).
+  local component="$1" manifest
+  manifest="/etc/kubernetes/manifests/${component}.yaml"
+  echo "Static pod manifest /etc/kubernetes/manifests/${component}.yaml:"
+  docker_timeout 30s exec "${KIND_CLUSTER_NAME}-control-plane" sed -n '1,220p' "${manifest}" || true
+}
+
+assert_control_plane_arg() {
+  local component="$1"
+  local expected="$2"
+  local attempt
+  local command_args
+  # Poll up to 12 times: accept the flag from the pod spec kubectl returns or from the running CRI container; keep waiting only while the on-disk manifest already has it.
+  for attempt in $(seq 1 12); do
+    command_args="$(control_plane_command_args "${component}" || true)"
+    if grep -Fxq -- "${expected}" <<< "${command_args}"; then
+      echo "${component} command/args verified: ${expected}"
+      return
+    fi
+    if running_static_pod_container_contains_arg "${component}" "${expected}"; then
+      echo "${component} running CRI container args verified: ${expected} (live mirror pod omitted it)"
+      return
+    fi
+    if static_pod_manifest_contains_arg "${component}" "${expected}"; then
+      echo "::warning::${component} static pod manifest has ${expected}, but the running container does not yet; waiting for kubelet to converge (${attempt}/12)"
+      sleep 5
+      continue
+    fi
+    # Not even the manifest has the flag, so waiting cannot help.
+    break
+  done
+  # Out of attempts or bailed early: report what was observed, then fail the job.
+  echo "::error::${component} running command/args does not contain ${expected}"
+  echo "Observed live command/args:"
+  echo "${command_args:-}"
+  dump_running_static_pod_container_args "${component}"
+  dump_static_pod_manifest "${component}"
+  exit 1
+}
+
+if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+  echo "Verifying control-plane resource patches..."
+  assert_control_plane_request kube-apiserver cpu "${API_SERVER_CPU_REQUEST}"
+  assert_control_plane_request kube-apiserver memory "${API_SERVER_MEMORY_REQUEST}"
+  assert_control_plane_request kube-controller-manager cpu "${CONTROLLER_MANAGER_CPU_REQUEST}"
+  assert_control_plane_request kube-controller-manager memory "${CONTROLLER_MANAGER_MEMORY_REQUEST}"
+  assert_control_plane_request kube-scheduler cpu "${SCHEDULER_CPU_REQUEST}"
+  assert_control_plane_request kube-scheduler memory "${SCHEDULER_MEMORY_REQUEST}"
+  assert_control_plane_request etcd cpu "${ETCD_CPU_REQUEST}"
+  assert_control_plane_request etcd memory "${ETCD_MEMORY_REQUEST}"
+fi
+
+if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+  echo "Verifying control-plane leader election timeout patches..."
+  for component in kube-controller-manager kube-scheduler; do
+    assert_control_plane_arg "${component}" "--leader-elect-lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
+    assert_control_plane_arg "${component}" "--leader-elect-renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
+    assert_control_plane_arg "${component}" "--leader-elect-retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
+  done
+fi
diff --git a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
new file mode 100644
index 000000000..75d113151
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
+docker_timeout() {  # docker with a 30-second cap per invocation, enforced by timeout(1)
+  timeout 30s docker "$@"
+}
+
+read_kind_container_ids() {
+  local output
+  # stderr is folded into the capture so a failed query can echo docker's own message before exiting.
+  if ! output="$(docker_timeout ps -aq --filter "label=${kind_cluster_label}" 2>&1)"; then
+    echo "::error::failed to query stale kind containers for ${KIND_CLUSTER_NAME}"
+    echo "${output}"
+    exit 1
+  fi
+  # Result goes to the global remaining_containers array; mapfile is skipped on empty output because a here-string would yield one empty element.
+  remaining_containers=()
+  if [[ -n "${output}" ]]; then
+    mapfile -t remaining_containers <<< "${output}"
+  fi
+}
+
+# List first, match second: `kind ... | grep -q` can SIGPIPE kind and, under pipefail, read as "no cluster".
+if grep -Fxq -- "${KIND_CLUSTER_NAME}" <<< "$(kind get clusters || true)"; then
+  echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}"
+  timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}" ||
+    echo "::warning::kind delete cluster timed out or failed; falling back to direct container cleanup"
+else
+  echo "No stale kind cluster named ${KIND_CLUSTER_NAME}"
+fi
+
+read_kind_container_ids  # refresh remaining_containers after the kind-level delete attempt
+if (( ${#remaining_containers[@]} > 0 )); then
+  echo "Removing stale containers for ${KIND_CLUSTER_NAME}:"
+  docker_timeout ps -a --filter "label=${kind_cluster_label}"  # show what is about to be removed
+  docker_timeout rm -f "${remaining_containers[@]}"
+fi
+
+read_kind_container_ids  # re-query: anything still listed survived rm -f
+if (( ${#remaining_containers[@]} > 0 )); then
+  echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:"
+  docker_timeout ps -a --filter "label=${kind_cluster_label}"
+  exit 1
+fi
diff --git a/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh b/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh
new file mode 100644
index 000000000..843496a38
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+sudo sysctl -w fs.inotify.max_user_watches=524288
+sudo sysctl -w fs.inotify.max_user_instances=1024
diff --git a/.github/actions/gpu-cluster-setup/install-nvkind.sh b/.github/actions/gpu-cluster-setup/install-nvkind.sh
new file mode 100644
index 000000000..c2200e078
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/install-nvkind.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+if [[ -z "${NVKIND_VERSION:-}" ]]; then
+  echo "::error::NVKIND_VERSION must be set"
+  exit 1
+fi
+
+go install "github.com/NVIDIA/nvkind/cmd/nvkind@${NVKIND_VERSION}"
+nvkind_bin="${GOBIN:-$(go env GOPATH)/bin}/nvkind"
+"${nvkind_bin}" --help
diff --git a/.github/actions/gpu-cluster-setup/runner-preflight.sh b/.github/actions/gpu-cluster-setup/runner-preflight.sh
new file mode 100644
index 000000000..70d38ecf5
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/runner-preflight.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+
+echo "=== Runner baseline ==="
+date -u
+hostname
+uptime
+nproc
+free -h
+df -h /
+df -ih /
+
+for value_name in MIN_GPU_COUNT MIN_FREE_DISK_GB MIN_AVAILABLE_MEMORY_GB; do
+  value="${!value_name}"
+  if ! [[ "${value}" =~ ^[0-9]+$ ]]; then
+    echo "::error::${value_name} must be an integer, got '${value}'"
+    exit 1
+  fi
+done
+
+echo "=== Docker health ==="
+docker info >/dev/null
+docker version
+
+echo "=== Host GPUs ==="
+nvidia-smi -L
+nvidia-smi
+
+mapfile -t gpu_names < <(nvidia-smi --query-gpu=name --format=csv,noheader)
+# An unset GPU_MODEL_PATTERN means "count every visible GPU", same as an empty one.
+gpu_model_pattern="${GPU_MODEL_PATTERN:-}"
+if [[ -n "${gpu_model_pattern}" ]]; then
+  set +e
+  gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic -- "${gpu_model_pattern}")
+  grep_status=$?
+  set -e
+  if (( grep_status == 2 )); then
+    echo "::error::invalid gpu_model_pattern regex: ${gpu_model_pattern}"
+    exit 1
+  fi
+  (( grep_status == 0 )) || gpu_count=0
+  echo "Visible GPUs matching '${gpu_model_pattern}': ${gpu_count}"
+else
+  gpu_count="${#gpu_names[@]}"
+  echo "Visible GPUs: ${gpu_count}"
+fi
+
+if (( gpu_count < MIN_GPU_COUNT )); then
+  echo "::error::visible GPU count ${gpu_count} is below required minimum ${MIN_GPU_COUNT}"
+  exit 1
+fi
+
+echo "=== Existing kind state ==="
+kind get clusters || true
+docker ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true
diff --git a/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh b/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh
new file mode 100644
index 000000000..6f01ba156
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+timeout 300s docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L
diff --git a/.github/actions/gpu-cluster-setup/validate-env.sh b/.github/actions/gpu-cluster-setup/validate-env.sh
new file mode 100644
index 000000000..697d077c2
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/validate-env.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then
+  echo "::error::KIND_CLUSTER_NAME environment variable must be set by the calling workflow"
+  exit 1
+fi
diff --git a/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh
new file mode 100644
index 000000000..4a0fcf5e3
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+KIND_NODE_IMAGE="${KIND_NODE_IMAGE:?KIND_NODE_IMAGE must be set}"
+MIN_FREE_DISK_GB="${MIN_FREE_DISK_GB:?MIN_FREE_DISK_GB must be set}"
+if ! [[ "${MIN_FREE_DISK_GB}" =~ ^[0-9]+$ ]]; then
+  echo "::error::MIN_FREE_DISK_GB must be an integer, got '${MIN_FREE_DISK_GB}'"
+  exit 1
+fi
+
+echo "=== Kind node image cache ==="
+if docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then  # inspect succeeds only when the image is already local
+  echo "Kind node image already cached: ${KIND_NODE_IMAGE}"
+else
+  echo "Pulling kind node image: ${KIND_NODE_IMAGE}"
+  timeout 600s docker pull "${KIND_NODE_IMAGE}"  # bounded pull; a failure or timeout aborts via set -e
+fi
+free_disk_bytes=$(df -B1 --output=avail / | tail -1 | tr -dc '0-9')  # available bytes on /, digits only
+min_free_disk_bytes=$((MIN_FREE_DISK_GB * 1024 * 1024 * 1024))  # threshold is interpreted as GiB
+free_disk_gib=$((free_disk_bytes / 1024 / 1024 / 1024))  # integer division, for display only
+if (( free_disk_bytes < min_free_disk_bytes )); then
+  echo "::error::free disk on / is ${free_disk_bytes} bytes (${free_disk_gib}GiB) after warming ${KIND_NODE_IMAGE}, need at least ${min_free_disk_bytes} bytes (${MIN_FREE_DISK_GB}GiB)"
+  exit 1
+fi
+echo "Runner disk remains sufficient after kind image warm-up: ${free_disk_gib}GiB (${free_disk_bytes} bytes)"
diff --git a/.github/actions/gpu-debug-diagnostics/action.yml b/.github/actions/gpu-debug-diagnostics/action.yml
new file mode 100644
index 000000000..42ee4e091
--- /dev/null
+++ b/.github/actions/gpu-debug-diagnostics/action.yml
@@ -0,0 +1,35 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'GPU Debug Diagnostics'
+description: 'Print bounded GPU CI diagnostics while the kind cluster is still present.'
+
+inputs:
+  cluster_name:
+    description: 'Kind cluster name'
+    required: true
+  mode:
+    description: 'Diagnostic mode: smoke, training, or inference'
+    required: false
+    default: 'smoke'
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Print GPU debug diagnostics
+      shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+        GPU_TEST_DIAGNOSTIC_MODE: ${{ inputs.mode }}
+      run: bash "${{ github.action_path }}/../../scripts/gpu-debug-diagnostics.sh"
diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/gpu-operator-install/action.yml
deleted file mode 100644
index e2bdb300c..000000000
--- a/.github/actions/gpu-operator-install/action.yml
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name: 'GPU Operator Install'
-description: 'Installs the GPU operator via standalone Helm chart or aicr bundle.'
-
-inputs:
-  method:
-    description: 'Installation method: helm (standalone chart) or bundle (aicr recipe+bundle)'
-    required: true
-  accelerator:
-    description: 'Accelerator type for recipe generation (bundle mode only, e.g. h100)'
-    required: false
-    default: ''
-  intent:
-    description: 'Intent for recipe generation (bundle mode only, e.g. inference, training)'
-    required: false
-    default: 'inference'
-  platform:
-    description: 'Platform for recipe generation (bundle mode only, e.g. dynamo)'
-    required: false
-    default: ''
-
-runs:
-  using: 'composite'
-  steps:
-
-    # --- Helm mode: standalone GPU operator chart ---
-
-    - name: Install GPU Operator (helm)
-      if: inputs.method == 'helm'
-      shell: bash
-      run: |
-        helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
-        helm repo update
-        helm upgrade -i \
-          --kube-context="kind-${KIND_CLUSTER_NAME}" \
-          --namespace gpu-operator \
-          --create-namespace \
-          --set driver.enabled=false \
-          --set toolkit.enabled=false \
-          --set dcgmExporter.enabled=false \
-          --set nfd.enabled=true \
-          --wait --timeout=600s \
-          gpu-operator nvidia/gpu-operator
-
-    - name: Wait for GPU operands (helm)
-      if: inputs.method == 'helm'
-      shell: bash
-      run: |
-        echo "Waiting for device plugin to be ready..."
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-          rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s || true
-        echo "GPU Operator pods:"
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
-
-    # --- Bundle mode: aicr recipe → bundle → deploy ---
-
-    - name: Generate recipe
-      if: inputs.method == 'bundle'
-      shell: bash
-      run: |
-        PLATFORM_FLAG=""
-        if [[ -n "${{ inputs.platform }}" ]]; then
-          PLATFORM_FLAG="--platform ${{ inputs.platform }}"
-        fi
-        ./aicr recipe \
-          --service kind \
-          --accelerator ${{ inputs.accelerator }} \
-          --os ubuntu \
-          --intent ${{ inputs.intent }} \
-          ${PLATFORM_FLAG} \
-          --output recipe.yaml
-        echo "--- Recipe ---"
-        cat recipe.yaml
-
-    - name: Generate deployment bundle
-      if: inputs.method == 'bundle'
-      shell: bash
-      run: |
-        ./aicr bundle \
-          --recipe recipe.yaml \
-          --accelerated-node-toleration nvidia.com/gpu:NoSchedule \
-          --output bundle
-        echo "--- Bundle contents ---"
-        ls -la bundle/
-
-    - name: Install bundle into cluster
-      if: inputs.method == 'bundle'
-      shell: bash
-      run: |
-        cd bundle
-        # Use --no-wait: several components (gpu-operator ClusterPolicy,
-        # kai-scheduler SchedulingShard, nvidia-dra-driver-gpu kubelet plugin)
-        # stay InProgress in kind because their CRs/DaemonSets require
-        # features not available in kind (DRA feature gates, driver modules).
-        # The explicit "Wait for GPU operands" step below gates on what
-        # actually matters (device plugin readiness).
-        # --best-effort: some components (e.g. network-operator) have Helm
-        # hooks that may time out in Kind; continue deploying remaining
-        # components so the overall stack is functional.
-        chmod +x deploy.sh
-        echo "--- deploy.sh ---"
-        cat deploy.sh
-        ./deploy.sh --no-wait --best-effort
-
-    - name: Wait for GPU operands (bundle)
-      if: inputs.method == 'bundle'
-      shell: bash
-      run: |
-        echo "Waiting for GPU operator controller to deploy operands..."
-        # The GPU operator controller watches ClusterPolicy and creates
-        # DaemonSets for device-plugin, NFD, GFD, etc. This happens
-        # asynchronously after the helm install completes.
-        for i in $(seq 1 30); do
-          count=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-            get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | wc -l)
-          if [[ "$count" -gt 0 ]]; then
-            echo "Device plugin DaemonSet found."
-            break
-          fi
-          echo "Waiting for device plugin DaemonSet to be created... (${i}/30)"
-          sleep 10
-        done
-        echo "Waiting for device plugin rollout..."
-        # Operands are excluded from control-plane nodes via nodeAffinity in
-        # the kind overlay, so all scheduled pods should become ready.
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-          rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s
-        echo "GPU Operator pods:"
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods
diff --git a/.github/actions/gpu-smoke-nvidia-smi/action.yml b/.github/actions/gpu-smoke-nvidia-smi/action.yml
new file mode 100644
index 000000000..cb61b5d0d
--- /dev/null
+++ b/.github/actions/gpu-smoke-nvidia-smi/action.yml
@@ -0,0 +1,36 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'GPU Smoke nvidia-smi'
+description: 'Run nvidia-smi in a GPU-backed kind pod and print its logs.'
+
+inputs:
+  cluster_name:
+    description: 'Kind cluster name'
+    required: true
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Run nvidia-smi in a pod
+      shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: bash "${{ github.workspace }}/.github/scripts/gpu-smoke-run-nvidia-smi.sh"
+    - name: Show nvidia-smi output
+      if: always()
+      shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: bash "${{ github.workspace }}/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh"
diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml
index e1ee3c14b..9f215a3e3 100644
--- a/.github/actions/gpu-snapshot-validate/action.yml
+++ b/.github/actions/gpu-snapshot-validate/action.yml
@@ -26,60 +26,36 @@ inputs:
   cluster_name:
     description: 'Kind cluster name (for kubectl context)'
     required: true
+  snapshot_timeout:
+    description: 'Timeout for aicr snapshot'
+    required: false
+    default: '5m'
 
 runs:
   using: composite
   steps:
+    - name: Build snapshot agent image
+      uses: ./.github/actions/aicr-build
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      with:
+        build_cli: 'false'
+        build_snapshot_agent: 'true'
+        validator_phases: 'none'
     - name: Run aicr snapshot
       shell: bash
-      run: |
-        ./aicr snapshot \
-          --kubeconfig="${HOME}/.kube/config" \
-          --namespace=default \
-          --image=ko.local:smoke-test \
-          --require-gpu \
-          --output=snapshot.yaml
-        echo "--- Snapshot output ---"
-        cat snapshot.yaml
-
+      env:
+        SNAPSHOT_TIMEOUT: ${{ inputs.snapshot_timeout }}
+      run: bash "${{ github.action_path }}/run-snapshot.sh"
     - name: Validate snapshot detected GPU
       shell: bash
-      run: |
-        # Query by subtype field (not index) — #502 added a "hardware" subtype before "smi".
-        GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu.model"]' snapshot.yaml)
-        GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml)
-        echo "GPU model: ${GPU_MODEL}"
-        echo "GPU count: ${GPU_COUNT}"
-        if [[ "${GPU_MODEL}" != *"${{ inputs.gpu_model }}"* ]]; then
-          echo "::error::Expected ${{ inputs.gpu_model }} GPU in snapshot, got: ${GPU_MODEL}"
-          exit 1
-        fi
-        if [[ "${GPU_COUNT}" -lt ${{ inputs.min_gpu_count }} ]]; then
-          echo "::error::Expected gpu-count >= ${{ inputs.min_gpu_count }}, got: ${GPU_COUNT}"
-          exit 1
-        fi
-        echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"
-
+      env:
+        EXPECTED_GPU_MODEL: ${{ inputs.gpu_model }}
+        MIN_GPU_COUNT: ${{ inputs.min_gpu_count }}
+      run: bash "${{ github.action_path }}/validate-snapshot-gpu.sh"
     - name: Debug snapshot Job
       if: failure()
       shell: bash
-      run: |
-        echo "=== Snapshot Job ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default get job aicr -o yaml || true
-        echo "=== Snapshot Pods ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          get pods -l app.kubernetes.io/name=aicr -o wide || true
-        echo "=== Snapshot Job describe ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default describe job aicr || true
-        echo "=== Snapshot Pod describe ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          describe pods -l app.kubernetes.io/name=aicr || true
-        echo "=== Snapshot current logs ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true
-        echo "=== Snapshot previous logs ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true
-        echo "=== Snapshot ConfigMap ==="
-        kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
-          get configmap aicr-snapshot -o yaml || true
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: bash "${{ github.action_path }}/debug-snapshot-job.sh"
diff --git a/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh b/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh
new file mode 100644
index 000000000..2e0f1547f
--- /dev/null
+++ b/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+kubectl_kind() {
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+echo "=== Snapshot Job ==="
+kubectl_kind -n default get job aicr -o yaml || true
+echo "=== Snapshot Pods ==="
+kubectl_kind -n default get pods -l app.kubernetes.io/name=aicr -o wide || true
+echo "=== Snapshot Job describe ==="
+kubectl_kind -n default describe job aicr || true
+echo "=== Snapshot Pod describe ==="
+kubectl_kind -n default describe pods -l app.kubernetes.io/name=aicr || true
+echo "=== Snapshot current logs ==="
+kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true
+echo "=== Snapshot previous logs ==="
+kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true
+echo "=== Snapshot ConfigMap ==="
+kubectl_kind -n default get configmap aicr-snapshot -o yaml || true
diff --git a/.github/actions/gpu-snapshot-validate/run-snapshot.sh b/.github/actions/gpu-snapshot-validate/run-snapshot.sh
new file mode 100644
index 000000000..e45b575ef
--- /dev/null
+++ b/.github/actions/gpu-snapshot-validate/run-snapshot.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Run `./aicr snapshot` with the kubeconfig at ${HOME}/.kube/config, requiring a
+# GPU, and write the result to snapshot.yaml in the current directory.
+# SNAPSHOT_TIMEOUT must be set by the caller; it is passed through as --timeout.
+snapshot_args=(
+  --kubeconfig="${HOME}/.kube/config"
+  --namespace=default
+  --image=ko.local:smoke-test
+  --require-gpu
+  --timeout="${SNAPSHOT_TIMEOUT}"
+  --output=snapshot.yaml
+)
+./aicr snapshot "${snapshot_args[@]}"
+
+# Echo the captured snapshot to the step output.
+echo "--- Snapshot output ---"
+cat snapshot.yaml
diff --git a/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh
new file mode 100644
index 000000000..5a27e6093
--- /dev/null
+++ b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Verify that snapshot.yaml (in the current directory) reports the expected GPU.
+#
+# Required environment:
+#   EXPECTED_GPU_MODEL  substring that must appear in the detected gpu.model
+#   MIN_GPU_COUNT       minimum acceptable gpu-count (non-negative integer)
+
+# Reject a non-numeric threshold up front: the comparison below is arithmetic
+# and would otherwise evaluate arbitrary text as an expression.
+if ! [[ "${MIN_GPU_COUNT}" =~ ^[0-9]+$ ]]; then
+  echo "::error::MIN_GPU_COUNT must be a non-negative integer, got: ${MIN_GPU_COUNT}"
+  exit 1
+fi
+
+# Query by subtype field (not index) — #502 added a "hardware" subtype before "smi".
+GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu.model"]' snapshot.yaml)
+GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml)
+echo "GPU model: ${GPU_MODEL}"
+echo "GPU count: ${GPU_COUNT}"
+if ! [[ "${GPU_COUNT}" =~ ^[0-9]+$ ]]; then
+  echo "::error::Expected numeric gpu-count in snapshot, got: ${GPU_COUNT}"
+  exit 1
+fi
+if [[ "${GPU_MODEL}" != *"${EXPECTED_GPU_MODEL}"* ]]; then
+  echo "::error::Expected ${EXPECTED_GPU_MODEL} GPU in snapshot, got: ${GPU_MODEL}"
+  exit 1
+fi
+# Force base 10 so zero-padded values such as "08" are not rejected as octal.
+if (( 10#${GPU_COUNT} < 10#${MIN_GPU_COUNT} )); then
+  echo "::error::Expected gpu-count >= ${MIN_GPU_COUNT}, got: ${GPU_COUNT}"
+  exit 1
+fi
+echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"
diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml
index 30ac7831f..e58588b1a 100644
--- a/.github/actions/gpu-test-cleanup/action.yml
+++ b/.github/actions/gpu-test-cleanup/action.yml
@@ -23,48 +23,74 @@ inputs:
     description: 'Prefix for the uploaded artifact name'
     required: false
     default: 'gpu-test-debug'
+  collect_artifacts:
+    description: 'Collect and upload debug artifacts before deleting the kind cluster'
+    required: false
+    default: 'false'
+  collect_node_runtime_artifacts:
+    description: 'Collect expensive kind node runtime artifacts such as journalctl, crictl, and kind export logs'
+    required: false
+    default: 'false'
+  diagnostic_mode:
+    description: 'Optional gpu-debug-diagnostics mode to run when collect_artifacts is true'
+    required: false
+    default: ''
+  upload_validation_artifacts:
+    description: 'Upload validation result and evidence artifacts before cleanup'
+    required: false
+    default: 'false'
+  validation_artifact_name:
+    description: 'Name for uploaded validation artifacts'
+    required: false
+    default: 'conformance-evidence'
+  validation_artifact_paths:
+    description: 'Newline-separated validation artifact paths'
+    required: false
+    default: |
+      conformance-evidence/
+      validation-result.yaml
 
 runs:
   using: 'composite'
   steps:
+    - name: Debug diagnostics
+      if: inputs.collect_artifacts == 'true' && inputs.diagnostic_mode != ''
+      uses: ./.github/actions/gpu-debug-diagnostics
+      with:
+        cluster_name: ${{ inputs.cluster_name }}
+        mode: ${{ inputs.diagnostic_mode }}
+    - name: Upload validation artifacts
+      if: always() && inputs.upload_validation_artifacts == 'true'
+      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
+      with:
+        name: ${{ inputs.validation_artifact_name }}
+        path: ${{ inputs.validation_artifact_paths }}
+        if-no-files-found: warn
     - name: Collect debug artifacts
-      if: failure()
+      if: always() && inputs.collect_artifacts == 'true'
       shell: bash
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
-      run: |
-        mkdir -p /tmp/debug-artifacts
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true
-
+        COLLECT_NODE_RUNTIME_ARTIFACTS: ${{ inputs.collect_node_runtime_artifacts }}
+      run: bash "${{ github.action_path }}/collect-debug-artifacts.sh"
     - name: Export kind logs
-      if: failure()
+      if: always() && inputs.collect_artifacts == 'true' && inputs.collect_node_runtime_artifacts == 'true'
       shell: bash
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
-      run: |
-        mkdir -p /tmp/kind-logs
-        kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}" || true
-
+      run: bash "${{ github.action_path }}/export-kind-logs.sh"
+    - name: Cleanup
+      if: always()
+      shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: bash "${{ github.action_path }}/cleanup-kind-cluster.sh"
     - name: Upload debug artifacts
-      if: failure()
-      uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f  # v6.0.0
+      if: always() && inputs.collect_artifacts == 'true'
+      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
       with:
         name: ${{ inputs.artifact_name_prefix }}-${{ github.run_id }}
         path: |
           /tmp/debug-artifacts/
           /tmp/kind-logs/
         retention-days: 7
-
-    - name: Cleanup
-      if: always()
-      shell: bash
-      env:
-        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
-      run: |
-        kind delete cluster --name "${KIND_CLUSTER_NAME}" || true
-        docker system prune -f || true
diff --git a/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh b/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh
new file mode 100644
index 000000000..134aa2589
--- /dev/null
+++ b/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Delete the kind cluster named by KIND_CLUSTER_NAME, then force-remove any
+# containers still carrying that cluster's kind label. Each external command is
+# time-bounded and best-effort, so a failure here never fails the step.
+
+# Run `docker "$@"` under timeout(1); $1 is the time limit (e.g. 30s).
+docker_timeout() {
+  local limit="$1"
+  shift
+  timeout "${limit}" docker "$@"
+}
+
+# Print the IDs of all containers (running or stopped) labelled for this cluster.
+list_kind_containers() {
+  docker_timeout 30s ps -aq --filter "label=${kind_cluster_label}" || true
+}
+
+timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true
+
+kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
+mapfile -t remaining_containers < <(list_kind_containers)
+if (( ${#remaining_containers[@]} > 0 )); then
+  echo "Removing leftover kind containers for ${KIND_CLUSTER_NAME}:"
+  docker_timeout 30s ps -a --filter "label=${kind_cluster_label}" || true
+  # -v also removes any anonymous volumes attached to the removed containers, so
+  # leftovers do not keep consuming disk on the runner.
+  docker_timeout 30s rm -f -v "${remaining_containers[@]}" || true
+  mapfile -t remaining_containers < <(list_kind_containers)
+  if (( ${#remaining_containers[@]} > 0 )); then
+    echo "::warning::leftover kind containers still present for ${KIND_CLUSTER_NAME}: ${remaining_containers[*]}"
+  fi
+fi
diff --git a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
new file mode 100644
index 000000000..a77744645
--- /dev/null
+++ b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh
@@ -0,0 +1,179 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Diagnostic artifact collection intentionally omits -e so one broken cluster
+# call does not prevent later artifacts from being collected.
+set -uo pipefail
+rm -rf /tmp/debug-artifacts /tmp/kind-logs
+mkdir -p /tmp/debug-artifacts
+mkdir -p /tmp/kind-logs
+CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd"
+MAX_KIND_NODE_ARTIFACT_SECONDS="${MAX_KIND_NODE_ARTIFACT_SECONDS:-600}"
+COLLECT_NODE_RUNTIME_ARTIFACTS="${COLLECT_NODE_RUNTIME_ARTIFACTS:-false}"
+if ! [[ "${MAX_KIND_NODE_ARTIFACT_SECONDS}" =~ ^[0-9]+$ ]]; then
+  echo "::warning::MAX_KIND_NODE_ARTIFACT_SECONDS must be an integer; got '${MAX_KIND_NODE_ARTIFACT_SECONDS}', defaulting to 600" >&2
+  MAX_KIND_NODE_ARTIFACT_SECONDS=600
+fi
+command_timeout() {  # Run "$@" under timeout(1); $1 is the time limit (e.g. 30s).
+  local limit="$1"
+  shift
+  timeout "${limit}" "$@"
+}
+kubectl_kind() {  # kubectl on context kind-${KIND_CLUSTER_NAME}: 10s request timeout, 30s cap on the whole call.
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+docker_timeout() {  # Run `docker "$@"` under timeout(1); $1 is the time limit (e.g. 30s).
+  local limit="$1"
+  shift
+  timeout "${limit}" docker "$@"
+}
+
+{
+  date -u || true
+  hostname || true
+  uptime || true
+  nproc || true
+  free -h || true
+  df -h / || true
+  df -ih / || true
+} > /tmp/debug-artifacts/runner-baseline.txt 2>&1 || true
+docker_timeout 30s version > /tmp/debug-artifacts/docker-version.txt 2>&1 || true
+docker_timeout 30s info > /tmp/debug-artifacts/docker-info.txt 2>&1 || true
+command_timeout 30s nvidia-smi -L > /tmp/debug-artifacts/host-gpus.txt 2>&1 || true
+command_timeout 30s nvidia-smi >> /tmp/debug-artifacts/host-gpus.txt 2>&1 || true
+command_timeout 30s kind get clusters > /tmp/debug-artifacts/kind-clusters.txt 2>&1 || true
+docker_timeout 30s ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+  > /tmp/debug-artifacts/kind-node-containers.txt 2>&1 || true
+
+# Cluster-wide state. stderr is captured with stdout so a failed query leaves its error in the artifact.
+kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt 2>&1 || true
+kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt 2>&1 || true
+kubectl_kind get --raw='/livez?verbose' > /tmp/debug-artifacts/apiserver-livez.txt 2>&1 || true
+kubectl_kind get --raw='/readyz?verbose' > /tmp/debug-artifacts/apiserver-readyz.txt 2>&1 || true
+# Control plane: pods and events, then per-component describe, current/previous logs, and lease.
+kubectl_kind -n kube-system get pods -l tier=control-plane -o wide > /tmp/debug-artifacts/control-plane-pods.txt 2>&1 || true
+kubectl_kind -n kube-system get events --sort-by='.lastTimestamp' > /tmp/debug-artifacts/kube-system-events.txt 2>&1 || true
+for component in ${CONTROL_PLANE_COMPONENTS}; do
+  kubectl_kind -n kube-system describe pod -l "component=${component}" \
+    > "/tmp/debug-artifacts/${component}-describe.txt" 2>&1 || true
+  kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --tail=300 \
+    > "/tmp/debug-artifacts/${component}-logs.txt" 2>&1 || true
+  kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --previous --tail=300 \
+    > "/tmp/debug-artifacts/${component}-previous-logs.txt" 2>&1 || true
+  kubectl_kind -n kube-system get lease "${component}" -o yaml \
+    > "/tmp/debug-artifacts/${component}-lease.yaml" 2>&1 || true
+done
+# GPU operator pods and logs, followed by the monitoring namespace workloads.
+kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt 2>&1 || true
+kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt 2>&1 || true
+kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt 2>&1 || true
+kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide > /tmp/debug-artifacts/monitoring-workloads.txt 2>&1 || true
+kubectl_kind -n monitoring describe deployment kube-prometheus-operator \
+  > /tmp/debug-artifacts/kube-prometheus-operator-deployment-describe.txt 2>&1 || true
+kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=300 \
+  > /tmp/debug-artifacts/kube-prometheus-operator-logs.txt 2>&1 || true
+kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=300 \
+  > /tmp/debug-artifacts/kube-prometheus-operator-previous-logs.txt 2>&1 || true
+kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' > /tmp/debug-artifacts/monitoring-events.txt 2>&1 || true
+{
+  kubectl_kind -n monitoring get pods -o name 2>/dev/null \
+    | grep '^pod/kube-prometheus-operator-' \
+    | while read -r pod; do
+        echo "=== ${pod} ==="
+        kubectl_kind -n monitoring describe "${pod}" 2>&1 || true
+      done
+} > /tmp/debug-artifacts/kube-prometheus-operator-pods-describe.txt 2>&1 || true
+# Pods in any namespace whose phase is neither Running nor Succeeded.
+kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt 2>&1 || true
+tar_inputs=()
+[[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml)
+[[ -d bundle ]] && tar_inputs+=(bundle)
+if [[ "${#tar_inputs[@]}" -gt 0 ]]; then
+  echo "Archiving runtime bundle inputs: ${tar_inputs[*]}"
+  tar -czf /tmp/debug-artifacts/aicr-runtime-bundle.tar.gz "${tar_inputs[@]}" || true
+else
+  echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive"
+fi
+
+case "${COLLECT_NODE_RUNTIME_ARTIFACTS}" in
+  true)
+    artifact_loop_start="$(date +%s)"
+    docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+      --format '{{.Names}}' | sort | while read -r node_container; do
+        [[ -z "${node_container}" ]] && continue
+        artifact_loop_elapsed=$(($(date +%s) - artifact_loop_start))
+        if (( artifact_loop_elapsed > MAX_KIND_NODE_ARTIFACT_SECONDS )); then
+          echo "Kind node artifact collection exceeded ${MAX_KIND_NODE_ARTIFACT_SECONDS}s; stopping after partial collection."
+          break
+        fi
+        node_file="${node_container//[^A-Za-z0-9_.-]/_}"
+        docker_timeout 30s inspect "${node_container}" \
+          > "/tmp/debug-artifacts/${node_file}-docker-inspect.json" 2>&1 || true
+        docker_timeout 30s exec "${node_container}" journalctl -u kubelet \
+          --since "90 minutes ago" --no-pager \
+          > "/tmp/debug-artifacts/${node_file}-kubelet-journal.txt" 2>&1 || true
+        docker_timeout 30s exec "${node_container}" journalctl -u containerd \
+          --since "90 minutes ago" --no-pager \
+          > "/tmp/debug-artifacts/${node_file}-containerd-journal.txt" 2>&1 || true
+        docker_timeout 30s exec "${node_container}" crictl ps -a \
+          > "/tmp/debug-artifacts/${node_file}-crictl-ps-a.txt" 2>&1 || true
+        docker_timeout 30s exec "${node_container}" crictl pods \
+          > "/tmp/debug-artifacts/${node_file}-crictl-pods.txt" 2>&1 || true
+        docker_timeout 30s exec "${node_container}" crictl stats \
+          > "/tmp/debug-artifacts/${node_file}-crictl-stats.txt" 2>&1 || true
+        docker_timeout 30s exec "${node_container}" sh -c '
+          date
+          uptime || true
+          free -h || true
+          df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
+          echo "--- top cpu/memory processes ---"
+          ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true
+        ' > "/tmp/debug-artifacts/${node_file}-node-pressure.txt" 2>&1 || true
+        # shellcheck disable=SC2016 # Expanded inside the kind node shell.
+        docker_timeout 120s exec "${node_container}" sh -c '
+          for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do
+            echo "=== ${component} static pod manifest ==="
+            sed -n "1,220p" "/etc/kubernetes/manifests/${component}.yaml" 2>/dev/null || true
+            echo "=== ${component} CRI containers ==="
+            crictl ps -a --name "${component}" || true
+            count=0
+            for container_id in $(crictl ps -a --name "${component}" -q 2>/dev/null); do
+              count=$((count + 1))
+              if [ "${count}" -gt 8 ]; then
+                echo "Skipping remaining ${component} CRI containers after first 8 entries."
+                break
+              fi
+              echo "=== crictl inspect ${component} ${container_id} ==="
+              crictl inspect "${container_id}" || true
+              echo "=== crictl logs ${component} ${container_id} ==="
+              crictl logs --tail=300 "${container_id}" || true
+            done
+          done
+        ' > "/tmp/debug-artifacts/${node_file}-control-plane-cri.txt" 2>&1 || true
+      done || true
+    ;;
+  ""|false)
+    echo "Skipped kind node runtime artifacts. Set collect_node_runtime_artifacts=true to collect journalctl, crictl, and kind export logs." \
+      > /tmp/debug-artifacts/node-runtime-artifacts-skipped.txt
+    echo "Skipped kind log export. Set collect_node_runtime_artifacts=true to export kind logs." \
+      > /tmp/kind-logs/kind-logs-skipped.txt
+    ;;
+  *)
+    echo "Unknown COLLECT_NODE_RUNTIME_ARTIFACTS=${COLLECT_NODE_RUNTIME_ARTIFACTS}; skipping kind node runtime artifacts." \
+      > /tmp/debug-artifacts/node-runtime-artifacts-skipped.txt
+    echo "Unknown COLLECT_NODE_RUNTIME_ARTIFACTS=${COLLECT_NODE_RUNTIME_ARTIFACTS}; skipping kind log export." \
+      > /tmp/kind-logs/kind-logs-skipped.txt
+    ;;
+esac
diff --git a/.github/actions/gpu-test-cleanup/export-kind-logs.sh b/.github/actions/gpu-test-cleanup/export-kind-logs.sh
new file mode 100644
index 000000000..a46624f60
--- /dev/null
+++ b/.github/actions/gpu-test-cleanup/export-kind-logs.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Export the logs of the kind cluster named by KIND_CLUSTER_NAME into
+# /tmp/kind-logs. The export is capped at 300s; a failure or timeout only emits
+# a workflow warning on stderr, so the script still exits successfully.
+readonly KIND_LOG_DIR=/tmp/kind-logs
+
+mkdir -p "${KIND_LOG_DIR}"
+timeout 300s kind export logs "${KIND_LOG_DIR}" --name "${KIND_CLUSTER_NAME}" \
+  || echo "::warning::kind log export failed or timed out for ${KIND_CLUSTER_NAME}; continuing cleanup" >&2
diff --git a/.github/actions/gpu-validate-conformance/action.yml b/.github/actions/gpu-validate-conformance/action.yml
new file mode 100644
index 000000000..bde5238a9
--- /dev/null
+++ b/.github/actions/gpu-validate-conformance/action.yml
@@ -0,0 +1,57 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'GPU Validate Conformance'
+description: 'Run CNCF AI Conformance validation for a GPU Kind test cluster.'
+
+inputs:
+  cluster_name:
+    description: 'Kind cluster name'
+    required: true
+  kwok_helm_timeout:
+    description: 'Timeout for KWOK controller Helm install'
+    required: false
+    default: '300s'
+  ko_build_timeout:
+    description: 'Timeout for Karpenter KWOK provider ko build'
+    required: false
+    default: '900s'
+  karpenter_helm_timeout:
+    description: 'Timeout for Karpenter Helm install'
+    required: false
+    default: '300s'
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Install Karpenter + KWOK  # forwards cluster_name and the three timeout inputs unchanged
+      uses: ./.github/actions/install-karpenter-kwok
+      with:
+        cluster_name: ${{ inputs.cluster_name }}
+        kwok_helm_timeout: ${{ inputs.kwok_helm_timeout }}
+        ko_build_timeout: ${{ inputs.ko_build_timeout }}
+        karpenter_helm_timeout: ${{ inputs.karpenter_helm_timeout }}
+    - name: Build conformance validator image  # CLI and snapshot-agent builds are switched off; only the conformance phase is requested
+      uses: ./.github/actions/aicr-build
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      with:
+        build_cli: 'false'
+        build_snapshot_agent: 'false'
+        validator_phases: 'conformance'
+    - name: Validate CNCF AI Conformance  # runs the repository script from the checked-out workspace
+      shell: bash
+      env:
+        KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: bash "${{ github.workspace }}/.github/scripts/gpu-validate-conformance.sh"
diff --git a/.github/actions/gpu-workflow-prepare/action.yml b/.github/actions/gpu-workflow-prepare/action.yml
new file mode 100644
index 000000000..1faf4496d
--- /dev/null
+++ b/.github/actions/gpu-workflow-prepare/action.yml
@@ -0,0 +1,47 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'GPU Workflow Prepare'
+description: 'Print early runner diagnostics and load GPU workflow tool/image versions.'
+
+outputs:
+  chainsaw:
+    description: 'Chainsaw version'
+    value: ${{ steps.versions.outputs.chainsaw }}
+  chainsaw_sha256_linux_amd64:
+    description: 'Chainsaw SHA256 checksum for linux/amd64'
+    value: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }}
+  h100_kind_node_image:
+    description: 'Kind node image for H100 GPU tests'
+    value: ${{ steps.versions.outputs.h100_kind_node_image }}
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Runner preflight snapshot  # prints host, kernel, uptime, load, CPU count, memory, and root-disk usage in one log group
+      shell: bash
+      run: |
+        echo "::group::Runner preflight snapshot"
+        echo "hostname:     $(hostname)"
+        echo "kernel:       $(uname -a)"
+        echo "uptime:       $(uptime)"
+        echo "loadavg:      $(cat /proc/loadavg 2>/dev/null || echo unavailable)"
+        echo "nproc:        $(nproc 2>/dev/null || echo unavailable)"
+        free -h 2>/dev/null || true
+        df -h / 2>/dev/null || true
+        echo "::endgroup::"
+
+    - name: Load GPU test versions  # step id "versions" is the source of this action's declared outputs
+      id: versions
+      uses: ./.github/actions/load-versions
diff --git a/.github/actions/install-karpenter-kwok/action.yml b/.github/actions/install-karpenter-kwok/action.yml
index fde7bddde..d3570a43b 100644
--- a/.github/actions/install-karpenter-kwok/action.yml
+++ b/.github/actions/install-karpenter-kwok/action.yml
@@ -19,6 +19,18 @@ inputs:
   cluster_name:
     description: 'Kind cluster name (used for kubectl context)'
     required: true
+  kwok_helm_timeout:
+    description: 'Timeout for KWOK controller Helm install'
+    required: false
+    default: '300s'
+  ko_build_timeout:
+    description: 'Timeout for Karpenter KWOK provider ko build'
+    required: false
+    default: '900s'
+  karpenter_helm_timeout:
+    description: 'Timeout for Karpenter Helm install'
+    required: false
+    default: '300s'
 
 runs:
   using: 'composite'
@@ -26,9 +38,12 @@ runs:
     - name: Resolve versions
       id: versions
       shell: bash
-      run: |
-        echo "karpenter=$(yq eval '.testing_tools.karpenter' .settings.yaml)" >> "$GITHUB_OUTPUT"
-        echo "go=$(go env GOVERSION)" >> "$GITHUB_OUTPUT"
+      run: bash "${{ github.action_path }}/resolve-versions.sh"
+    - name: Install ko
+      uses: ./.github/actions/setup-build-tools
+      with:
+        install_ko: 'true'
+        ko_version: ${{ steps.versions.outputs.ko }}
 
     - name: Cache Karpenter Go build cache
       uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684  # v4.2.3
@@ -46,7 +61,7 @@ runs:
       env:
         KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
         KARPENTER_VERSION: ${{ steps.versions.outputs.karpenter }}
-      run: |
-        set -euo pipefail
-        bash kwok/scripts/install-karpenter-kwok.sh
-        kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f kwok/manifests/karpenter/nodepool.yaml
+        KWOK_HELM_TIMEOUT: ${{ inputs.kwok_helm_timeout }}
+        KO_BUILD_TIMEOUT: ${{ inputs.ko_build_timeout }}
+        KARPENTER_HELM_TIMEOUT: ${{ inputs.karpenter_helm_timeout }}
+      run: bash "${{ github.action_path }}/install-karpenter-kwok.sh"
diff --git a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
new file mode 100644
index 000000000..0ec6480d1
--- /dev/null
+++ b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Validate the timeout inputs, run kwok/scripts/install-karpenter-kwok.sh, and
+# apply kwok/manifests/karpenter/nodepool.yaml to the kind cluster.
+#
+# Environment:
+#   KIND_CLUSTER_NAME       required; used to derive the default kube context
+#   KUBE_CONTEXT            optional; defaults to kind-${KIND_CLUSTER_NAME}
+#   KWOK_HELM_TIMEOUT, KO_BUILD_TIMEOUT, KARPENTER_HELM_TIMEOUT
+#                           required durations such as 300s, 10m, or 1h
+# NOTE(review): the timeouts are only validated here; presumably the kwok
+# install script reads them from the environment — confirm.
+
+if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then
+  echo "::error::KIND_CLUSTER_NAME is required"
+  exit 1
+fi
+KUBE_CONTEXT="${KUBE_CONTEXT:-kind-${KIND_CLUSTER_NAME}}"
+
+# Exit with a workflow error unless $2 is an integer followed by s, m, or h.
+# $1 is the input name shown in the error message.
+validate_duration_input() {
+  local input_name="$1"
+  local input_value="$2"
+
+  if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then
+    echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'"
+    exit 1
+  fi
+}
+
+# ${VAR:-} keeps `set -u` from aborting first, so an unset variable is reported
+# through the same ::error:: message as a malformed one.
+validate_duration_input kwok_helm_timeout "${KWOK_HELM_TIMEOUT:-}"
+validate_duration_input ko_build_timeout "${KO_BUILD_TIMEOUT:-}"
+validate_duration_input karpenter_helm_timeout "${KARPENTER_HELM_TIMEOUT:-}"
+bash kwok/scripts/install-karpenter-kwok.sh
+timeout 30s kubectl --request-timeout=10s \
+  --context="${KUBE_CONTEXT}" \
+  apply -f kwok/manifests/karpenter/nodepool.yaml
diff --git a/.github/actions/install-karpenter-kwok/resolve-versions.sh b/.github/actions/install-karpenter-kwok/resolve-versions.sh
new file mode 100644
index 000000000..6aeb173a7
--- /dev/null
+++ b/.github/actions/install-karpenter-kwok/resolve-versions.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Read tool versions from .settings.yaml and publish them as the step outputs
+# karpenter, ko, and go (the latter prefixed with "go") via $GITHUB_OUTPUT.
+
+# Print the value at yq path $1 in .settings.yaml; return 1 if yq fails or the key is unset.
+read_setting() {
+  local key_path="$1"
+  local value
+  # Explicit check: errexit is not relied on inside a command substitution.
+  value="$(yq eval "${key_path}" .settings.yaml)" || return 1
+  if [[ -z "${value}" || "${value}" == "null" ]]; then
+    echo "::error::${key_path} is not set in .settings.yaml" >&2
+    return 1
+  fi
+  printf '%s\n' "${value}"
+}
+
+# Resolve everything before writing: a command substitution inside an `echo`
+# argument does not trip `set -e`, which would publish empty outputs.
+karpenter_version="$(read_setting '.testing_tools.karpenter')"
+ko_version="$(read_setting '.build_tools.ko')"
+go_version="$(read_setting '.languages.go')"
+
+{
+  echo "karpenter=${karpenter_version}"
+  echo "ko=${ko_version}"
+  echo "go=go${go_version}"
+} >> "$GITHUB_OUTPUT"
diff --git a/.github/actions/load-versions/action.yml b/.github/actions/load-versions/action.yml
index b87e321d1..b3c506d40 100644
--- a/.github/actions/load-versions/action.yml
+++ b/.github/actions/load-versions/action.yml
@@ -40,6 +40,9 @@ outputs:
   kind:
     description: 'Kind version'
     value: ${{ steps.versions.outputs.kind }}
+  nvkind:
+    description: 'nvkind git ref'
+    value: ${{ steps.versions.outputs.nvkind }}
   ctlptl:
     description: 'ctlptl version'
     value: ${{ steps.versions.outputs.ctlptl }}
@@ -91,6 +94,9 @@ outputs:
   kind_node_image:
     description: 'Kind node image for testing'
     value: ${{ steps.versions.outputs.kind_node_image }}
+  h100_kind_node_image:
+    description: 'Kind node image for H100 GPU tests'
+    value: ${{ steps.versions.outputs.h100_kind_node_image }}
 
 runs:
   using: 'composite'
@@ -121,6 +127,7 @@ runs:
         # Testing tools
         echo "kubectl=$(yq eval '.testing_tools.kubectl' .settings.yaml)" >> $GITHUB_OUTPUT
         echo "kind=$(yq eval '.testing_tools.kind' .settings.yaml)" >> $GITHUB_OUTPUT
+        echo "nvkind=$(yq eval '.testing_tools.nvkind' .settings.yaml)" >> $GITHUB_OUTPUT
         echo "ctlptl=$(yq eval '.testing_tools.ctlptl' .settings.yaml)" >> $GITHUB_OUTPUT
         echo "tilt=$(yq eval '.testing_tools.tilt' .settings.yaml)" >> $GITHUB_OUTPUT
         echo "helm=$(yq eval '.testing_tools.helm' .settings.yaml)" >> $GITHUB_OUTPUT
@@ -141,6 +148,7 @@ runs:
 
         # Testing configuration
         echo "kind_node_image=$(yq eval '.testing.kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT
+        echo "h100_kind_node_image=$(yq eval '.testing.h100_kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT
 
     - name: Display loaded versions
       shell: bash
@@ -158,6 +166,7 @@ runs:
         echo "  grype: ${{ steps.versions.outputs.grype }}"
         echo "  kubectl: ${{ steps.versions.outputs.kubectl }}"
         echo "  kind: ${{ steps.versions.outputs.kind }}"
+        echo "  nvkind: ${{ steps.versions.outputs.nvkind }}"
         echo "  ctlptl: ${{ steps.versions.outputs.ctlptl }}"
         echo "  tilt: ${{ steps.versions.outputs.tilt }}"
         echo "  helm: ${{ steps.versions.outputs.helm }}"
@@ -172,3 +181,4 @@ runs:
         echo "  lint_timeout: ${{ steps.versions.outputs.lint_timeout }}"
         echo "  test_timeout: ${{ steps.versions.outputs.test_timeout }}"
         echo "  kind_node_image: ${{ steps.versions.outputs.kind_node_image }}"
+        echo "  h100_kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}"
diff --git a/.github/actions/runtime-install/action.yml b/.github/actions/runtime-install/action.yml
new file mode 100644
index 000000000..1adfea364
--- /dev/null
+++ b/.github/actions/runtime-install/action.yml
@@ -0,0 +1,104 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'Runtime Install'
+description: 'Installs the standalone GPU operator for smoke tests or the full AICR runtime bundle.'
+
+inputs:
+  method:
+    description: 'Installation method: helm (standalone chart) or bundle (aicr recipe+bundle)'
+    required: true
+  accelerator:
+    description: 'Accelerator type for recipe generation (required in bundle mode, unused in helm mode; e.g. h100)'
+    required: false
+    default: ''
+  intent:
+    description: 'Intent for recipe generation (bundle mode only, e.g. inference, training)'
+    required: false
+    default: 'inference'
+  platform:
+    description: 'Platform for recipe generation (bundle mode only, e.g. dynamo)'
+    required: false
+    default: ''
+  wait:
+    description: 'Wait for bundle Helm resources during deploy (bundle mode only; true or false)'
+    required: false
+    default: 'false'
+  best_effort:
+    description: 'Continue deploying remaining bundle components after a component failure (bundle mode only; true or false)'
+    required: false
+    default: 'true'
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Validate installation method
+      shell: bash
+      env:
+        RUNTIME_INSTALL_METHOD: ${{ inputs.method }}
+      run: |
+        case "${RUNTIME_INSTALL_METHOD}" in
+          helm|bundle) ;;
+          *)
+            echo "::error::unsupported runtime install method: ${RUNTIME_INSTALL_METHOD}"
+            exit 1
+            ;;
+        esac
+
+    # --- Helm mode: standalone GPU operator chart ---
+
+    - name: Install GPU Operator (helm)
+      if: inputs.method == 'helm'
+      shell: bash
+      run: bash "${{ github.action_path }}/install-gpu-operator-helm.sh"
+    - name: Wait for GPU operands (helm)
+      if: inputs.method == 'helm'
+      shell: bash
+      run: bash "${{ github.action_path }}/wait-gpu-operands-helm.sh"
+    # --- Bundle mode: aicr recipe → bundle → deploy ---
+
+    - name: Validate bundle inputs
+      if: inputs.method == 'bundle'
+      shell: bash
+      env:
+        AICR_ACCELERATOR: ${{ inputs.accelerator }}
+      run: |
+        if [[ -z "${AICR_ACCELERATOR}" ]]; then
+          echo "::error::inputs.accelerator is required when inputs.method is 'bundle'"
+          exit 1
+        fi
+
+    - name: Generate recipe
+      if: inputs.method == 'bundle'
+      shell: bash
+      env:
+        AICR_ACCELERATOR: ${{ inputs.accelerator }}
+        AICR_INTENT: ${{ inputs.intent }}
+        AICR_PLATFORM: ${{ inputs.platform }}
+      run: bash "${{ github.action_path }}/generate-recipe.sh"
+    - name: Generate deployment bundle
+      if: inputs.method == 'bundle'
+      shell: bash
+      run: bash "${{ github.action_path }}/generate-bundle.sh"
+    - name: Install bundle into cluster
+      if: inputs.method == 'bundle'
+      shell: bash
+      env:
+        AICR_DEPLOY_WAIT: ${{ inputs.wait }}
+        AICR_DEPLOY_BEST_EFFORT: ${{ inputs.best_effort }}
+      run: bash "${{ github.action_path }}/install-bundle.sh"
+    - name: Wait for GPU operands (bundle)
+      if: inputs.method == 'bundle'
+      shell: bash
+      run: bash "${{ github.action_path }}/wait-gpu-operands-bundle.sh"
diff --git a/.github/actions/runtime-install/generate-bundle.sh b/.github/actions/runtime-install/generate-bundle.sh
new file mode 100644
index 000000000..8e3f8436d
--- /dev/null
+++ b/.github/actions/runtime-install/generate-bundle.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Regenerate the deployment bundle from recipe.yaml into a fresh ./bundle
+# directory, then list what was produced.
+bundle_dir="bundle"
+rm -rf "${bundle_dir}"
+
+./aicr bundle \
+  --recipe recipe.yaml \
+  --accelerated-node-toleration nvidia.com/gpu:NoSchedule \
+  --output "${bundle_dir}"
+echo "--- Bundle contents ---"
+ls -la "${bundle_dir}/"
diff --git a/.github/actions/runtime-install/generate-recipe.sh b/.github/actions/runtime-install/generate-recipe.sh
new file mode 100644
index 000000000..b3555ef78
--- /dev/null
+++ b/.github/actions/runtime-install/generate-recipe.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# AICR_ACCELERATOR must be non-empty and AICR_INTENT must be set; AICR_PLATFORM is optional.
+RECIPE_ARGS=(
+  --service kind
+  --accelerator "${AICR_ACCELERATOR:?AICR_ACCELERATOR must be set}"
+  --os ubuntu
+  --intent "${AICR_INTENT?AICR_INTENT must be set}"
+)
+if [[ -n "${AICR_PLATFORM:-}" ]]; then
+  RECIPE_ARGS+=(--platform "${AICR_PLATFORM}")
+fi
+./aicr recipe "${RECIPE_ARGS[@]}" --output recipe.yaml
+echo "Recipe written to recipe.yaml"
diff --git a/.github/actions/runtime-install/install-bundle.sh b/.github/actions/runtime-install/install-bundle.sh
new file mode 100644
index 000000000..1068cddaa
--- /dev/null
+++ b/.github/actions/runtime-install/install-bundle.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# The default keeps legacy bundle-mode behavior: do not wait on every
+# Helm resource and keep deploying after component failures. H100
+# qualification jobs override these inputs to hard-fail and wait.
+AICR_DEPLOY_WAIT="${AICR_DEPLOY_WAIT:-false}"
+AICR_DEPLOY_BEST_EFFORT="${AICR_DEPLOY_BEST_EFFORT:-true}"
+# Reject anything other than true/false before touching the bundle.
+for deploy_flag_name in AICR_DEPLOY_WAIT AICR_DEPLOY_BEST_EFFORT; do
+  if [[ "${!deploy_flag_name}" != "true" && "${!deploy_flag_name}" != "false" ]]; then
+    echo "::error::${deploy_flag_name} must be true or false, got '${!deploy_flag_name}'"
+    exit 1
+  fi
+done
+
+cd bundle
+chmod +x deploy.sh
+DEPLOY_ARGS=()
+if [[ "${AICR_DEPLOY_WAIT}" != "true" ]]; then
+  DEPLOY_ARGS+=(--no-wait)
+fi
+if [[ "${AICR_DEPLOY_BEST_EFFORT}" == "true" ]]; then
+  DEPLOY_ARGS+=(--best-effort)
+fi
+if [[ "${#DEPLOY_ARGS[@]}" -gt 0 ]]; then
+  echo "Deploying bundle with args: ${DEPLOY_ARGS[*]}"
+else
+  echo "Deploying bundle with default args"
+fi
+# The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
+# bash older than 4.4 (wait=true with best_effort=false yields no args).
+./deploy.sh ${DEPLOY_ARGS[@]+"${DEPLOY_ARGS[@]}"}
diff --git a/.github/actions/runtime-install/install-gpu-operator-helm.sh b/.github/actions/runtime-install/install-gpu-operator-helm.sh
new file mode 100644
index 000000000..f20527ed3
--- /dev/null
+++ b/.github/actions/runtime-install/install-gpu-operator-helm.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Fail early with a named error; KUBE_CONTEXT may override the kind-derived context.
+KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+KUBE_CONTEXT="${KUBE_CONTEXT:-kind-${KIND_CLUSTER_NAME}}"
+if ! command -v yq >/dev/null 2>&1; then
+  echo "::error::yq is required to read testing.gpu_operator_chart_version from .settings.yaml"
+  exit 1
+fi
+
+GPU_OPERATOR_CHART_VERSION="$(yq eval '.testing.gpu_operator_chart_version // ""' .settings.yaml)"
+if [[ -z "${GPU_OPERATOR_CHART_VERSION}" || "${GPU_OPERATOR_CHART_VERSION}" == "null" ]]; then
+  echo "::error::testing.gpu_operator_chart_version must be set in .settings.yaml"
+  exit 1
+fi
+
+# Install/upgrade the pinned chart: driver, toolkit and DCGM exporter off, NFD on.
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update
+helm repo update
+helm upgrade -i \
+  --kube-context="${KUBE_CONTEXT}" \
+  --namespace gpu-operator --create-namespace \
+  --set driver.enabled=false --set toolkit.enabled=false \
+  --set dcgmExporter.enabled=false --set nfd.enabled=true \
+  --version="${GPU_OPERATOR_CHART_VERSION}" --wait --timeout=600s \
+  gpu-operator nvidia/gpu-operator
diff --git a/.github/actions/runtime-install/wait-gpu-operands-bundle.sh b/.github/actions/runtime-install/wait-gpu-operands-bundle.sh
new file mode 100644
index 000000000..9133ba435
--- /dev/null
+++ b/.github/actions/runtime-install/wait-gpu-operands-bundle.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Waits for the GPU operator to create and roll out the device-plugin DaemonSet
+# after a bundle deploy, printing pods/events and exiting 1 on failure.
+#
+# Environment:
+#   KIND_CLUSTER_NAME             required; used to derive the default context
+#   KUBE_CONTEXT                  kubectl context (default: kind-${KIND_CLUSTER_NAME})
+#   DEVICE_PLUGIN_WAIT_TIMEOUT    kubectl --timeout for each wait (default: 300s)
+#   KUBECTL_WAIT_OUTER_TIMEOUT    hard cap per wait (default: wait timeout + 30s)
+#   KUBECTL_WAIT_REQUEST_TIMEOUT  kubectl --request-timeout (default: outer timeout)
+
+set -euo pipefail
+
+KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+KUBE_CONTEXT="${KUBE_CONTEXT:-kind-${KIND_CLUSTER_NAME}}"
+DEVICE_PLUGIN_WAIT_TIMEOUT="${DEVICE_PLUGIN_WAIT_TIMEOUT:-300s}"
+
+# Derive the hard cap from the kubectl-level timeout plus 30s of grace so that
+# raising DEVICE_PLUGIN_WAIT_TIMEOUT is not silently cut short at 330s. Values
+# that are not a plain <number><s|m|h> duration keep the historical 330s cap.
+default_wait_outer_timeout="330s"
+if [[ "${DEVICE_PLUGIN_WAIT_TIMEOUT}" =~ ^([0-9]+)([smh])$ ]]; then
+  case "${BASH_REMATCH[2]}" in
+    s) wait_unit_seconds=1 ;;
+    m) wait_unit_seconds=60 ;;
+    h) wait_unit_seconds=3600 ;;
+  esac
+  default_wait_outer_timeout="$((10#${BASH_REMATCH[1]} * wait_unit_seconds + 30))s"
+fi
+KUBECTL_WAIT_OUTER_TIMEOUT="${KUBECTL_WAIT_OUTER_TIMEOUT:-${default_wait_outer_timeout}}"
+KUBECTL_WAIT_REQUEST_TIMEOUT="${KUBECTL_WAIT_REQUEST_TIMEOUT:-${KUBECTL_WAIT_OUTER_TIMEOUT}}"
+
+# Short kubectl call against KUBE_CONTEXT: 10s request timeout, 30s hard cap.
+kubectl_kind() {
+  timeout 30s kubectl --request-timeout=10s --context="${KUBE_CONTEXT}" "$@"
+}
+
+# kubectl call for long waits, bounded by the outer and request timeouts above.
+kubectl_kind_wait() {
+  timeout "${KUBECTL_WAIT_OUTER_TIMEOUT}" kubectl \
+    --request-timeout="${KUBECTL_WAIT_REQUEST_TIMEOUT}" \
+    --context="${KUBE_CONTEXT}" "$@"
+}
+
+echo "Waiting for GPU operator controller to deploy operands..."
+# The GPU operator controller watches ClusterPolicy and creates
+# DaemonSets for device-plugin, NFD, GFD, etc. This happens
+# asynchronously after the bundle deploy applies the ClusterPolicy.
+if ! kubectl_kind_wait -n gpu-operator wait --for=create \
+  daemonset/nvidia-device-plugin-daemonset \
+  --timeout="${DEVICE_PLUGIN_WAIT_TIMEOUT}"; then
+  echo "::error::device plugin DaemonSet was not created within ${DEVICE_PLUGIN_WAIT_TIMEOUT}"
+  kubectl_kind -n gpu-operator get pods || true
+  kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' || true
+  exit 1
+fi
+echo "Device plugin DaemonSet found."
+echo "Waiting for device plugin rollout..."
+# Operands are excluded from control-plane nodes via nodeAffinity in
+# the kind overlay, so all scheduled pods should become ready.
+if ! kubectl_kind_wait -n gpu-operator rollout status daemonset/nvidia-device-plugin-daemonset \
+  --timeout="${DEVICE_PLUGIN_WAIT_TIMEOUT}"; then
+  echo "::error::device plugin DaemonSet did not roll out within ${DEVICE_PLUGIN_WAIT_TIMEOUT}"
+  kubectl_kind -n gpu-operator get pods -o wide || true
+  kubectl_kind -n gpu-operator describe daemonset/nvidia-device-plugin-daemonset || true
+  kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' || true
+  exit 1
+fi
+echo "GPU Operator pods:"
+kubectl_kind -n gpu-operator get pods
diff --git a/.github/actions/runtime-install/wait-gpu-operands-helm.sh b/.github/actions/runtime-install/wait-gpu-operands-helm.sh
new file mode 100644
index 000000000..ccd47670d
--- /dev/null
+++ b/.github/actions/runtime-install/wait-gpu-operands-helm.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Waits for the device-plugin DaemonSet to be created and rolled out after the
+# standalone Helm install, printing pods/events and exiting 1 on failure.
+#
+# Environment:
+#   KIND_CLUSTER_NAME             required; used to derive the default context
+#   KUBE_CONTEXT                  kubectl context (default: kind-${KIND_CLUSTER_NAME})
+#   DEVICE_PLUGIN_WAIT_TIMEOUT    kubectl --timeout for each wait (default: 300s)
+#   KUBECTL_WAIT_OUTER_TIMEOUT    hard cap per wait (default: wait timeout + 30s)
+#   KUBECTL_WAIT_REQUEST_TIMEOUT  kubectl --request-timeout (default: outer timeout)
+
+set -euo pipefail
+
+KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+KUBE_CONTEXT="${KUBE_CONTEXT:-kind-${KIND_CLUSTER_NAME}}"
+DEVICE_PLUGIN_WAIT_TIMEOUT="${DEVICE_PLUGIN_WAIT_TIMEOUT:-300s}"
+
+# Derive the hard cap from the kubectl-level timeout plus 30s of grace so that
+# raising DEVICE_PLUGIN_WAIT_TIMEOUT is not silently cut short at 330s. Values
+# that are not a plain <number><s|m|h> duration keep the historical 330s cap.
+default_wait_outer_timeout="330s"
+if [[ "${DEVICE_PLUGIN_WAIT_TIMEOUT}" =~ ^([0-9]+)([smh])$ ]]; then
+  case "${BASH_REMATCH[2]}" in
+    s) wait_unit_seconds=1 ;;
+    m) wait_unit_seconds=60 ;;
+    h) wait_unit_seconds=3600 ;;
+  esac
+  default_wait_outer_timeout="$((10#${BASH_REMATCH[1]} * wait_unit_seconds + 30))s"
+fi
+KUBECTL_WAIT_OUTER_TIMEOUT="${KUBECTL_WAIT_OUTER_TIMEOUT:-${default_wait_outer_timeout}}"
+KUBECTL_WAIT_REQUEST_TIMEOUT="${KUBECTL_WAIT_REQUEST_TIMEOUT:-${KUBECTL_WAIT_OUTER_TIMEOUT}}"
+
+# Short kubectl call against KUBE_CONTEXT: 10s request timeout, 30s hard cap.
+kubectl_kind() {
+  timeout 30s kubectl --request-timeout=10s --context="${KUBE_CONTEXT}" "$@"
+}
+
+# kubectl call for long waits, bounded by the outer and request timeouts above.
+kubectl_kind_wait() {
+  timeout "${KUBECTL_WAIT_OUTER_TIMEOUT}" kubectl \
+    --request-timeout="${KUBECTL_WAIT_REQUEST_TIMEOUT}" \
+    --context="${KUBE_CONTEXT}" "$@"
+}
+
+echo "Waiting for device plugin to be ready..."
+if ! kubectl_kind_wait -n gpu-operator wait --for=create \
+  daemonset/nvidia-device-plugin-daemonset \
+  --timeout="${DEVICE_PLUGIN_WAIT_TIMEOUT}"; then
+  echo "::error::device plugin DaemonSet was not created within ${DEVICE_PLUGIN_WAIT_TIMEOUT}"
+  kubectl_kind -n gpu-operator get pods || true
+  kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' || true
+  exit 1
+fi
+echo "Device plugin DaemonSet found."
+
+if ! kubectl_kind_wait -n gpu-operator rollout status daemonset/nvidia-device-plugin-daemonset \
+  --timeout="${DEVICE_PLUGIN_WAIT_TIMEOUT}"; then
+  echo "::error::device plugin DaemonSet did not roll out within ${DEVICE_PLUGIN_WAIT_TIMEOUT}"
+  kubectl_kind -n gpu-operator get pods -o wide || true
+  kubectl_kind -n gpu-operator describe daemonset/nvidia-device-plugin-daemonset || true
+  kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' || true
+  exit 1
+fi
+echo "GPU Operator pods:"
+kubectl_kind -n gpu-operator get pods
diff --git a/.github/scripts/gpu-chainsaw-health.sh b/.github/scripts/gpu-chainsaw-health.sh
new file mode 100644
index 000000000..5b9b4c9c7
--- /dev/null
+++ b/.github/scripts/gpu-chainsaw-health.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+if [[ $# -ne 1 ]]; then
+  echo "::error::Usage: $0 <test_dir>"
+  exit 2
+fi
+test_dir="$1"
+if [[ ! -d "${test_dir}" ]]; then
+  echo "::error::Test directory not found: ${test_dir}"
+  exit 1
+fi
+
+CHAINSAW_TEST_TIMEOUT="${CHAINSAW_TEST_TIMEOUT:-30m}"
+if ! [[ "${CHAINSAW_TEST_TIMEOUT}" =~ ^[0-9]+[smh]$ ]]; then
+  echo "::error::CHAINSAW_TEST_TIMEOUT must be a duration like 30m, 180s, or 1h; got '${CHAINSAW_TEST_TIMEOUT}'"
+  exit 1
+fi
+MONITORING_READY_TIMEOUT="${MONITORING_READY_TIMEOUT:-180s}"
+KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+KUBE_CONTEXT="${KUBE_CONTEXT:-kind-${KIND_CLUSTER_NAME}}"
+KUBECTL_WAIT_GRACE_SECONDS="${KUBECTL_WAIT_GRACE_SECONDS:-30}"
+
+if ! [[ "${MONITORING_READY_TIMEOUT}" =~ ^[0-9]+[smh]$ ]]; then
+  echo "::error::MONITORING_READY_TIMEOUT must be a duration like 180s, 5m, or 1h; got '${MONITORING_READY_TIMEOUT}'"
+  exit 1
+fi
+
+# Print the number of seconds in a <digits><s|m|h> duration ($1); any other
+# unit suffix reports an error on stderr and exits 1.
+duration_seconds() {
+  local duration="$1"
+  local suffix="${duration: -1}"
+  local -i factor
+  case "${suffix}" in
+    s) factor=1 ;;
+    m) factor=60 ;;
+    h) factor=3600 ;;
+    *) echo "::error::unsupported duration '${duration}'" >&2; exit 1 ;;
+  esac
+  # 10# forces base 10 so leading zeros are not read as octal.
+  echo "$((10#${duration%[smh]} * factor))"
+}
+
+if ! [[ "${KUBECTL_WAIT_GRACE_SECONDS}" =~ ^[0-9]+$ ]]; then
+  echo "::error::KUBECTL_WAIT_GRACE_SECONDS must be a non-negative integer, got '${KUBECTL_WAIT_GRACE_SECONDS}'"
+  exit 1
+fi
+monitoring_ready_timeout_seconds="$(duration_seconds "${MONITORING_READY_TIMEOUT}")"
+KUBECTL_WAIT_OUTER_TIMEOUT="${KUBECTL_WAIT_OUTER_TIMEOUT:-$((monitoring_ready_timeout_seconds + KUBECTL_WAIT_GRACE_SECONDS))s}"
+KUBECTL_WAIT_REQUEST_TIMEOUT="${KUBECTL_WAIT_REQUEST_TIMEOUT:-${KUBECTL_WAIT_OUTER_TIMEOUT}}"
+
+kubectl_kind() { # Quick kubectl call against KUBE_CONTEXT: 10s request timeout, 30s hard cap.
+  timeout 30s kubectl --context="${KUBE_CONTEXT}" --request-timeout=10s "$@"
+}
+
+# kubectl wrapper for long waits: hard-capped by KUBECTL_WAIT_OUTER_TIMEOUT.
+kubectl_kind_wait() {
+  local -a global_flags=(--request-timeout="${KUBECTL_WAIT_REQUEST_TIMEOUT}" --context="${KUBE_CONTEXT}")
+  timeout "${KUBECTL_WAIT_OUTER_TIMEOUT}" kubectl "${global_flags[@]}" "$@"
+}
+
+print_monitoring_diagnostics() { # Best-effort dump of monitoring workloads plus kube-prometheus-operator state, logs, and events.
+  echo "=== Monitoring workloads ==="
+  kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment ==="
+  kubectl_kind -n monitoring get deployment kube-prometheus-operator -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment describe ==="
+  kubectl_kind -n monitoring describe deployment kube-prometheus-operator 2>/dev/null || true
+  echo "=== kube-prometheus-operator pods ==="
+  kubectl_kind -n monitoring get pods -o wide 2>/dev/null \
+    | grep -E '(^NAME|^kube-prometheus-operator-)' || true
+  echo "=== kube-prometheus-operator logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=200 2>/dev/null || true
+  echo "=== kube-prometheus-operator previous logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=200 2>/dev/null || true
+  echo "=== Recent events (monitoring) ==="
+  kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' 2>/dev/null | tail -100 || true
+}
+
+# Block until the kube-prometheus-operator Deployment rolls out; on failure
+# print monitoring diagnostics and return 1.
+wait_for_monitoring_operator() {
+  echo "Waiting for monitoring/kube-prometheus-operator before Chainsaw..."
+  if ! kubectl_kind_wait -n monitoring rollout status \
+    deployment/kube-prometheus-operator --timeout="${MONITORING_READY_TIMEOUT}"; then
+    echo "::error::monitoring/kube-prometheus-operator did not become available within ${MONITORING_READY_TIMEOUT}"
+    print_monitoring_diagnostics
+    return 1
+  fi
+  echo "monitoring/kube-prometheus-operator is rolled out."
+}
+
+wait_for_monitoring_operator
+
+# --skip-delete: these tests assert the already-deployed runtime bundle. Letting
+# Chainsaw delete asserted resources would tear down the system under test.
+timeout "${CHAINSAW_TEST_TIMEOUT}" chainsaw test \
+  --test-dir "${test_dir}" \
+  --config tests/chainsaw/chainsaw-config.yaml \
+  --skip-delete
diff --git a/.github/scripts/gpu-debug-diagnostics.sh b/.github/scripts/gpu-debug-diagnostics.sh
new file mode 100644
index 000000000..c1d7c7b3c
--- /dev/null
+++ b/.github/scripts/gpu-debug-diagnostics.sh
@@ -0,0 +1,291 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Diagnostic script: intentionally omits -e so each mode can keep collecting
+# partial failure data. Keep -u and pipefail to catch script bugs and pipeline
+# failures while individual kubectl_kind calls tolerate cluster errors.
+set -uo pipefail
+
+mode="${GPU_TEST_DIAGNOSTIC_MODE:-smoke}"
+KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+
+kubectl_kind() { # Short kubectl call; honors a KUBE_CONTEXT override, defaulting to the kind context.
+  timeout 30s kubectl --request-timeout=10s --context="${KUBE_CONTEXT:-kind-${KIND_CLUSTER_NAME}}" "$@"
+}
+
+# Run `docker <args...>` under the time limit given as the first argument.
+docker_timeout() {
+  local -r max_runtime="$1"
+  timeout "${max_runtime}" docker "${@:2}"
+}
+
+# Run an arbitrary command ($2...) under the time limit given in $1.
+command_timeout() {
+  local -r max_runtime="$1"
+  timeout "${max_runtime}" "${@:2}"
+}
+
+print_setup_diagnostics() { # Best-effort dump of runner, Docker, host GPU, and kind node container state.
+  echo "=== Runner baseline ==="
+  date -u || true
+  hostname || true
+  uptime || true
+  cat /proc/loadavg || true
+  nproc || true
+  free -h || true
+  df -h / || true
+  df -ih / || true
+  echo "=== Docker health ==="
+  docker_timeout 30s info >/dev/null 2>&1 && docker_timeout 30s version || true  # version only if info succeeds
+  echo "=== Host GPUs ==="
+  command_timeout 30s nvidia-smi -L || true
+  command_timeout 30s nvidia-smi || true
+  echo "=== Kind clusters ==="
+  command_timeout 30s kind get clusters || true
+  echo "=== Kind node containers ==="
+  docker_timeout 30s ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true
+  echo "=== Kind node container resources ==="
+  docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+    --format '{{.Names}}' | sort | while read -r node_container; do
+      [[ -z "${node_container}" ]] && continue
+      docker_timeout 30s inspect "${node_container}" \
+        --format '{{.Name}} State={{.State.Status}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}' || true
+    done || true
+  print_kind_node_pressure
+}
+
+print_kind_node_pressure() { # For each running kind node container: docker stats plus in-container load, memory, disk, and top processes.
+  local node_container
+
+  echo "=== Kind node pressure snapshots ==="
+  docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+    --format '{{.Names}}' | sort | while read -r node_container; do
+      [[ -z "${node_container}" ]] && continue
+      echo "--- ${node_container} docker stats ---"
+      docker_timeout 30s stats --no-stream \
+        --format 'table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}' \
+        "${node_container}" || true
+      echo "--- ${node_container} node pressure ---"
+      docker_timeout 30s exec "${node_container}" sh -c '
+        date
+        hostname || true
+        uptime || true
+        cat /proc/loadavg || true
+        nproc || true
+        free -h || true
+        df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
+        echo "--- top cpu/memory processes ---"
+        ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -25 || true
+      ' || true
+    done || true
+}
+
+print_workload_images() { # Print one TSV row (kind, namespace/name, unique images) per Deployment/DaemonSet/StatefulSet in namespace $1.
+  local ns="$1"
+  kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \
+    | jq -r '
+      .items[] |
+      [
+        .kind,
+        .metadata.namespace + "/" + .metadata.name,
+        (([.spec.template.spec.containers[]?.image] +
+          [.spec.template.spec.initContainers[]?.image]) | unique | join(","))
+      ] | @tsv
+    ' || true
+}
+
+print_workload_inventory() { # Print the workload image table for each namespace argument.
+  local namespace
+  echo "=== Workload image inventory ==="
+  for namespace; do
+    printf -- '--- %s ---\n' "${namespace}"
+    print_workload_images "${namespace}"
+  done
+}
+
+print_component_status_summary() { # Cluster-wide workload listing, replica-count columns, and pods not Running/Succeeded.
+  echo "=== Component workload status ==="
+  kubectl_kind get deployments,statefulsets,daemonsets,pods -A -o wide 2>/dev/null || true
+  echo "=== Component rollout conditions ==="
+  kubectl_kind get deployments,statefulsets,daemonsets -A \
+    -o custom-columns='KIND:.kind,NAMESPACE:.metadata.namespace,NAME:.metadata.name,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas,DESIRED:.status.replicas,UPDATED:.status.updatedReplicas,AGE:.metadata.creationTimestamp' \
+    2>/dev/null || true
+  echo "=== Non-ready pods ==="
+  kubectl_kind get pods -A \
+    --field-selector=status.phase!=Running,status.phase!=Succeeded \
+    -o wide 2>/dev/null || true
+}
+
+print_kube_prometheus_operator_diagnostics() { # Best-effort dump of monitoring workloads plus kube-prometheus-operator describe output, logs, and events.
+  echo "=== Monitoring workloads ==="
+  kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment ==="
+  kubectl_kind -n monitoring get deployment kube-prometheus-operator -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment describe ==="
+  kubectl_kind -n monitoring describe deployment kube-prometheus-operator 2>/dev/null || true
+  echo "=== kube-prometheus-operator pod describe ==="
+  kubectl_kind -n monitoring get pods -o name 2>/dev/null \
+    | grep '^pod/kube-prometheus-operator-' \
+    | while read -r pod; do
+        echo "--- ${pod} ---"
+        kubectl_kind -n monitoring describe "${pod}" 2>/dev/null || true
+      done || true
+  echo "=== kube-prometheus-operator logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=200 2>/dev/null || true
+  echo "=== kube-prometheus-operator previous logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=200 2>/dev/null || true
+  echo "=== Recent events (monitoring) ==="
+  kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' 2>/dev/null | tail -80 || true
+}
+
+print_kai_diagnostics() { # Best-effort dump of KAI pods, admission state (one labeled describe per pod), logs, queues, podgroups, events.
+  echo "=== KAI scheduler pods ==="
+  kubectl_kind -n kai-scheduler get pods -o wide 2>/dev/null || true
+  echo "=== KAI admission deployment ==="
+  kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true
+  echo "=== KAI admission deployment describe ==="
+  kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true
+  echo "=== KAI admission pod describe ==="
+  kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \
+    | grep '^pod/admission-' | while read -r pod; do
+        echo "--- ${pod} ---"
+        kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true
+      done || true
+  echo "=== KAI admission logs ==="
+  kubectl_kind -n kai-scheduler logs deployment/admission --all-containers --tail=200 2>/dev/null || true
+  echo "=== KAI scheduler logs ==="
+  kubectl_kind -n kai-scheduler logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
+  echo "=== KAI scheduler queues ==="
+  kubectl_kind get queues -A 2>/dev/null || true
+  echo "=== KAI scheduler podgroups ==="
+  kubectl_kind get podgroups -A 2>/dev/null || true
+  echo "=== Recent events (kai-scheduler) ==="
+  kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
+}
+
+# Query the custom metrics API for three GPU metrics across the pods of every
+# namespace argument, pretty-printing each response with jq.
+print_custom_metrics() {
+  local metric_name namespace api_path
+  local -r api_root="/apis/custom.metrics.k8s.io/v1beta1/namespaces"
+  echo "=== Custom metrics API ==="
+  for metric_name in gpu_utilization gpu_memory_used gpu_power_usage; do
+    for namespace; do
+      api_path="${api_root}/${namespace}/pods/*/${metric_name}"
+      echo "--- ${namespace}/${metric_name} ---"
+      kubectl_kind get --raw "${api_path}" 2>/dev/null | jq . || true
+    done
+  done
+}
+
+print_metrics_pipeline_diagnostics() { # List prometheus-adapter, DCGM exporter, and monitoring pods, DRA ResourceSlices, and nodes.
+  echo "=== prometheus-adapter pods ==="
+  kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
+  echo "=== DCGM Exporter pods ==="
+  kubectl_kind -n gpu-operator get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
+  echo "=== Monitoring pods ==="
+  kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true
+  echo "=== DRA ResourceSlices ==="
+  kubectl_kind get resourceslices -o wide 2>/dev/null || true
+  echo "=== Node status ==="
+  kubectl_kind get nodes -o wide 2>/dev/null || true
+}
+
+print_common_gpu_diagnostics() { # ClusterPolicy YAML, gpu-operator pods and last 30 event lines, and pods not Running/Succeeded.
+  echo "=== ClusterPolicy status ==="
+  kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true
+  echo "=== GPU Operator pods ==="
+  kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true
+  echo "=== Non-running pods (all namespaces) ==="
+  kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
+  echo "=== Recent events (gpu-operator) ==="
+  kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+}
+
+# Collect the shared H100 diagnostics. Arguments are extra namespaces that are
+# added to the workload inventory and to the custom-metrics queries.
+print_h100_common_diagnostics() {
+  local -a extra_namespaces=("$@")
+  local -a base_namespaces=(
+    cert-manager gpu-operator monitoring skyhook
+    nvsentinel nvidia-dra-driver nvidia-network-operator kai-scheduler
+  )
+
+  # Runner, Docker, and kind node state first.
+  print_setup_diagnostics
+  # Cluster-wide workload status, image inventory, and GPU operator state.
+  print_component_status_summary
+  print_workload_inventory "${base_namespaces[@]}" "${extra_namespaces[@]}"
+  print_common_gpu_diagnostics
+  # Component-specific dumps.
+  print_kube_prometheus_operator_diagnostics
+  print_kai_diagnostics
+  # Custom metrics always include gpu-operator ahead of the extra namespaces.
+  print_custom_metrics gpu-operator "${extra_namespaces[@]}"
+  print_metrics_pipeline_diagnostics
+  echo "=== Node resources ==="
+  kubectl_kind describe nodes 2>/dev/null | grep -A 20 "Allocated resources" || true
+}
+
+print_kubeflow_diagnostics() { # Best-effort dump of the Kubeflow Trainer deployment, pods, validating webhook, and TrainJob CRD.
+  echo "=== Kubeflow Trainer deployment ==="
+  kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
+  echo "=== Kubeflow pods ==="
+  kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true
+  echo "=== Kubeflow validating webhooks ==="
+  kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
+  echo "=== Kubeflow Trainer CRD ==="
+  kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
+}
+
+print_dynamo_diagnostics() { # Best-effort dump of dynamo-system pods, operator manager logs, and last 30 event lines.
+  echo "=== Dynamo pods ==="
+  kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true
+  echo "=== Dynamo operator logs ==="
+  kubectl_kind -n dynamo-system logs deployment/dynamo-platform-dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
+  echo "=== Recent events (dynamo-system) ==="
+  kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+}
+
+print_kgateway_diagnostics() { # Best-effort dump of kgateway-system pods plus GatewayClass and Gateway YAML.
+  echo "=== kgateway pods ==="
+  kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true
+  echo "=== GatewayClass status ==="
+  kubectl_kind get gatewayclass -o yaml 2>/dev/null || true
+  echo "=== Gateway status ==="
+  kubectl_kind get gateways -A -o yaml 2>/dev/null || true
+}
+
+case "${mode}" in
+  smoke)
+    print_setup_diagnostics
+    print_common_gpu_diagnostics
+    echo "=== Node status ==="
+    kubectl_kind get nodes -o wide 2>/dev/null || true
+    ;;
+  training)
+    print_h100_common_diagnostics kubeflow
+    print_kubeflow_diagnostics
+    ;;
+  inference)
+    print_h100_common_diagnostics dynamo-system kgateway-system
+    print_dynamo_diagnostics
+    print_kgateway_diagnostics
+    ;;
+  *)
+    echo "::error::unknown GPU_TEST_DIAGNOSTIC_MODE: ${mode}"
+    exit 1
+    ;;
+esac
diff --git a/.github/scripts/gpu-smoke-run-nvidia-smi.sh b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
new file mode 100644
index 000000000..8751bb513
--- /dev/null
+++ b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail  # Strict mode: exit on any error, unset variable, or failed pipeline stage.
+
+: "${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"  # Required; aborts with this message when unset or empty.
+KUBECTL_REQUEST_TIMEOUT="${KUBECTL_REQUEST_TIMEOUT:-10s}"  # Request timeout for ordinary kubectl calls (overridable).
+KUBECTL_WAIT_REQUEST_TIMEOUT="${KUBECTL_WAIT_REQUEST_TIMEOUT:-130s}"  # Default is longer than the 120s `kubectl wait --timeout` used below.
+POD_NAME_FILE="${POD_NAME_FILE:-/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}"  # File that receives the generated pod name (written below).
+
+kubectl_kind() {  # kubectl pinned to the kind-${KIND_CLUSTER_NAME} context with the short request timeout; all arguments pass through.
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+kubectl_kind_wait() {  # Same context as kubectl_kind, but with the longer wait request timeout and an outer `timeout 150s` as a hard stop.
+  timeout 150s kubectl --request-timeout="${KUBECTL_WAIT_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+pod_name=$(cat <<'EOF' | kubectl_kind create -f - -o jsonpath='{.metadata.name}'
+apiVersion: v1
+kind: Pod
+metadata:
+  generateName: gpu-smoke-test-
+  labels:
+    app: gpu-smoke-test
+spec:
+  restartPolicy: Never
+  containers:
+  - name: nvidia-smi
+    # Intentionally use a small base image: NVIDIA Container Toolkit should
+    # inject nvidia-smi into GPU containers. This smoke test should fail if it
+    # does not.
+    image: ubuntu:22.04
+    command: ["nvidia-smi"]
+    resources:
+      limits:
+        nvidia.com/gpu: 1
+EOF
+)
+
+mkdir -p "$(dirname "${POD_NAME_FILE}")"  # POD_NAME_FILE may be overridden to a directory that does not exist yet.
+echo "${pod_name}" > "${POD_NAME_FILE}"  # Persist the server-generated name before waiting, so it is on disk even if the wait fails.
+
+echo "Waiting for ${pod_name} pod to complete..."
+kubectl_kind_wait wait "pod/${pod_name}" \
+  --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s  # Only phase Succeeded satisfies the wait; anything else runs into the 120s timeout and fails the script.
diff --git a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
new file mode 100644
index 000000000..2510e1742
--- /dev/null
+++ b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail  # Strict mode: exit on any error, unset variable, or failed pipeline stage.
+
+KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"  # Required; aborts with this message when unset or empty.
+KUBECTL_REQUEST_TIMEOUT="${KUBECTL_REQUEST_TIMEOUT:-10s}"  # Request timeout for every kubectl call in this script (overridable).
+POD_NAME_FILE="${POD_NAME_FILE:-/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}"  # Same default path gpu-smoke-run-nvidia-smi.sh writes the pod name to.
+trap 'rm -f "${POD_NAME_FILE}"' EXIT  # The name file is single-use: remove it on every exit path, success or failure.
+
+kubectl_kind() {  # kubectl pinned to the kind-${KIND_CLUSTER_NAME} context with the configured request timeout; all arguments pass through.
+  kubectl --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+pod_name=""
+if [[ -f "${POD_NAME_FILE}" ]]; then  # Preferred source: the pod name recorded in POD_NAME_FILE.
+  pod_name="$(cat "${POD_NAME_FILE}")"
+  if [[ -n "${pod_name}" ]] && ! kubectl_kind get pod "${pod_name}" >/dev/null 2>&1; then  # Drop a recorded name that kubectl cannot fetch.
+    pod_name=""
+  fi
+fi
+
+if [[ -z "${pod_name}" ]]; then  # Fallback: look the pod up by its app=gpu-smoke-test label.
+  pod_name=$(kubectl_kind get pods \
+    -l app=gpu-smoke-test \
+    --sort-by=.metadata.creationTimestamp \
+    -o jsonpath='{.items[-1:].metadata.name}')  # [-1:] keeps the last item of the creation-time-sorted list, i.e. the newest pod.
+fi
+
+if [[ -z "${pod_name}" ]]; then  # Neither source produced a name: nothing to show logs for.
+  echo "::error::no gpu-smoke-test pod found"  # "::error::" is the GitHub Actions error-annotation command
+  exit 1
+fi
+
+kubectl_kind logs "${pod_name}"
diff --git a/.github/scripts/gpu-validate-conformance.sh b/.github/scripts/gpu-validate-conformance.sh
new file mode 100644
index 000000000..76c354249
--- /dev/null
+++ b/.github/scripts/gpu-validate-conformance.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail  # Strict mode; the whole script is one `./aicr validate --phase conformance` invocation.
+
+AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
+./aicr validate \
+  --recipe recipe.yaml \
+  --snapshot snapshot.yaml \
+  --phase conformance \
+  --namespace gpu-operator \
+  --kubeconfig="${HOME}/.kube/config" \
+  --require-gpu \
+  --image=ko.local:smoke-test \
+  --timeout=10m \
+  --toleration '*' \
+  --output=validation-result.yaml \
+  --evidence-dir=conformance-evidence  # Result goes to validation-result.yaml, evidence to conformance-evidence/; all paths are relative to the working directory.
diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml
index c5e1882d4..07e8c6c74 100644
--- a/.github/workflows/gpu-h100-inference-test.yaml
+++ b/.github/workflows/gpu-h100-inference-test.yaml
@@ -12,17 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name: GPU Inference Test (nvkind + H100 x2)
+name: GPU Inference Test (nvkind + H100 x1)
 
 on:
   schedule:
-    - cron: '15 6,18 * * *'  # Every 12 hours (2x daily), offset from T4 smoke test
+    - cron: '15 6 * * *'  # Daily at 06:15 UTC, 6h after the training test (00:15 UTC)
   push:
     branches:
       - "pull-request/[0-9]+"
-  pull_request:
-    types: [labeled]
-  workflow_dispatch: {}  # Allow manual runs
+  workflow_dispatch:
+    inputs:
+      run_full_validation:
+        description: 'Run snapshot and CNCF AI Conformance validation'
+        required: false
+        type: boolean
+        default: false  # Opt-in: manual runs skip full validation unless this is enabled.
 
 permissions:
   contents: read
@@ -40,6 +44,8 @@ jobs:
       should-run: ${{ steps.filter.outputs.matched }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
       - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d  # v4.0.1
         id: filter
         with:
@@ -49,16 +55,24 @@ jobs:
               - '.github/workflows/gpu-h100-inference-test.yaml'
               - '.settings.yaml'
               - '.github/actions/gpu-cluster-setup/**'
-              - '.github/actions/gpu-operator-install/**'
+              - '.github/actions/runtime-install/**'
+              - '.github/actions/check-control-plane-health/**'
               - '.github/actions/aicr-build/**'
               - '.github/actions/setup-build-tools/**'
-              - '.github/actions/install-karpenter-kwok/**'
               - 'validators/*/Dockerfile'
-              - 'pkg/evidence/**'
+              - 'validators/conformance/**'
+              - 'recipes/validators/catalog.yaml'
+              - '.github/workflows/gpu-h100-kind-runtime-test.yaml'
+              - '.github/actions/gpu-workflow-prepare/**'
+              - '.github/actions/gpu-debug-diagnostics/**'
+              - '.github/actions/gpu-chainsaw-health/**'
+              - '.github/actions/gpu-validate-conformance/**'
               - '.github/actions/gpu-test-cleanup/**'
               - '.github/actions/load-versions/**'
+              - '.github/scripts/gpu-chainsaw-health.sh'
+              - '.github/scripts/gpu-debug-diagnostics.sh'
+              - 'pkg/bundler/deployer/helm/**'
               - 'tests/chainsaw/chainsaw-config.yaml'
-              - 'tests/chainsaw/ai-conformance/main.go'
               - 'tests/chainsaw/ai-conformance/common/**'
               - 'tests/chainsaw/ai-conformance/kind-common/**'
               - 'tests/chainsaw/ai-conformance/kind-inference-dynamo/**'
@@ -72,213 +86,28 @@ jobs:
               - 'recipes/overlays/kind-inference.yaml'
               - 'recipes/overlays/h100-kind-inference.yaml'
               - 'recipes/overlays/h100-kind-inference-dynamo.yaml'
-              - 'kwok/manifests/karpenter/**'
-              - 'kwok/scripts/install-karpenter-kwok.sh'
               - 'pkg/collector/**'
               - 'pkg/snapshotter/**'
               - '.github/actions/gpu-snapshot-validate/**'
-              - 'pkg/validator/job/**'
-              - 'pkg/validator/catalog/**'
-              - 'pkg/defaults/timeouts.go'
-              - 'validators/conformance/**'
 
+  # NVIDIA self-hosted GPU runners reject pull_request event jobs before
+  # checkout. PR GPU coverage runs through the pull-request/<number> push
+  # mirror after ok-to-test approval.
   gpu-inference-test:
     needs: [check-paths]
     if: >
       always() && (
         github.event_name == 'schedule' ||
         github.event_name == 'workflow_dispatch' ||
-        (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') ||
         (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true')
       )
-    name: GPU Inference Test (nvkind + H100 x2)
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
-      cancel-in-progress: true
-    runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 120
-
-    env:
-      KIND_CLUSTER_NAME: gpu-inference-test
-
-    steps:
-
-      - name: Checkout Code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Set up GPU cluster
-        uses: ./.github/actions/gpu-cluster-setup
-
-      - name: Build aicr
-        uses: ./.github/actions/aicr-build
-        with:
-          validator_phases: 'conformance'
-
-      - name: Install runtime bundle
-        id: bundle-install
-        uses: ./.github/actions/gpu-operator-install
-        with:
-          method: bundle
-          accelerator: h100
-          platform: dynamo
-
-      # --- Snapshot and GPU validation ---
-
-      - name: Snapshot and validate GPU
-        uses: ./.github/actions/gpu-snapshot-validate
-        with:
-          gpu_model: H100
-          min_gpu_count: '2'
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-
-      # --- Install Karpenter + KWOK early to give monitoring stack settle time ---
-
-      - name: Install Karpenter + KWOK
-        uses: ./.github/actions/install-karpenter-kwok
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-
-      # --- Health checks ---
-
-      - name: Prepare chainsaw
-        id: versions
-        uses: ./.github/actions/load-versions
-
-      - name: Install chainsaw
-        uses: ./.github/actions/setup-build-tools
-        with:
-          install_chainsaw: 'true'
-          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
-
-      - name: Run chainsaw health checks
-        run: |
-          chainsaw test \
-            --test-dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
-            --config tests/chainsaw/chainsaw-config.yaml
-
-      # --- CNCF AI Conformance validation ---
-      # Runs after the stack health checks so gateway and metrics validators
-      # see a settled inference stack.
-
-      - name: Verify expected resources exist
-        run: |
-          go run ./tests/chainsaw/ai-conformance/ \
-            --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
-            --dir tests/chainsaw/ai-conformance/common \
-            --dir tests/chainsaw/ai-conformance/kind-common \
-            --kubeconfig="${HOME}/.kube/config" \
-            --debug
-
-      - name: Validate CNCF AI Conformance
-        id: validate-conformance
-        run: |
-          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
-          ./aicr validate \
-            --recipe recipe.yaml \
-            --phase conformance \
-            --namespace gpu-operator \
-            --kubeconfig="${HOME}/.kube/config" \
-            --require-gpu \
-            --image=ko.local:smoke-test \
-            --timeout=10m \
-            --toleration '*' \
-            --output=validation-result.yaml \
-            --evidence-dir=conformance-evidence
-
-      # Dynamo smoke is intentionally disabled for now. The vLLM runtime image
-      # adds significant latency and flakiness in Kind CI, and training has no
-      # matching smoke path yet. Reintroduce it later alongside a symmetric
-      # training smoke test if needed.
-      # --- Validation artifacts ---
-
-      # Collect a post-run resource snapshot regardless of whether conformance
-      # validation ran, so triage always has a cluster-state artifact.
-      - name: Collect validation artifacts
-        if: >-
-          always()
-          && !cancelled()
-          && steps.bundle-install.outcome == 'success'
-        continue-on-error: true
-        shell: bash
-        run: |
-          set -o pipefail
-          mkdir -p conformance-evidence
-          go run ./tests/chainsaw/ai-conformance/ \
-            --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
-            --dir tests/chainsaw/ai-conformance/common \
-            --dir tests/chainsaw/ai-conformance/kind-common \
-            --kubeconfig="${HOME}/.kube/config" \
-            --debug | tee conformance-evidence/resource-existence-post.txt
-
-      - name: Upload validation artifacts
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: conformance-evidence
-          path: |
-            conformance-evidence/
-            validation-result.yaml
-          if-no-files-found: warn
-
-      - name: Debug diagnostics
-        if: failure()
-        run: |
-          echo "=== ClusterPolicy status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true
-          echo "=== GPU Operator pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
-          echo "=== Non-running pods (all namespaces) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
-          echo "=== Recent events (gpu-operator) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-          echo "=== Dynamo pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods -o wide 2>/dev/null || true
-          echo "=== Dynamo operator logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
-            logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
-          echo "=== Recent events (dynamo-system) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-          echo "=== Custom metrics API ==="
-          for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
-            echo "--- ${METRIC} ---"
-            for NS in gpu-operator dynamo-system; do
-              kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
-                "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true
-            done
-          done
-          echo "=== Grafana deployment ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get deployment grafana -o wide 2>/dev/null || true
-          echo "=== Grafana pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods \
-            -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true
-          echo "=== Grafana deployment describe ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe deployment grafana 2>/dev/null || true
-          echo "=== Grafana pod describe ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe pods \
-            -l app.kubernetes.io/name=grafana 2>/dev/null || true
-          echo "=== prometheus-adapter pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
-          echo "=== kgateway pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kgateway-system get pods -o wide 2>/dev/null || true
-          echo "=== GatewayClass status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get gatewayclass -o yaml 2>/dev/null || true
-          echo "=== Gateway status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get gateways -A -o yaml 2>/dev/null || true
-          echo "=== DCGM Exporter pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
-            get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
-          echo "=== Monitoring pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -o wide 2>/dev/null || true
-          echo "=== DRA ResourceSlices ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get resourceslices -o wide 2>/dev/null || true
-          echo "=== Node status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true
-
-      - name: GPU Test Cleanup
-        if: always()
-        uses: ./.github/actions/gpu-test-cleanup
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          artifact_name_prefix: gpu-inference-test-debug
+    name: GPU Inference Test (nvkind + H100 x1)
+    uses: ./.github/workflows/gpu-h100-kind-runtime-test.yaml  # Shared reusable workflow; everything inference-specific is passed via `with`.
+    with:
+      job_name: GPU Inference Test (nvkind + H100 x1)
+      cluster_name: gpu-inference-test
+      intent: inference
+      platform: dynamo
+      chainsaw_path: tests/chainsaw/ai-conformance/kind-inference-dynamo
+      artifact_name_prefix: gpu-inference-test-debug
+      run_full_validation: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.run_full_validation) }}  # Always on schedule; on manual dispatch only when requested; never for push runs.
diff --git a/.github/workflows/gpu-h100-kind-runtime-test.yaml b/.github/workflows/gpu-h100-kind-runtime-test.yaml
new file mode 100644
index 000000000..c2140082f
--- /dev/null
+++ b/.github/workflows/gpu-h100-kind-runtime-test.yaml
@@ -0,0 +1,146 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: GPU H100 Kind Runtime Test
+
+on:
+  workflow_call:  # Reusable workflow: it runs only when another workflow references it with `uses:`.
+    inputs:
+      job_name:
+        description: 'Display name for the H100 runtime job'
+        required: true
+        type: string
+      cluster_name:  # Exported to every step as KIND_CLUSTER_NAME.
+        description: 'Kind cluster name'
+        required: true
+        type: string
+      intent:  # Besides the installer, this also keys the concurrency group and the cleanup diagnostic mode.
+        description: 'Runtime intent passed to the bundle installer'
+        required: true
+        type: string
+      platform:
+        description: 'Runtime platform passed to the bundle installer'
+        required: true
+        type: string
+      chainsaw_path:
+        description: 'Chainsaw health-check directory'
+        required: true
+        type: string
+      artifact_name_prefix:
+        description: 'Prefix for uploaded debug artifacts'
+        required: true
+        type: string
+      run_full_validation:  # Also raises the job timeout from 150 to 180 minutes.
+        description: 'Run snapshot validation and CNCF AI Conformance validation'
+        required: false
+        type: boolean
+        default: false
+
+permissions:
+  contents: read
+
+jobs:
+  gpu-h100-kind-runtime-test:
+    name: ${{ inputs.job_name }}
+    runs-on: linux-amd64-gpu-h100-latest-1
+    timeout-minutes: ${{ inputs.run_full_validation && 180 || 150 }}  # 180 with full validation, otherwise 150.
+    concurrency:
+      group: gpu-h100-${{ github.event_name }}-${{ github.ref }}-${{ inputs.intent }}  # One group per event type, ref, and intent.
+      cancel-in-progress: true  # A newer run in the same group cancels the one still in flight.
+
+    env:
+      KIND_CLUSTER_NAME: ${{ inputs.cluster_name }}
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
+
+      - name: Prepare GPU workflow
+        id: prepare  # Outputs consumed below: h100_kind_node_image, chainsaw, chainsaw_sha256_linux_amd64.
+        uses: ./.github/actions/gpu-workflow-prepare
+
+      - name: Set up GPU cluster
+        timeout-minutes: 25
+        uses: ./.github/actions/gpu-cluster-setup
+        with:
+          kind_node_image: ${{ steps.prepare.outputs.h100_kind_node_image }}
+          min_gpu_count: '1'
+          gpu_model_pattern: H100
+          min_free_disk_gb: '50'
+          min_available_memory_gb: '16'
+          cluster_create_timeout: 900s
+          control_plane_resource_patches: 'true'
+          control_plane_leader_election_tuning: 'true'
+
+      - name: Build AICR CLI
+        timeout-minutes: 10
+        uses: ./.github/actions/aicr-build
+        with:
+          build_snapshot_agent: 'false'  # Skip the snapshot agent image in this step.
+          validator_phases: 'none'  # NOTE(review): 'none' presumably disables validator image builds - confirm in aicr-build.
+
+      - name: Install runtime bundle
+        id: bundle-install
+        timeout-minutes: 80
+        uses: ./.github/actions/runtime-install
+        with:
+          method: bundle
+          accelerator: h100
+          intent: ${{ inputs.intent }}
+          platform: ${{ inputs.platform }}
+          wait: 'true'
+          best_effort: 'false'
+
+      - name: Snapshot and validate GPU
+        if: inputs.run_full_validation  # Skipped on default (non-full) runs.
+        timeout-minutes: 30
+        uses: ./.github/actions/gpu-snapshot-validate
+        with:
+          gpu_model: H100
+          min_gpu_count: '1'
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          snapshot_timeout: 10m
+
+      - name: Run chainsaw health checks
+        timeout-minutes: 20
+        uses: ./.github/actions/gpu-chainsaw-health
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          chainsaw_path: ${{ inputs.chainsaw_path }}
+          chainsaw_version: ${{ steps.prepare.outputs.chainsaw }}
+          chainsaw_sha256: ${{ steps.prepare.outputs.chainsaw_sha256_linux_amd64 }}
+
+      - name: Run CNCF AI Conformance
+        if: inputs.run_full_validation  # Skipped on default (non-full) runs.
+        timeout-minutes: 60
+        id: validate-conformance
+        uses: ./.github/actions/gpu-validate-conformance
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          kwok_helm_timeout: 600s
+          ko_build_timeout: 1200s
+          karpenter_helm_timeout: 600s
+
+      - name: GPU Test Cleanup
+        if: always()  # Runs even when an earlier step failed or the run was cancelled.
+        timeout-minutes: 15
+        uses: ./.github/actions/gpu-test-cleanup
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          artifact_name_prefix: ${{ inputs.artifact_name_prefix }}
+          collect_artifacts: ${{ job.status != 'success' }}  # Collect debug artifacts only when the job is not succeeding.
+          diagnostic_mode: ${{ inputs.intent }}  # The intent value doubles as the diagnostic mode name.
+          upload_validation_artifacts: ${{ inputs.run_full_validation && steps.validate-conformance.outcome != 'skipped' }}  # Upload only if full validation was requested and the conformance step was not skipped.
diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml
index d3a04de03..4dd30f872 100644
--- a/.github/workflows/gpu-h100-training-test.yaml
+++ b/.github/workflows/gpu-h100-training-test.yaml
@@ -12,17 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name: GPU Training Test (nvkind + H100 x2)
+name: GPU Training Test (nvkind + H100 x1)
 
 on:
   schedule:
-    - cron: '30 6,18 * * *'  # Every 12 hours (2x daily), offset from inference test
+    - cron: '15 0 * * *'  # Daily at 00:15 UTC, 6h before the inference test (06:15 UTC)
   push:
     branches:
       - "pull-request/[0-9]+"
-  pull_request:
-    types: [labeled]
-  workflow_dispatch: {}  # Allow manual runs
+  workflow_dispatch:
+    inputs:
+      run_full_validation:
+        description: 'Run snapshot and CNCF AI Conformance validation'
+        required: false
+        type: boolean
+        default: false  # Opt-in: off unless explicitly enabled for a manual run.
 
 permissions:
   contents: read
@@ -40,6 +44,8 @@ jobs:
       should-run: ${{ steps.filter.outputs.matched }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
       - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d  # v4.0.1
         id: filter
         with:
@@ -49,16 +55,24 @@ jobs:
               - '.github/workflows/gpu-h100-training-test.yaml'
               - '.settings.yaml'
               - '.github/actions/gpu-cluster-setup/**'
-              - '.github/actions/gpu-operator-install/**'
+              - '.github/actions/runtime-install/**'
+              - '.github/actions/check-control-plane-health/**'
               - '.github/actions/aicr-build/**'
               - '.github/actions/setup-build-tools/**'
-              - '.github/actions/install-karpenter-kwok/**'
               - 'validators/*/Dockerfile'
-              - 'pkg/evidence/**'
+              - 'validators/conformance/**'
+              - 'recipes/validators/catalog.yaml'
+              - '.github/workflows/gpu-h100-kind-runtime-test.yaml'
+              - '.github/actions/gpu-workflow-prepare/**'
+              - '.github/actions/gpu-debug-diagnostics/**'
+              - '.github/actions/gpu-chainsaw-health/**'
+              - '.github/actions/gpu-validate-conformance/**'
               - '.github/actions/gpu-test-cleanup/**'
               - '.github/actions/load-versions/**'
+              - '.github/scripts/gpu-chainsaw-health.sh'
+              - '.github/scripts/gpu-debug-diagnostics.sh'
+              - 'pkg/bundler/deployer/helm/**'
               - 'tests/chainsaw/chainsaw-config.yaml'
-              - 'tests/chainsaw/ai-conformance/main.go'
               - 'tests/chainsaw/ai-conformance/common/**'
               - 'tests/chainsaw/ai-conformance/kind-common/**'
               - 'tests/chainsaw/ai-conformance/kind-training-kubeflow/**'
@@ -67,198 +81,29 @@ jobs:
               - 'recipes/overlays/h100-kind-training-kubeflow.yaml'
               - 'recipes/mixins/platform-kubeflow.yaml'
               - 'recipes/components/kubeflow-trainer/**'
-              - 'kwok/manifests/karpenter/**'
-              - 'kwok/scripts/install-karpenter-kwok.sh'
               - 'recipes/components/prometheus-adapter/**'
               - 'pkg/collector/**'
               - 'pkg/snapshotter/**'
               - '.github/actions/gpu-snapshot-validate/**'
-              - 'pkg/validator/job/**'
-              - 'pkg/validator/catalog/**'
-              - 'pkg/defaults/timeouts.go'
-              - 'validators/conformance/**'
 
+  # NVIDIA self-hosted GPU runners reject pull_request event jobs before
+  # checkout. PR GPU coverage runs through the pull-request/<number> push
+  # mirror after ok-to-test approval.
   gpu-training-test:
     needs: [check-paths]
     if: >
       always() && (
         github.event_name == 'schedule' ||
         github.event_name == 'workflow_dispatch' ||
-        (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') ||
         (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true')
       )
-    name: GPU Training Test (nvkind + H100 x2)
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
-      cancel-in-progress: true
-    runs-on: linux-amd64-gpu-h100-latest-2
-    timeout-minutes: 120
-
-    env:
-      KIND_CLUSTER_NAME: gpu-training-test
-
-    steps:
-
-      - name: Checkout Code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
-        with:
-          persist-credentials: false
-
-      - name: Set up GPU cluster
-        uses: ./.github/actions/gpu-cluster-setup
-
-      - name: Build aicr
-        uses: ./.github/actions/aicr-build
-        with:
-          validator_phases: 'conformance'
-
-      - name: Install runtime bundle
-        id: bundle-install
-        uses: ./.github/actions/gpu-operator-install
-        with:
-          method: bundle
-          accelerator: h100
-          intent: training
-          platform: kubeflow
-
-      # --- Snapshot and GPU validation ---
-
-      - name: Snapshot and validate GPU
-        uses: ./.github/actions/gpu-snapshot-validate
-        with:
-          gpu_model: H100
-          min_gpu_count: '2'
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-
-      # --- Install Karpenter + KWOK early to give monitoring stack settle time ---
-
-      - name: Install Karpenter + KWOK
-        uses: ./.github/actions/install-karpenter-kwok
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-
-      # --- Health checks ---
-
-      - name: Prepare chainsaw
-        id: versions
-        uses: ./.github/actions/load-versions
-
-      - name: Install chainsaw
-        uses: ./.github/actions/setup-build-tools
-        with:
-          install_chainsaw: 'true'
-          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'
-
-      - name: Run chainsaw health checks
-        run: |
-          chainsaw test \
-            --test-dir tests/chainsaw/ai-conformance/kind-training-kubeflow \
-            --config tests/chainsaw/chainsaw-config.yaml
-
-      # --- CNCF AI Conformance validation ---
-      # Runs last to ensure the DCGM → Prometheus → adapter pipeline
-      # has had time to bootstrap (pod-autoscaling check needs live metric data).
-
-      - name: Verify expected resources exist
-        run: |
-          go run ./tests/chainsaw/ai-conformance/ \
-            --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \
-            --dir tests/chainsaw/ai-conformance/common \
-            --dir tests/chainsaw/ai-conformance/kind-common \
-            --kubeconfig="${HOME}/.kube/config" \
-            --debug
-
-      - name: Validate CNCF AI Conformance
-        id: validate-conformance
-        run: |
-          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
-          ./aicr validate \
-            --recipe recipe.yaml \
-            --phase conformance \
-            --namespace gpu-operator \
-            --kubeconfig="${HOME}/.kube/config" \
-            --require-gpu \
-            --image=ko.local:smoke-test \
-            --timeout=10m \
-            --toleration '*' \
-            --output=validation-result.yaml \
-            --evidence-dir=conformance-evidence
-
-      # --- Validation artifacts ---
-
-      # Collect a post-run resource snapshot regardless of whether conformance
-      # validation ran, so triage always has a cluster-state artifact.
-      - name: Collect validation artifacts
-        if: >-
-          always()
-          && !cancelled()
-          && steps.bundle-install.outcome == 'success'
-        continue-on-error: true
-        shell: bash
-        run: |
-          set -o pipefail
-          mkdir -p conformance-evidence
-          go run ./tests/chainsaw/ai-conformance/ \
-            --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \
-            --dir tests/chainsaw/ai-conformance/common \
-            --dir tests/chainsaw/ai-conformance/kind-common \
-            --kubeconfig="${HOME}/.kube/config" \
-            --debug | tee conformance-evidence/resource-existence-post.txt
-
-      - name: Upload validation artifacts
-        if: always()
-        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
-        with:
-          name: conformance-evidence
-          path: |
-            conformance-evidence/
-            validation-result.yaml
-          if-no-files-found: warn
-
-      # --- Debug diagnostics (before cleanup so resources still exist) ---
-
-      - name: Debug diagnostics
-        if: failure()
-        run: |
-          echo "=== Grafana deployment ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get deployment grafana -o wide 2>/dev/null || true
-          echo "=== Grafana pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods \
-            -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true
-          echo "=== Grafana deployment describe ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe deployment grafana 2>/dev/null || true
-          echo "=== Grafana pod describe ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe pods \
-            -l app.kubernetes.io/name=grafana 2>/dev/null || true
-          echo "=== KAI scheduler pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get pods -o wide 2>/dev/null || true
-          echo "=== KAI scheduler logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \
-            logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
-          echo "=== KAI scheduler queues ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true
-          echo "=== KAI scheduler podgroups ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true
-          echo "=== Kubeflow Trainer deployment ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
-          echo "=== Kubeflow pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get pods -o wide 2>/dev/null || true
-          echo "=== Kubeflow validating webhooks ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
-          echo "=== Kubeflow Trainer CRD ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
-          echo "=== Non-running pods (all namespaces) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \
-            --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
-          echo "=== GPU Operator pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
-          echo "=== Node resources ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \
-            grep -A 20 "Allocated resources" || true
-
-      - name: GPU Test Cleanup
-        if: always()
-        uses: ./.github/actions/gpu-test-cleanup
-        with:
-          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
-          artifact_name_prefix: gpu-training-test-debug
+    name: GPU Training Test (nvkind + H100 x1)
+    uses: ./.github/workflows/gpu-h100-kind-runtime-test.yaml
+    with:
+      job_name: GPU Training Test (nvkind + H100 x1)
+      cluster_name: gpu-training-test
+      intent: training
+      platform: kubeflow
+      chainsaw_path: tests/chainsaw/ai-conformance/kind-training-kubeflow
+      artifact_name_prefix: gpu-training-test-debug
+      run_full_validation: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.run_full_validation) }}
diff --git a/.github/workflows/gpu-smoke-test.yaml b/.github/workflows/gpu-smoke-test.yaml
index d5b8c5c74..b2b609962 100644
--- a/.github/workflows/gpu-smoke-test.yaml
+++ b/.github/workflows/gpu-smoke-test.yaml
@@ -20,8 +20,6 @@ on:
   push:
     branches:
       - "pull-request/[0-9]+"
-  pull_request:
-    types: [labeled]
   workflow_dispatch: {}  # Allow manual runs
 
 permissions:
@@ -40,6 +38,8 @@ jobs:
       should-run: ${{ steps.filter.outputs.matched }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          persist-credentials: false
       - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d  # v4.0.1
         id: filter
         with:
@@ -47,11 +47,17 @@ jobs:
           filters: |
             matched:
               - '.github/workflows/gpu-smoke-test.yaml'
+              - '.settings.yaml'
               - '.github/actions/gpu-cluster-setup/**'
-              - '.github/actions/gpu-operator-install/**'
+              - '.github/actions/runtime-install/**'
               - '.github/actions/aicr-build/**'
+              - '.github/actions/gpu-debug-diagnostics/**'
               - '.github/actions/gpu-test-cleanup/**'
+              - '.github/actions/gpu-smoke-nvidia-smi/**'
               - '.github/actions/load-versions/**'
+              - '.github/scripts/gpu-debug-diagnostics.sh'
+              - '.github/scripts/gpu-smoke-run-nvidia-smi.sh'
+              - '.github/scripts/gpu-smoke-show-nvidia-smi-log.sh'
               - 'pkg/collector/**'
               - 'pkg/snapshotter/**'
               - '.github/actions/gpu-snapshot-validate/**'
@@ -62,11 +68,13 @@ jobs:
 
   gpu-smoke-test:
     needs: [check-paths]
+    # NVIDIA self-hosted GPU runners reject pull_request event jobs before
+    # checkout. PR GPU coverage runs through the pull-request/<number> push
+    # mirror after ok-to-test approval.
     if: >
       always() && (
         github.event_name == 'schedule' ||
         github.event_name == 'workflow_dispatch' ||
-        (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') ||
         (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true')
       )
     name: GPU Smoke Test (nvkind + L40G)
@@ -86,45 +94,43 @@ jobs:
         with:
           persist-credentials: false
 
+      - name: Runner preflight snapshot
+        shell: bash
+        run: |
+          echo "::group::Runner preflight snapshot"
+          echo "hostname:     $(hostname)"
+          echo "kernel:       $(uname -a)"
+          echo "uptime:       $(uptime)"
+          echo "loadavg:      $(cat /proc/loadavg 2>/dev/null || echo unavailable)"
+          echo "nproc:        $(nproc 2>/dev/null || echo unavailable)"
+          free -h 2>/dev/null || true
+          df -h / 2>/dev/null || true
+          echo "::endgroup::"
+
       - name: Set up GPU cluster
         uses: ./.github/actions/gpu-cluster-setup
+        with:
+          # Keep smoke runner preflight explicit so action default changes do not
+          # silently alter L40G coverage.
+          min_gpu_count: '1'
+          min_free_disk_gb: '20'
+          min_available_memory_gb: '8'
 
       - name: Build aicr
         uses: ./.github/actions/aicr-build
         with:
+          build_snapshot_agent: 'false'
           validator_phases: 'none'
 
       - name: Install GPU operator (helm)
-        uses: ./.github/actions/gpu-operator-install
+        uses: ./.github/actions/runtime-install
         with:
           method: helm
 
       - name: Run nvidia-smi in a pod
-        run: |
-          cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f -
-          apiVersion: v1
-          kind: Pod
-          metadata:
-            name: gpu-smoke-test
-          spec:
-            restartPolicy: Never
-            containers:
-            - name: nvidia-smi
-              image: ubuntu:22.04
-              command: ["nvidia-smi"]
-              resources:
-                limits:
-                  nvidia.com/gpu: 1
-          EOF
-
-          echo "Waiting for gpu-smoke-test pod to complete..."
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \
-            --for=condition=Ready --timeout=120s || true
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \
-            --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s
-
-      - name: Show nvidia-smi output
-        run: kubectl --context="kind-${KIND_CLUSTER_NAME}" logs gpu-smoke-test
+        uses: ./.github/actions/gpu-smoke-nvidia-smi
+        with:
+          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
 
       # --- Snapshot and validation ---
 
@@ -135,22 +141,10 @@ jobs:
           min_gpu_count: '1'
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
 
-      - name: Debug diagnostics
-        if: failure()
-        run: |
-          echo "=== ClusterPolicy status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true
-          echo "=== GPU Operator pods ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
-          echo "=== Non-running pods (all namespaces) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
-          echo "=== Recent events (gpu-operator) ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
-          echo "=== Node status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true
-
       - name: GPU Test Cleanup
         if: always()
         uses: ./.github/actions/gpu-test-cleanup
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
+          collect_artifacts: ${{ job.status != 'success' }}
+          diagnostic_mode: smoke
diff --git a/.settings.yaml b/.settings.yaml
index 75b4559b1..43da8de37 100644
--- a/.settings.yaml
+++ b/.settings.yaml
@@ -40,6 +40,7 @@ security_tools:
 testing_tools:
   kubectl: 'v1.35.0'
   kind: '0.31.0'
+  nvkind: '78a0a514c41c3e77ac0d935f38d971d3b4455138'
   ctlptl: '0.9.0'
   tilt: '0.37.0'
   helm: 'v4.1.1'
@@ -71,6 +72,9 @@ docs_tools:
 # Testing Configuration
 testing:
   kind_node_image: 'kindest/node:v1.32.0'
+  h100_kind_node_image: 'kindest/node:v1.35.0'
+  gpu_operator_chart_version: 'v25.10.1'
+  snapshot_agent_cuda_image: 'nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04'
 
   # Component test harness configuration
   # Used by tools/component-test/ scripts to validate individual components
diff --git a/kwok/scripts/install-karpenter-kwok.sh b/kwok/scripts/install-karpenter-kwok.sh
index 72b64dae1..53e70fabd 100755
--- a/kwok/scripts/install-karpenter-kwok.sh
+++ b/kwok/scripts/install-karpenter-kwok.sh
@@ -41,7 +41,10 @@ KARPENTER_VERSION="${KARPENTER_VERSION:-v1.8.0}"
 KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
 KARPENTER_NAMESPACE="${KARPENTER_NAMESPACE:-karpenter}"
 KARPENTER_CLONE_DIR="${KARPENTER_CLONE_DIR:-/tmp/karpenter}"
-KO_BUILD_TIMEOUT="${KO_BUILD_TIMEOUT:-900}"  # 15 minutes
+KWOK_HELM_TIMEOUT="${KWOK_HELM_TIMEOUT:-300s}"
+KO_BUILD_TIMEOUT="${KO_BUILD_TIMEOUT:-900s}"  # 15 minutes
+KARPENTER_HELM_TIMEOUT="${KARPENTER_HELM_TIMEOUT:-300s}"
+KUBE_CONTEXT="${KUBE_CONTEXT:-kind-${KIND_CLUSTER_NAME}}"
 
 RED='\033[0;31m'
 GREEN='\033[0;32m'
@@ -52,6 +55,14 @@ log_info() { echo -e "${GREEN}[INFO]${NC} $*"; }
 log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
 log_error() { echo -e "${RED}[ERROR]${NC} $*"; }
 
+# Pin kubectl/helm to ${KUBE_CONTEXT} instead of the ambient current-context.
+kubectl_kind() {
+    kubectl --context="${KUBE_CONTEXT}" "$@"
+}
+helm_kind() {
+    helm --kube-context "${KUBE_CONTEXT}" "$@"
+}
+
 # -------------------------------------------------------------------
 # Step 1: Install KWOK controller
 # Uses the same approach as kwok/scripts/run-all-recipes.sh
@@ -59,19 +70,20 @@ log_error() { echo -e "${RED}[ERROR]${NC} $*"; }
 install_kwok() {
     log_info "Installing KWOK controller..."
 
-    if kubectl get deployment -n kube-system kwok-controller &>/dev/null; then
+    if kubectl_kind get deployment -n kube-system kwok-controller &>/dev/null; then
         log_info "KWOK controller already installed, skipping"
         return 0
     fi
 
     helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update
-    helm upgrade --install kwok-controller kwok/kwok \
+    helm_kind upgrade --install kwok-controller kwok/kwok \
         --namespace kube-system \
         --set hostNetwork=true \
-        --wait --timeout 300s
+        --wait --timeout "${KWOK_HELM_TIMEOUT}"
 
-    helm upgrade --install kwok-stage-fast kwok/stage-fast \
-        --namespace kube-system
+    helm_kind upgrade --install kwok-stage-fast kwok/stage-fast \
+        --namespace kube-system \
+        --wait --timeout "${KWOK_HELM_TIMEOUT}"
 
     log_info "KWOK controller installed"
 }
@@ -98,11 +110,16 @@ build_karpenter() {
     # Redirect stderr to avoid Go compilation warnings corrupting the image reference.
     # Output format: kind.local/<name>:<content-hash>
     # Hard timeout prevents a slow/stuck compilation from consuming the entire job.
+    local ko_stderr="${KARPENTER_CLONE_DIR}/ko-build.stderr"
     CONTROLLER_IMG=$(timeout "${KO_BUILD_TIMEOUT}" \
         env KO_DOCKER_REPO=kind.local \
         KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME}" \
-        ko build sigs.k8s.io/karpenter/kwok 2>/dev/null) || {
-        log_error "ko build failed or timed out after ${KO_BUILD_TIMEOUT}s"
+        ko build sigs.k8s.io/karpenter/kwok 2>"${ko_stderr}") || {
+        log_error "ko build failed or timed out after ${KO_BUILD_TIMEOUT}"
+        if [[ -s "${ko_stderr}" ]]; then
+            log_error "ko build stderr:"
+            sed 's/^/  /' "${ko_stderr}" || true
+        fi
         exit 1
     }
 
@@ -141,20 +158,20 @@ deploy_karpenter() {
     log_info "Deploying Karpenter to namespace ${KARPENTER_NAMESPACE}..."
 
     # Apply CRDs first
-    kubectl apply -f "${KARPENTER_CLONE_DIR}/kwok/charts/crds"
+    kubectl_kind apply -f "${KARPENTER_CLONE_DIR}/kwok/charts/crds"
 
     # Create namespace and instance types ConfigMap before Helm install
     # so the volume mount can reference it immediately.
-    kubectl create namespace "${KARPENTER_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
+    kubectl_kind create namespace "${KARPENTER_NAMESPACE}" --dry-run=client -o yaml | kubectl_kind apply -f -
 
     local instance_types_file="${MANIFESTS_DIR}/instance-types.json"
     if [[ ! -f "${instance_types_file}" ]]; then
         log_error "Instance types file not found: ${instance_types_file}"
         exit 1
     fi
-    kubectl create configmap -n "${KARPENTER_NAMESPACE}" karpenter-instance-types \
+    kubectl_kind create configmap -n "${KARPENTER_NAMESPACE}" karpenter-instance-types \
         --from-file=instance-types.json="${instance_types_file}" \
-        --dry-run=client -o yaml | kubectl apply -f -
+        --dry-run=client -o yaml | kubectl_kind apply -f -
 
     # Build the image tag argument. If ko provided a tag, use it.
     # If not, omit it and let the chart default to its AppVersion.
@@ -169,7 +186,7 @@ deploy_karpenter() {
     # - extraVolumes + extraVolumeMounts: mount the instance types ConfigMap
     # - controller.env: set INSTANCE_TYPES_FILE_PATH for the KWOK provider
     # shellcheck disable=SC2086
-    helm upgrade --install karpenter "${KARPENTER_CLONE_DIR}/kwok/charts" \
+    helm_kind upgrade --install karpenter "${KARPENTER_CLONE_DIR}/kwok/charts" \
         --namespace "${KARPENTER_NAMESPACE}" --create-namespace \
         --set controller.image.repository="${IMG_REPOSITORY}" \
         ${tag_arg} \
@@ -187,17 +204,17 @@ deploy_karpenter() {
         --set 'controller.extraVolumeMounts[0].readOnly=true' \
         --set 'controller.env[0].name=INSTANCE_TYPES_FILE_PATH' \
         --set 'controller.env[0].value=/etc/karpenter/instance-types/instance-types.json' \
-        --wait --timeout 300s \
+        --wait --timeout "${KARPENTER_HELM_TIMEOUT}" \
         || {
             log_error "Helm install failed. Diagnostics:"
-            kubectl -n "${KARPENTER_NAMESPACE}" get pods -o wide 2>/dev/null || true
-            kubectl -n "${KARPENTER_NAMESPACE}" describe deployment karpenter 2>/dev/null || true
+            kubectl_kind -n "${KARPENTER_NAMESPACE}" get pods -o wide 2>/dev/null || true
+            kubectl_kind -n "${KARPENTER_NAMESPACE}" describe deployment karpenter 2>/dev/null || true
             local POD
-            POD=$(kubectl -n "${KARPENTER_NAMESPACE}" get pods -l app.kubernetes.io/name=karpenter \
+            POD=$(kubectl_kind -n "${KARPENTER_NAMESPACE}" get pods -l app.kubernetes.io/name=karpenter \
                 -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
             if [[ -n "${POD}" ]]; then
-                kubectl -n "${KARPENTER_NAMESPACE}" describe pod "${POD}" 2>/dev/null || true
-                kubectl -n "${KARPENTER_NAMESPACE}" logs "${POD}" --tail=50 2>/dev/null || true
+                kubectl_kind -n "${KARPENTER_NAMESPACE}" describe pod "${POD}" 2>/dev/null || true
+                kubectl_kind -n "${KARPENTER_NAMESPACE}" logs "${POD}" --tail=50 2>/dev/null || true
             fi
             exit 1
         }
@@ -212,7 +229,9 @@ main() {
     log_info "=== Karpenter KWOK Provider Installation ==="
     log_info "Karpenter version: ${KARPENTER_VERSION}"
     log_info "Kind cluster: ${KIND_CLUSTER_NAME}"
+    log_info "Kube context: ${KUBE_CONTEXT}"
     log_info "Namespace: ${KARPENTER_NAMESPACE}"
+    log_info "Timeouts: kwok=${KWOK_HELM_TIMEOUT} ko-build=${KO_BUILD_TIMEOUT} karpenter=${KARPENTER_HELM_TIMEOUT}"
 
     install_kwok
     build_karpenter
diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh
index 459b054b5..6b4af1549 100755
--- a/kwok/scripts/run-all-recipes.sh
+++ b/kwok/scripts/run-all-recipes.sh
@@ -37,6 +37,31 @@ log_info() { echo -e "${GREEN}[INFO]${NC} $*"; }
 log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
 log_error() { echo -e "${RED}[ERROR]${NC} $*"; }
 
+retry_command() {
+    local description="$1"
+    shift
+    # Attempt budget and initial delay (seconds) can be overridden via environment.
+    local max_attempts="${KWOK_COMMAND_RETRIES:-3}"
+    local delay="${KWOK_COMMAND_RETRY_DELAY:-5}"
+    local attempt=1
+    # After the shift above, "$@" is the command to run; $1 was only a log label.
+    while true; do
+        if "$@"; then
+            return 0
+        fi
+        # Budget exhausted: log and return 1 (the command's own status is dropped).
+        if ((attempt >= max_attempts)); then
+            log_error "${description} failed after ${attempt} attempt(s)"
+            return 1
+        fi
+        # Exponential backoff: the delay doubles after every failed attempt.
+        log_warn "${description} failed (attempt ${attempt}/${max_attempts}); retrying in ${delay}s..."
+        sleep "${delay}"
+        attempt=$((attempt + 1))
+        delay=$((delay * 2))
+    done
+}
+
 # Find recipes with service criteria (testable cloud configurations)
 get_recipes() {
     for overlay in "${OVERLAYS_DIR}"/*.yaml; do
@@ -68,10 +93,13 @@ ensure_cluster() {
 
     if ! kubectl get deployment -n kube-system kwok-controller &>/dev/null; then
         log_info "Installing KWOK controller..."
-        helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update
-        helm upgrade --install kwok-controller kwok/kwok \
+        retry_command "Adding KWOK Helm repository" \
+            helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update
+        retry_command "Installing KWOK controller" \
+            helm upgrade --install kwok-controller kwok/kwok \
             --namespace kube-system --set hostNetwork=true --wait
-        helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system
+        retry_command "Installing KWOK stage-fast" \
+            helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system
     fi
 
     # Patch kindnet to exclude KWOK nodes
diff --git a/recipes/overlays/kind.yaml b/recipes/overlays/kind.yaml
index b0d8dbd76..f38462580 100644
--- a/recipes/overlays/kind.yaml
+++ b/recipes/overlays/kind.yaml
@@ -115,6 +115,11 @@ spec:
     - name: kube-prometheus-stack
       type: Helm
       overrides:
+        # CI only needs component health, not the full upstream alerting rule
+        # set. Skipping default rules reduces PrometheusRule churn during
+        # install on small kind control planes.
+        defaultRules:
+          create: false
         prometheus:
           prometheusSpec:
             # Smaller storage for local testing
@@ -132,14 +137,35 @@ spec:
                 memory: 1Gi
             # Shorter retention for local testing
             retention: 7d
-        grafana:
+        prometheusOperator:
+          # Keep operator-owned monitoring custom resources in the monitoring
+          # namespace for kind. Do not scope ServiceMonitor discovery here;
+          # GPU, Kubeflow, and Dynamo monitors may live in their own namespaces.
+          alertmanagerInstanceNamespaces:
+            - monitoring
+          alertmanagerConfigNamespaces:
+            - monitoring
+          prometheusInstanceNamespaces:
+            - monitoring
+          thanosRulerInstanceNamespaces:
+            - monitoring
+          # CI kind control planes can be slow under image pulls and controller
+          # startup. Avoid restarting the operator on short health probe stalls.
+          livenessProbe:
+            timeoutSeconds: 10
+            failureThreshold: 10
+          readinessProbe:
+            timeoutSeconds: 10
+            failureThreshold: 6
           resources:
             requests:
               cpu: 100m
-              memory: 128Mi
+              memory: 256Mi
             limits:
               cpu: 500m
               memory: 512Mi
+        grafana:
+          enabled: false
         alertmanager:
           alertmanagerSpec:
             resources:
diff --git a/recipes/validators/README.md b/recipes/validators/README.md
index e4a7bfa47..b45808a4a 100644
--- a/recipes/validators/README.md
+++ b/recipes/validators/README.md
@@ -55,7 +55,7 @@ Applied by `catalog.Load` in order:
 | Name | Description | Timeout |
 |------|-------------|---------|
 | `dra-support` | Verify Dynamic Resource Allocation support | 5m |
-| `gang-scheduling` | Verify gang scheduling with KAI scheduler | 10m |
+| `gang-scheduling` | Verify gang scheduling with KAI scheduler using CPU-only workers | 10m |
 | `accelerator-metrics` | Verify accelerator metrics from DCGM exporter | 5m |
 | `ai-service-metrics` | Verify AI service metrics via Prometheus | 5m |
 | `inference-gateway` | Verify inference gateway (kgateway) is operational | 5m |
diff --git a/recipes/validators/catalog.yaml b/recipes/validators/catalog.yaml
index 6f50bc695..d322ac011 100644
--- a/recipes/validators/catalog.yaml
+++ b/recipes/validators/catalog.yaml
@@ -88,7 +88,7 @@ validators:
     env: []
   - name: gang-scheduling
     phase: conformance
-    description: "Verify gang scheduling with KAI scheduler"
+    description: "Verify gang scheduling with KAI scheduler using CPU-only workers"
     image: ghcr.io/nvidia/aicr-validators/conformance:latest
     timeout: 10m
     args: ["gang-scheduling"]
diff --git a/tests/chainsaw/ai-conformance/README.md b/tests/chainsaw/ai-conformance/README.md
index b1a88e9d4..a69b88f13 100644
--- a/tests/chainsaw/ai-conformance/README.md
+++ b/tests/chainsaw/ai-conformance/README.md
@@ -73,10 +73,11 @@ tests/chainsaw/ai-conformance/
 │   ├── assert-cert-manager.yaml         # cert-manager healthy
 │   ├── assert-dra-driver.yaml           # DRA driver healthy
 │   ├── assert-kai-scheduler.yaml        # KAI scheduler healthy
-│   ├── assert-monitoring.yaml           # Prometheus stack healthy
+│   ├── assert-monitoring.yaml           # Prometheus stack healthy with Grafana
 │   └── assert-skyhook.yaml              # Skyhook operator healthy
 ├── kind-common/                         # Shared Kind-only assertions
 │   ├── assert-gpu-operator.yaml         # GPU operator healthy on kind
+│   ├── assert-monitoring.yaml           # Prometheus stack healthy without Grafana
 │   ├── assert-network-operator.yaml     # Network operator healthy on kind
 │   └── assert-nvsentinel.yaml           # NVSentinel healthy on kind
 ├── kind-inference-dynamo/               # Kind + H100 + inference + dynamo leaf suite
diff --git a/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml b/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml
new file mode 100644
index 000000000..868be3fea
--- /dev/null
+++ b/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml
@@ -0,0 +1,85 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Assert kind monitoring stack components required by H100 CI are healthy.
+# Grafana is intentionally not asserted here because conformance metrics use
+# Prometheus, DCGM exporter, and prometheus-adapter directly.
+
+# Prometheus Operator - manages Prometheus, Alertmanager, and ServiceMonitor CRs
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-prometheus-operator
+  namespace: monitoring
+status:
+  (conditions[?type == 'Available']):
+    - status: "True"
+---
+# kube-state-metrics - Kubernetes object state metrics
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-state-metrics
+  namespace: monitoring
+status:
+  (conditions[?type == 'Available']):
+    - status: "True"
+---
+# Prometheus StatefulSet - time series database
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: prometheus-kube-prometheus-prometheus
+  namespace: monitoring
+status:
+  (readyReplicas > `0`): true  # passes once at least one replica is ready
+---
+# Alertmanager StatefulSet - alert routing and silencing
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: alertmanager-kube-prometheus-alertmanager
+  namespace: monitoring
+status:
+  (readyReplicas > `0`): true
+---
+# Prometheus Node Exporter DaemonSet - node-level hardware/OS metrics
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: prometheus-node-exporter
+  namespace: monitoring
+status:
+  (numberReady > `0`): true  # at least one pod ready; not every scheduled pod
+  (desiredNumberScheduled > `0`): true  # DaemonSet targets at least one node
+---
+# k8s-ephemeral-storage-metrics - ephemeral storage usage metrics
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: k8s-ephemeral-storage-metrics
+  namespace: monitoring
+status:
+  (conditions[?type == 'Available']):
+    - status: "True"
+---
+# Prometheus Adapter - custom metrics API for HPA
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus-adapter
+  namespace: monitoring
+status:
+  (conditions[?type == 'Available']):
+    - status: "True"
diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml
index 1b1f701ad..51c7af093 100644
--- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml
+++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml
@@ -65,10 +65,10 @@ spec:
 
     # ── Monitoring ─────────────────────────────────────────────────────
     - name: assert-monitoring
-      description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter.
+      description: Verify kind monitoring stack components.
       try:
         - assert:
-            file: ../common/assert-monitoring.yaml
+            file: ../kind-common/assert-monitoring.yaml
 
     # ── kgateway ───────────────────────────────────────────────────────
     - name: assert-kgateway
@@ -110,6 +110,8 @@ spec:
     # ── KAI Scheduler ──────────────────────────────────────────────────
     - name: assert-kai-scheduler
       description: Verify KAI scheduler is available.
+      timeouts:
+        assert: 600s
       try:
         - assert:
             file: ../common/assert-kai-scheduler.yaml
diff --git a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
index 382d99104..d16d3ad38 100644
--- a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
+++ b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml
@@ -60,10 +60,10 @@ spec:
             file: ../kind-common/assert-gpu-operator.yaml
 
     - name: assert-monitoring
-      description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter.
+      description: Verify kind monitoring stack components.
       try:
         - assert:
-            file: ../common/assert-monitoring.yaml
+            file: ../kind-common/assert-monitoring.yaml
 
     - name: assert-skyhook
       description: Verify Skyhook operator controller-manager is available.
@@ -73,6 +73,8 @@ spec:
 
     - name: assert-kubeflow-trainer
       description: Verify Kubeflow Trainer controller, validating webhook, and TrainJob CRD are available.
+      timeouts:
+        assert: 600s
       try:
         - assert:
             file: assert-kubeflow-trainer.yaml
@@ -99,6 +101,8 @@ spec:
 
     - name: assert-kai-scheduler
       description: Verify KAI scheduler is available.
+      timeouts:
+        assert: 600s
       try:
         - assert:
             file: ../common/assert-kai-scheduler.yaml
diff --git a/validators/conformance/gang_scheduling_check.go b/validators/conformance/gang_scheduling_check.go
index 2bfda2612..e29e20de7 100644
--- a/validators/conformance/gang_scheduling_check.go
+++ b/validators/conformance/gang_scheduling_check.go
@@ -26,7 +26,6 @@ import (
 	"github.com/NVIDIA/aicr/pkg/errors"
 	"github.com/NVIDIA/aicr/pkg/k8s"
 	"github.com/NVIDIA/aicr/validators"
-	"github.com/NVIDIA/aicr/validators/helper"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
@@ -40,7 +39,6 @@ const (
 	gangTestNamespace = "gang-scheduling-test"
 	gangTestPrefix    = "gang-test-"
 	gangPodPrefix     = "gang-worker-"
-	gangClaimPrefix   = "gang-gpu-claim-"
 	gangGroupPrefix   = "gang-group-"
 	gangMinMembers    = 2
 )
@@ -60,12 +58,15 @@ var podGroupGVR = schema.GroupVersionResource{
 	Group: "scheduling.run.ai", Version: "v2alpha2", Resource: "podgroups",
 }
 
+// Gang scheduling scope: this check validates KAI PodGroup co-scheduling only.
+// GPU access and DRA allocation are covered by the DRA support and secure
+// accelerator access checks so full conformance can run on one H100.
+
 // gangTestRun holds per-invocation resource names to avoid collisions.
 type gangTestRun struct {
 	suffix    string
 	groupName string
 	pods      [gangMinMembers]string
-	claims    [gangMinMembers]string
 }
 
 type gangSchedulingReport struct {
@@ -86,15 +87,16 @@ func newGangTestRun() (*gangTestRun, error) {
 	}
 	for i := range gangMinMembers {
 		run.pods[i] = fmt.Sprintf("%s%s-%d", gangPodPrefix, suffix, i)
-		run.claims[i] = fmt.Sprintf("%s%s-%d", gangClaimPrefix, suffix, i)
 	}
 	return run, nil
 }
 
 // CheckGangScheduling validates CNCF requirement #7: Gang Scheduling.
 // Verifies KAI scheduler deployments are running, required CRDs exist, and
-// exercises gang scheduling by creating a PodGroup with 2 GPU pods that must
-// be co-scheduled via the KAI scheduler.
+// exercises gang scheduling by creating a PodGroup with 2 CPU-only pods that
+// must be co-scheduled via the KAI scheduler. GPU access and DRA isolation are
+// validated separately by the DRA and secure accelerator access checks; keeping
+// this workload CPU-only lets one-GPU CI clusters run the full conformance phase.
 func CheckGangScheduling(ctx *validators.Context) error {
 	if ctx.Clientset == nil {
 		return errors.New(errors.ErrCodeInvalidRequest, "kubernetes client is not available")
@@ -162,20 +164,7 @@ func CheckGangScheduling(ctx *validators.Context) error {
 		"kubectl get crd queues.scheduling.run.ai podgroups.scheduling.run.ai",
 		crdSummary.String())
 
-	// 3. Pre-flight: ensure enough free GPUs for the gang test.
-	total, free, gpuErr := countAvailableGPUs(ctx.Ctx, dynClient)
-	if gpuErr != nil {
-		return gpuErr
-	}
-	recordArtifact(ctx, "GPU Availability",
-		fmt.Sprintf("Total GPUs: %d\nFree GPUs:  %d\nRequired:   %d", total, free, gangMinMembers))
-	if free < gangMinMembers {
-		return errors.New(errors.ErrCodeUnavailable,
-			fmt.Sprintf("insufficient free GPUs for gang scheduling test: %d free of %d total (need %d)",
-				free, total, gangMinMembers))
-	}
-
-	// 4. Functional test: create PodGroup with 2 GPU pods, verify co-scheduling.
+	// 3. Functional test: create PodGroup with 2 CPU-only pods, verify co-scheduling.
 	run, err := newGangTestRun()
 	if err != nil {
 		return err
@@ -187,13 +176,13 @@ func CheckGangScheduling(ctx *validators.Context) error {
 		cleanupGangTestResources(cleanupCtx, ctx.Clientset, dynClient, run)
 		recordRawTextArtifact(ctx, "Delete test namespace",
 			"kubectl delete namespace gang-scheduling-test --ignore-not-found",
-			"Deleted gang test pods, claims, and PodGroup; namespace retained intentionally to avoid DRA finalizer stalls.")
+			"Deleted gang test pods and PodGroup; namespace retained intentionally to keep cleanup bounded.")
 	}()
 
 	recordRawTextArtifact(ctx, "Apply test manifest",
-		"kubectl apply -f docs/conformance/cncf/manifests/gang-scheduling-test.yaml",
-		fmt.Sprintf("Created PodGroup=%s ResourceClaims=%s,%s Pods=%s,%s in namespace=%s",
-			run.groupName, run.claims[0], run.claims[1], run.pods[0], run.pods[1], gangTestNamespace))
+		"kubectl apply generated CPU-only PodGroup test resources",
+		fmt.Sprintf("Created PodGroup=%s Pods=%s,%s in namespace=%s",
+			run.groupName, run.pods[0], run.pods[1], gangTestNamespace))
 
 	if err = deployGangTestResources(ctx.Ctx, ctx.Clientset, dynClient, run, ctx.Tolerations); err != nil {
 		return err
@@ -274,7 +263,7 @@ func collectGangTestArtifacts(ctx *validators.Context, dynClient dynamic.Interfa
 	}
 }
 
-// deployGangTestResources creates the namespace, PodGroup, ResourceClaims, and Pods.
+// deployGangTestResources creates the namespace, PodGroup, and worker Pods.
 // tolerations, when non-nil, replace the default tolerate-all policy on test pods.
 func deployGangTestResources(ctx context.Context, clientset kubernetes.Interface, dynClient dynamic.Interface, run *gangTestRun, tolerations []corev1.Toleration) error {
 	// 1. Create namespace (idempotent).
@@ -292,15 +281,8 @@ func deployGangTestResources(ctx context.Context, clientset kubernetes.Interface
 		return errors.Wrap(errors.ErrCodeInternal, "failed to create PodGroup", err)
 	}
 
-	// 3. Create ResourceClaims and Pods.
+	// 3. Create Pods.
 	for i := range gangMinMembers {
-		claim := buildGangResourceClaim(run, i)
-		if _, err := dynClient.Resource(claimGVR).Namespace(gangTestNamespace).Create(
-			ctx, claim, metav1.CreateOptions{}); err != nil {
-			return errors.Wrap(errors.ErrCodeInternal,
-				fmt.Sprintf("failed to create ResourceClaim %s", run.claims[i]), err)
-		}
-
 		pod := buildGangTestPod(run, i, tolerations)
 		if _, err := clientset.CoreV1().Pods(gangTestNamespace).Create(ctx, pod, metav1.CreateOptions{}); err != nil {
 			return errors.Wrap(errors.ErrCodeInternal,
@@ -380,10 +362,11 @@ func validateGangPatterns(pods [gangMinMembers]*corev1.Pod, run *gangTestRun) (*
 					run.pods[i], run.groupName))
 		}
 
-		// Pod must use DRA (resourceClaims, not device plugin).
-		if len(pod.Spec.ResourceClaims) == 0 {
+		// Gang scheduling is intentionally CPU-only. DRA behavior is validated
+		// separately by dra-support and secure-accelerator-access.
+		if len(pod.Spec.ResourceClaims) != 0 {
 			return nil, errors.New(errors.ErrCodeInternal,
-				fmt.Sprintf("gang test pod %s does not use DRA resourceClaims", run.pods[i]))
+				fmt.Sprintf("gang test pod %s unexpectedly uses resourceClaims", run.pods[i]))
 		}
 	}
 
@@ -445,11 +428,6 @@ func cleanupGangTestResources(ctx context.Context, clientset kubernetes.Interfac
 			return err
 		})
 	}
-	// Delete claims.
-	for i := range gangMinMembers {
-		_ = k8s.IgnoreNotFound(dynClient.Resource(claimGVR).Namespace(gangTestNamespace).Delete(
-			ctx, run.claims[i], metav1.DeleteOptions{}))
-	}
 	// Delete PodGroup.
 	_ = k8s.IgnoreNotFound(dynClient.Resource(podGroupGVR).Namespace(gangTestNamespace).Delete(
 		ctx, run.groupName, metav1.DeleteOptions{}))
@@ -473,38 +451,6 @@ func buildPodGroup(run *gangTestRun) *unstructured.Unstructured {
 	}
 }
 
-// buildGangResourceClaim returns the unstructured ResourceClaim for a gang test pod.
-// The kai.scheduler/queue label is required by KAI v0.13.0+ for DRA claims.
-func buildGangResourceClaim(run *gangTestRun, index int) *unstructured.Unstructured {
-	return &unstructured.Unstructured{
-		Object: map[string]interface{}{
-			"apiVersion": "resource.k8s.io/v1",
-			"kind":       "ResourceClaim",
-			"metadata": map[string]interface{}{
-				"name":      run.claims[index],
-				"namespace": gangTestNamespace,
-				"labels": map[string]interface{}{
-					"kai.scheduler/queue": "default-queue",
-				},
-			},
-			"spec": map[string]interface{}{
-				"devices": map[string]interface{}{
-					"requests": []interface{}{
-						map[string]interface{}{
-							"name": "gpu",
-							"exactly": map[string]interface{}{
-								"deviceClassName": "gpu.nvidia.com",
-								"allocationMode":  "ExactCount",
-								"count":           int64(1),
-							},
-						},
-					},
-				},
-			},
-		},
-	}
-}
-
 // buildGangTestPod returns the Pod spec for a gang scheduling test worker.
 // tolerations, when non-nil, replace the default tolerate-all policy.
 func buildGangTestPod(run *gangTestRun, index int, tolerations []corev1.Toleration) *corev1.Pod {
@@ -524,22 +470,11 @@ func buildGangTestPod(run *gangTestRun, index int, tolerations []corev1.Tolerati
 			SchedulerName: "kai-scheduler",
 			RestartPolicy: corev1.RestartPolicyNever,
 			Tolerations:   tolerations,
-			ResourceClaims: []corev1.PodResourceClaim{
-				{
-					Name:              "gpu",
-					ResourceClaimName: helper.StrPtr(run.claims[index]),
-				},
-			},
 			Containers: []corev1.Container{
 				{
 					Name:    "worker",
-					Image:   "nvidia/cuda:12.9.0-base-ubuntu24.04",
-					Command: []string{"bash", "-c", fmt.Sprintf("nvidia-smi && echo 'Gang worker %d completed successfully'", index)},
-					Resources: corev1.ResourceRequirements{
-						Claims: []corev1.ResourceClaim{
-							{Name: "gpu"},
-						},
-					},
+					Image:   defaults.ProbeImage,
+					Command: []string{"sh", "-c", fmt.Sprintf("echo 'Gang worker %d completed successfully'", index)},
 				},
 			},
 		},
diff --git a/validators/conformance/helpers.go b/validators/conformance/helpers.go
index 22ccc8452..5b78d849c 100644
--- a/validators/conformance/helpers.go
+++ b/validators/conformance/helpers.go
@@ -292,51 +292,3 @@ func waitForDeletion(ctx context.Context, getFunc func() error) {
 		},
 	)
 }
-
-// gpuDriverName is the DRA driver name for NVIDIA GPUs.
-const gpuDriverName = "gpu.nvidia.com"
-
-// countAvailableGPUs counts total GPU devices from ResourceSlices and subtracts
-// allocated devices from ResourceClaims to determine how many are free.
-func countAvailableGPUs(ctx context.Context, dynClient dynamic.Interface) (total, free int, err error) {
-	// Count total GPU devices from ResourceSlices.
-	// Uses package-level resourceSliceGVR defined in secure_access_check.go.
-	slices, err := dynClient.Resource(resourceSliceGVR).List(ctx, metav1.ListOptions{})
-	if err != nil {
-		return 0, 0, errors.Wrap(errors.ErrCodeInternal, "failed to list ResourceSlices", err)
-	}
-	for _, slice := range slices.Items {
-		driver, _, _ := unstructured.NestedString(slice.Object, "spec", "driver")
-		if driver != gpuDriverName {
-			continue
-		}
-		devices, found, _ := unstructured.NestedSlice(slice.Object, "spec", "devices")
-		if found {
-			total += len(devices)
-		}
-	}
-
-	// Count allocated GPU devices from ResourceClaims.
-	var allocated int
-	claims, err := dynClient.Resource(claimGVR).List(ctx, metav1.ListOptions{})
-	if err != nil {
-		return 0, 0, errors.Wrap(errors.ErrCodeInternal, "failed to list ResourceClaims", err)
-	}
-	for _, claim := range claims.Items {
-		results, found, _ := unstructured.NestedSlice(claim.Object, "status", "allocation", "devices", "results")
-		if !found {
-			continue
-		}
-		for _, r := range results {
-			result, ok := r.(map[string]interface{})
-			if !ok {
-				continue
-			}
-			if result["driver"] == gpuDriverName {
-				allocated++
-			}
-		}
-	}
-
-	return total, total - allocated, nil
-}
diff --git a/validators/conformance/helpers_test.go b/validators/conformance/helpers_test.go
index 1097ccc8b..5b2566b67 100644
--- a/validators/conformance/helpers_test.go
+++ b/validators/conformance/helpers_test.go
@@ -18,6 +18,7 @@ import (
 	"strings"
 	"testing"
 
+	"github.com/NVIDIA/aicr/pkg/defaults"
 	corev1 "k8s.io/api/core/v1"
 )
 
@@ -397,12 +398,6 @@ func TestNewGangTestRun(t *testing.T) {
 		if !strings.HasPrefix(run.pods[i], gangPodPrefix) {
 			t.Errorf("newGangTestRun() pods[%d] = %q, want prefix %q", i, run.pods[i], gangPodPrefix)
 		}
-		if run.claims[i] == "" {
-			t.Errorf("newGangTestRun() claims[%d] is empty", i)
-		}
-		if !strings.HasPrefix(run.claims[i], gangClaimPrefix) {
-			t.Errorf("newGangTestRun() claims[%d] = %q, want prefix %q", i, run.claims[i], gangClaimPrefix)
-		}
 	}
 
 	// Two calls should produce different suffixes.
@@ -414,3 +409,31 @@ func TestNewGangTestRun(t *testing.T) {
 		t.Error("newGangTestRun() two calls produced identical suffixes")
 	}
 }
+
+func TestBuildGangTestPodUsesCPUOnlyWorkload(t *testing.T) {
+	run, err := newGangTestRun()
+	if err != nil {
+		t.Fatalf("newGangTestRun() error = %v", err)
+	}
+
+	pod := buildGangTestPod(run, 0, nil)
+	if pod.Spec.SchedulerName != "kai-scheduler" {
+		t.Errorf("SchedulerName = %q, want kai-scheduler", pod.Spec.SchedulerName)
+	}
+	if got := pod.Labels["pod-group.scheduling.run.ai/name"]; got != run.groupName {
+		t.Errorf("pod group label = %q, want %q", got, run.groupName)
+	}
+	if len(pod.Spec.ResourceClaims) != 0 {
+		t.Fatalf("ResourceClaims length = %d, want 0", len(pod.Spec.ResourceClaims))
+	}
+	if len(pod.Spec.Containers) != 1 {
+		t.Fatalf("containers length = %d, want 1", len(pod.Spec.Containers))
+	}
+	container := pod.Spec.Containers[0]
+	if container.Image != defaults.ProbeImage {
+		t.Errorf("container image = %q, want %q", container.Image, defaults.ProbeImage)
+	}
+	if len(container.Resources.Claims) != 0 {
+		t.Errorf("container resource claims length = %d, want 0", len(container.Resources.Claims))
+	}
+}
diff --git a/validators/conformance/testdata/README.md b/validators/conformance/testdata/README.md
new file mode 100644
index 000000000..efd9e8043
--- /dev/null
+++ b/validators/conformance/testdata/README.md
@@ -0,0 +1,6 @@
+# Conformance Validator Test Data
+
+The conformance validator does not currently require static fixtures, but the
+CI image build copies each validator phase's `testdata` directory into the
+image. Git does not track empty directories, so this file keeps the directory
+present; the build then fails only when a phase's fixture directory is missing.