diff --git a/.github/actions/README.md b/.github/actions/README.md index cef2fd6ca..15710df7d 100644 --- a/.github/actions/README.md +++ b/.github/actions/README.md @@ -4,6 +4,13 @@ This directory contains a modular, reusable GitHub Actions architecture optimize ## Composite Actions +### Script Conventions + +Composite action helper scripts in this directory are intentionally portable +across checkout modes: keep them mode `0644` and invoke them as +`bash path/to/script.sh` from workflows or `action.yml` files. Do not rely on +executable bits or `./script.sh` invocation. + ### Core CI/CD Actions #### `security-scan/` @@ -50,7 +57,8 @@ This action runs `tools/setup-tools --skip-go --skip-docker` in auto mode, which **When to use**: When you need version values in workflow steps **Outputs**: - `go`, `goreleaser`, `ko`, `crane`, `golangci_lint`, `yamllint`, `addlicense` -- `grype`, `kubectl`, `kind`, `ctlptl`, `tilt`, `helm` +- `grype`, `kubectl`, `kind`, `nvkind`, `ctlptl`, `tilt`, `helm` +- `kind_node_image`, `h100_kind_node_image` **Example**: ```yaml diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml index 7a973ae21..6bbd6a0ab 100644 --- a/.github/actions/aicr-build/action.yml +++ b/.github/actions/aicr-build/action.yml @@ -13,9 +13,17 @@ # limitations under the License. name: 'AICR Build' -description: 'Builds the aicr validator image (via Dockerfile) and CLI binary, and loads the image into kind.' +description: 'Builds the aicr CLI and optional snapshot/validator images, and loads requested images into kind.' inputs: + build_cli: + description: 'Build and stage the standalone aicr CLI binary at the repository root' + required: false + default: 'true' + build_snapshot_agent: + description: 'Build the CUDA-based snapshot agent image and load it into kind' + required: false + default: 'true' build_validators: description: 'Deprecated: use validator_phases instead. Ignored when validator_phases is set.' 
required: false @@ -28,86 +36,34 @@ inputs: runs: using: 'composite' steps: - - - name: Install ko + - name: Build standalone aicr CLI binary + if: inputs.build_cli == 'true' shell: bash - run: | - KO_VERSION=$(yq eval '.build_tools.ko' .settings.yaml) - GOFLAGS= go install "github.com/google/ko@${KO_VERSION}" + env: + GOFLAGS: -mod=vendor + run: bash "${{ github.action_path }}/build-cli.sh" - - name: Build snapshot agent image and load into kind + - name: Build snapshot agent CLI binary + if: inputs.build_cli != 'true' && inputs.build_snapshot_agent == 'true' shell: bash env: GOFLAGS: -mod=vendor - run: | - # Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection). - # Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed. - # GPU test workflows use --image=ko.local:smoke-test for aicr snapshot. - CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr - docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE' - FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04 - COPY dist/aicr /usr/local/bin/aicr - ENTRYPOINT ["/usr/local/bin/aicr"] - DOCKERFILE + run: bash "${{ github.action_path }}/build-cli.sh" - # Load onto all nodes. The snapshot agent requests nvidia.com/gpu but - # does not set a node selector, so it can land on any GPU-capable node - # including the control-plane (e.g., T4 smoke test). - # - # Timeout is intentionally generous (900s per attempt). H100 self-hosted - # runners transfer images over a shared Docker-in-Docker bridge; large - # CUDA base images (~250MB compressed) combined with I/O contention from - # parallel GPU operator pods regularly exceed the previous 600s limit. - timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || { - echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..." 
- timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" - } + - name: Build snapshot agent image and load into kind + if: inputs.build_snapshot_agent == 'true' + shell: bash + run: bash "${{ github.action_path }}/build-snapshot-agent.sh" - name: Build validator images and load into kind if: "!(inputs.validator_phases == 'none' || (inputs.validator_phases == '' && inputs.build_validators == 'false'))" shell: bash env: GOFLAGS: -mod=vendor - run: | - # Determine which validator phases to build. - # validator_phases takes precedence; build_validators is a deprecated fallback. - if [[ -n "${{ inputs.validator_phases }}" ]]; then - if [[ "${{ inputs.validator_phases }}" == "none" ]]; then - echo "Skipping validator builds (validator_phases=none)" - exit 0 - fi - PHASES="${{ inputs.validator_phases }}" - else - # Default: build all phases (backwards compatible) - PHASES="deployment,performance,conformance" - fi - - # Compile only the requested validator binaries. - mkdir -p dist/validator - for phase in ${PHASES//,/ }; do - echo "Building validator binary: ${phase}" - CGO_ENABLED=0 go build -trimpath -o "dist/validator/${phase}" "./validators/${phase}" - done - - for phase in ${PHASES//,/ }; do - mkdir -p "validators/${phase}/testdata" - docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . </dev/null 2>&1; then + echo "::error::yq is required to read testing.snapshot_agent_cuda_image from .settings.yaml" + exit 1 +fi + +SNAPSHOT_AGENT_CUDA_IMAGE="$(yq eval '.testing.snapshot_agent_cuda_image // ""' .settings.yaml)" +if [[ -z "${SNAPSHOT_AGENT_CUDA_IMAGE}" || "${SNAPSHOT_AGENT_CUDA_IMAGE}" == "null" ]]; then + echo "::error::testing.snapshot_agent_cuda_image must be set in .settings.yaml" + exit 1 +fi + +if [[ ! 
-f dist/aicr ]]; then + echo "::error::dist/aicr not found; build the AICR CLI before building the snapshot agent image" + exit 1 +fi + +# Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection). +# Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) because only nvidia-smi is needed. +timeout 900s docker build \ + --build-arg SNAPSHOT_AGENT_CUDA_IMAGE="${SNAPSHOT_AGENT_CUDA_IMAGE}" \ + -t ko.local:smoke-test -f - . <<'DOCKERFILE' +ARG SNAPSHOT_AGENT_CUDA_IMAGE +FROM ${SNAPSHOT_AGENT_CUDA_IMAGE} +COPY dist/aicr /usr/local/bin/aicr +ENTRYPOINT ["/usr/local/bin/aicr"] +DOCKERFILE + +# Load onto all nodes. The snapshot agent requests nvidia.com/gpu but does not +# set a node selector, so it can land on any GPU-capable node including the +# control-plane in the L40G smoke test. +timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || { + echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..." + timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" +} diff --git a/.github/actions/aicr-build/build-validator-images.sh b/.github/actions/aicr-build/build-validator-images.sh new file mode 100644 index 000000000..e308fba4e --- /dev/null +++ b/.github/actions/aicr-build/build-validator-images.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +VALIDATOR_PHASES="${VALIDATOR_PHASES:-}" +if [[ -n "${VALIDATOR_PHASES}" ]]; then + if [[ "${VALIDATOR_PHASES}" == "none" ]]; then + echo "Skipping validator builds (validator_phases=none)" + exit 0 + fi + PHASES="${VALIDATOR_PHASES}" +else + # Default: build all phases (backwards compatible). + PHASES="deployment,performance,conformance" +fi + +: "${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}" + +mkdir -p dist/validator +for phase in ${PHASES//,/ }; do + if ! [[ "${phase}" =~ ^[a-z][a-z0-9_-]*$ ]]; then + echo "::error::invalid validator phase '${phase}'; expected ^[a-z][a-z0-9_-]*$" + exit 1 + fi + echo "Building validator binary: ${phase}" + CGO_ENABLED=0 go build -trimpath -o "dist/validator/${phase}" "./validators/${phase}" +done + +for phase in ${PHASES//,/ }; do + if [[ ! -d "validators/${phase}/testdata" ]]; then + echo "::error::validators/${phase}/testdata is missing" + exit 1 + fi + docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . <&2 + exit 1 + ;; + esac +} + +MAX_RESTARTS="${MAX_RESTARTS:-}" +MAX_RESTARTS="${MAX_RESTARTS#"${MAX_RESTARTS%%[![:space:]]*}"}" +MAX_RESTARTS="${MAX_RESTARTS%"${MAX_RESTARTS##*[![:space:]]}"}" +MAX_RESTARTS_LIMIT="" +if [[ -n "${MAX_RESTARTS}" ]]; then + if ! 
[[ "${MAX_RESTARTS}" =~ ^[0-9]+$ ]]; then + echo "::error::max_restarts must be a non-negative integer, got '${MAX_RESTARTS}'" + exit 1 + fi + MAX_RESTARTS_LIMIT="$((10#${MAX_RESTARTS}))" +fi + +WAIT_TIMEOUT="${WAIT_TIMEOUT#"${WAIT_TIMEOUT%%[![:space:]]*}"}" +WAIT_TIMEOUT="${WAIT_TIMEOUT%"${WAIT_TIMEOUT##*[![:space:]]}"}" +validate_duration_input wait_timeout "${WAIT_TIMEOUT}" + +STABILITY_WINDOW="${STABILITY_WINDOW#"${STABILITY_WINDOW%%[![:space:]]*}"}" +STABILITY_WINDOW="${STABILITY_WINDOW%"${STABILITY_WINDOW##*[![:space:]]}"}" +if [[ -z "${STABILITY_WINDOW}" ]]; then + STABILITY_WINDOW="0s" +fi +validate_duration_input stability_window "${STABILITY_WINDOW}" +if [[ "${STABILITY_WINDOW}" =~ ^0+[smh]$ ]]; then + STABILITY_WINDOW="0s" +fi +STABILITY_WINDOW_SECONDS="$(duration_seconds "${STABILITY_WINDOW}")" +if [[ -n "${MAX_RESTARTS_LIMIT}" ]] && [[ "${STABILITY_WINDOW}" != "0s" ]] && (( MAX_RESTARTS_LIMIT != 1 )); then + echo "::warning::max_restarts is diagnostic context when stability_window is non-zero; new restarts during the stability window remain the hard failure gate" +fi + +STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL:-10s}" +STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL#"${STABILITY_PROBE_INTERVAL%%[![:space:]]*}"}" +STABILITY_PROBE_INTERVAL="${STABILITY_PROBE_INTERVAL%"${STABILITY_PROBE_INTERVAL##*[![:space:]]}"}" +validate_duration_input stability_probe_interval "${STABILITY_PROBE_INTERVAL}" +STABILITY_PROBE_INTERVAL_SECONDS="$(duration_seconds "${STABILITY_PROBE_INTERVAL}")" +if (( STABILITY_PROBE_INTERVAL_SECONDS <= 0 )); then + echo "::error::stability_probe_interval must be greater than 0, got '${STABILITY_PROBE_INTERVAL}'" + exit 1 +fi +STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD:-2}" +STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD#"${STABILITY_PROBE_FAILURE_THRESHOLD%%[![:space:]]*}"}" 
+STABILITY_PROBE_FAILURE_THRESHOLD="${STABILITY_PROBE_FAILURE_THRESHOLD%"${STABILITY_PROBE_FAILURE_THRESHOLD##*[![:space:]]}"}" +if ! [[ "${STABILITY_PROBE_FAILURE_THRESHOLD}" =~ ^[0-9]+$ ]]; then + echo "::error::stability_probe_failure_threshold must be a positive integer, got '${STABILITY_PROBE_FAILURE_THRESHOLD}'" + exit 1 +fi +if (( STABILITY_PROBE_FAILURE_THRESHOLD <= 0 )); then + echo "::error::stability_probe_failure_threshold must be greater than 0, got '${STABILITY_PROBE_FAILURE_THRESHOLD}'" + exit 1 +fi + +LEASE_COMPONENTS="${LEASE_COMPONENTS:-kube-controller-manager kube-scheduler}" +LEASE_COMPONENTS="${LEASE_COMPONENTS#"${LEASE_COMPONENTS%%[![:space:]]*}"}" +LEASE_COMPONENTS="${LEASE_COMPONENTS%"${LEASE_COMPONENTS##*[![:space:]]}"}" + +LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT:-120s}" +LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT#"${LEASE_STALE_TIMEOUT%%[![:space:]]*}"}" +LEASE_STALE_TIMEOUT="${LEASE_STALE_TIMEOUT%"${LEASE_STALE_TIMEOUT##*[![:space:]]}"}" +validate_duration_input lease_stale_timeout "${LEASE_STALE_TIMEOUT}" +LEASE_STALE_TIMEOUT_SECONDS="$(duration_seconds "${LEASE_STALE_TIMEOUT}")" +if (( LEASE_STALE_TIMEOUT_SECONDS <= 0 )); then + echo "::error::lease_stale_timeout must be greater than 0, got '${LEASE_STALE_TIMEOUT}'" + exit 1 +fi + +RUNTIME_DIAGNOSTICS="${RUNTIME_DIAGNOSTICS:-false}" +RUNTIME_DIAGNOSTICS="${RUNTIME_DIAGNOSTICS#"${RUNTIME_DIAGNOSTICS%%[![:space:]]*}"}" +RUNTIME_DIAGNOSTICS="${RUNTIME_DIAGNOSTICS%"${RUNTIME_DIAGNOSTICS##*[![:space:]]}"}" +case "${RUNTIME_DIAGNOSTICS}" in + true|false) ;; + *) + echo "::error::runtime_diagnostics must be true or false, got '${RUNTIME_DIAGNOSTICS}'" + exit 1 + ;; +esac + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +docker_timeout() { + timeout 30s docker "$@" +} + +RESTART_COUNT_ATTEMPTS=3 +RESTART_COUNT_RETRY_SLEEP_SECONDS=2 +declare -A INITIAL_RESTARTS=() + +kubectl_kind get --raw='/readyz' || true + +wait_ready() { + 
local component="$1" + local selector="component=${component}" + + if ! timeout "${WAIT_TIMEOUT}" kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" -n "${NAMESPACE}" \ + wait --for=condition=Ready pod -l "${selector}" --timeout="${WAIT_TIMEOUT}"; then + return 1 + fi +} + +restart_total() { + local component="$1" + local selector="component=${component}" + local restart_counts + local restart_count + local total=0 + local attempt + + for ((attempt = 1; attempt <= RESTART_COUNT_ATTEMPTS; attempt++)); do + if restart_counts=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" \ + -o jsonpath='{range .items[*]}{range .status.containerStatuses[*]}{.restartCount}{"\n"}{end}{end}'); then + if [[ -n "${restart_counts}" ]]; then + break + fi + echo "::warning::no container statuses found for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2 + else + echo "::warning::failed to read restart counts for ${component} pods (attempt ${attempt}/${RESTART_COUNT_ATTEMPTS})" >&2 + fi + + if (( attempt < RESTART_COUNT_ATTEMPTS )); then + sleep "${RESTART_COUNT_RETRY_SLEEP_SECONDS}" + fi + done + + if [[ -z "${restart_counts}" ]]; then + echo "::error::no container statuses found for ${component} pods after ${RESTART_COUNT_ATTEMPTS} attempts" >&2 + dump_component_diagnostics "${component}" >&2 + exit 1 + fi + + while IFS= read -r restart_count; do + [[ -z "${restart_count}" ]] && continue + total=$((total + restart_count)) + done <<< "${restart_counts}" + echo "${total}" +} + +report_restart_baseline() { + local component="$1" + local restart_count="$2" + + if (( restart_count > 0 )); then + if [[ "${STABILITY_WINDOW}" == "0s" ]] && [[ -n "${MAX_RESTARTS_LIMIT}" ]]; then + echo "::warning::${component} has historical restartCount=${restart_count}; max_restarts=${MAX_RESTARTS_LIMIT} will be enforced because stability_window=0s" + else + echo "::warning::${component} has historical restartCount=${restart_count}; checking current readiness and 
stability window only" + fi + return + fi + echo "${component} restartCount=${restart_count}" +} + +dump_control_plane_summary() { + echo "=== Control-plane pod restart summary ===" + kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane -o wide || true + kubectl_kind -n "${NAMESPACE}" get pods -l tier=control-plane \ + -o jsonpath='{range .items[*]}{.metadata.name}{" restartCount="}{range .status.containerStatuses[*]}{.restartCount}{" "}{end}{"\n"}{end}' || true +} + +require_readyz() { + local reason="$1" + + if ! kubectl_kind get --raw='/readyz'; then + echo "::error::kube-apiserver /readyz failed ${reason}" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi +} + +probe_control_plane_api() { + local reason="$1" + local component + local lease_summary + + if ! kubectl_kind get --raw='/readyz' >/dev/null; then + echo "::error::kube-apiserver /readyz probe failed ${reason}" + return 1 + fi + + for component in ${LEASE_COMPONENTS}; do + if ! lease_summary=$(kubectl_kind -n "${NAMESPACE}" get lease "${component}" \ + -o jsonpath='{.metadata.name}{" holder="}{.spec.holderIdentity}{" renewTime="}{.spec.renewTime}{"\n"}' 2>/dev/null); then + echo "::error::failed to read leader election lease ${component} ${reason}" + return 1 + fi + echo "${lease_summary}" + done +} + +lease_renew_epoch() { + local renew_time="$1" + + date -u -d "${renew_time}" +%s 2>/dev/null +} + +verify_leader_lease_freshness() { + local component + local now_epoch + local renew_time + local renew_epoch + local lease_age + + [[ -z "${LEASE_COMPONENTS}" ]] && return + + now_epoch="$(date -u +%s)" + echo "Checking leader election lease freshness (max age ${LEASE_STALE_TIMEOUT})..." + for component in ${LEASE_COMPONENTS}; do + if ! 
renew_time=$(kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o jsonpath='{.spec.renewTime}' 2>/dev/null); then + echo "::error::failed to read leader election lease ${component}" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + if [[ -z "${renew_time}" ]]; then + echo "::error::leader election lease ${component} has empty spec.renewTime" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + if ! renew_epoch="$(lease_renew_epoch "${renew_time}")"; then + echo "::error::failed to parse leader election lease ${component} renewTime '${renew_time}'" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + lease_age=$((now_epoch - renew_epoch)) + if (( lease_age < 0 )); then + lease_age=0 + fi + echo "${component} lease renewTime=${renew_time} age=${lease_age}s" + if (( lease_age > LEASE_STALE_TIMEOUT_SECONDS )); then + echo "::error::leader election lease ${component} is stale: age=${lease_age}s exceeds ${LEASE_STALE_TIMEOUT}" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + done +} + +observe_stability_window() { + local label="$1" + local elapsed=0 + local probe=0 + local sleep_seconds + local consecutive_failures=0 + local total_failures=0 + + echo "Observing control-plane stability for ${STABILITY_WINDOW} (${label}); probing every ${STABILITY_PROBE_INTERVAL}, failing after ${STABILITY_PROBE_FAILURE_THRESHOLD} consecutive probe failure(s)..." 
+ while (( elapsed < STABILITY_WINDOW_SECONDS )); do + sleep_seconds="${STABILITY_PROBE_INTERVAL_SECONDS}" + if (( elapsed + sleep_seconds > STABILITY_WINDOW_SECONDS )); then + sleep_seconds=$((STABILITY_WINDOW_SECONDS - elapsed)) + fi + if (( sleep_seconds > 0 )); then + sleep "${sleep_seconds}" + elapsed=$((elapsed + sleep_seconds)) + fi + + probe=$((probe + 1)) + echo "=== Control-plane stability probe ${probe} (${elapsed}/${STABILITY_WINDOW_SECONDS}s, ${label}) ===" + if probe_control_plane_api "during ${label} stability probe ${probe}"; then + consecutive_failures=0 + continue + fi + + total_failures=$((total_failures + 1)) + consecutive_failures=$((consecutive_failures + 1)) + echo "::warning::control-plane stability probe ${probe} failed (${consecutive_failures} consecutive, ${total_failures} total)" + if (( consecutive_failures >= STABILITY_PROBE_FAILURE_THRESHOLD )); then + echo "::error::control-plane had ${consecutive_failures} consecutive failed stability probes during ${label}" + dump_all_control_plane_runtime_diagnostics + exit 1 + fi + done + + if (( total_failures > 0 )); then + echo "::warning::control-plane had ${total_failures} transient failed stability probe(s) during ${label}; final health checks must still pass" + fi + verify_leader_lease_freshness +} + +dump_api_server_health() { + local endpoint + + for endpoint in '/livez?verbose' '/readyz?verbose' '/healthz'; do + echo "=== kube-apiserver ${endpoint} ===" + kubectl_kind get --raw="${endpoint}" || true + done +} + +dump_kind_node_runtime_summary() { + local node="${KIND_CLUSTER_NAME}-control-plane" + + if ! 
docker_timeout inspect "${node}" >/dev/null 2>&1; then + echo "::warning::cannot collect node runtime summary: kind node container ${node} not found" + return + fi + + echo "=== ${node} docker stats ===" + docker_timeout stats --no-stream \ + --format 'table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}' \ + "${node}" || true + + echo "=== ${node} docker inspect state ===" + docker_timeout inspect \ + --format 'status={{.State.Status}} running={{.State.Running}} oomKilled={{.State.OOMKilled}} pid={{.State.Pid}} started={{.State.StartedAt}} finished={{.State.FinishedAt}}' \ + "${node}" || true + + echo "=== ${node} node pressure snapshot ===" + docker_timeout exec "${node}" sh -c ' + date + uptime || true + free -h || true + df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h + echo "--- top cpu/memory processes ---" + ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true + ' || true + + echo "=== ${node} CRI pod/container summary ===" + docker_timeout exec "${node}" crictl pods || true + docker_timeout exec "${node}" crictl ps -a || true + docker_timeout exec "${node}" crictl stats || true +} + +dump_static_pod_runtime_diagnostics() { + local component="$1" + local node="${KIND_CLUSTER_NAME}-control-plane" + local container_ids + local container_id + local count=0 + + if ! 
docker_timeout inspect "${node}" >/dev/null 2>&1; then + echo "::warning::cannot collect ${component} runtime diagnostics: kind node container ${node} not found" + return + fi + + echo "=== ${node} ${component} static pod manifest ===" + docker_timeout exec "${node}" sh -c "sed -n '1,220p' /etc/kubernetes/manifests/${component}.yaml" || true + + echo "=== ${node} ${component} CRI containers ===" + docker_timeout exec "${node}" crictl ps -a --name "${component}" || true + + container_ids=$(docker_timeout exec "${node}" crictl ps -a --name "${component}" -q 2>/dev/null || true) + for container_id in ${container_ids}; do + count=$((count + 1)) + if (( count > 8 )); then + echo "Skipping remaining ${component} CRI containers after first 8 entries." + break + fi + + echo "=== ${node} crictl inspect ${component} ${container_id} ===" + docker_timeout exec "${node}" crictl inspect "${container_id}" || true + echo "=== ${node} crictl logs ${component} ${container_id} ===" + docker_timeout exec "${node}" crictl logs --tail=200 "${container_id}" || true + done + + echo "=== ${node} kubelet journal (${component}) ===" + docker_timeout exec "${node}" journalctl -u kubelet --since '45 minutes ago' --no-pager 2>/dev/null \ + | grep -Ei "${component}|static pod|mirror pod|probe|liveness|readiness|startup|back-off|backoff|container|failed|error|oom|killed" \ + | tail -200 || true + + echo "=== ${node} containerd journal (${component}) ===" + docker_timeout exec "${node}" journalctl -u containerd --since '45 minutes ago' --no-pager 2>/dev/null \ + | grep -Ei "${component}|container|task|shim|deadline|failed|error|oom|killed" \ + | tail -200 || true +} + +dump_all_control_plane_runtime_diagnostics() { + local component + + dump_control_plane_summary + dump_api_server_health + if [[ "${RUNTIME_DIAGNOSTICS}" != "true" ]]; then + echo "Skipping kind node runtime diagnostics. Set runtime_diagnostics=true to collect docker stats, crictl, and journalctl on failure." 
+ return + fi + dump_kind_node_runtime_summary + for component in ${COMPONENTS}; do + dump_static_pod_runtime_diagnostics "${component}" + kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o yaml 2>/dev/null || true + done +} + +dump_component_diagnostics() { + local component="$1" + local selector="component=${component}" + local pods + local pod + + dump_control_plane_summary + kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o wide || true + kubectl_kind -n "${NAMESPACE}" describe pod -l "${selector}" || true + kubectl_kind -n "${NAMESPACE}" get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true + + pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name 2>/dev/null || true) + while IFS= read -r pod; do + [[ -z "${pod}" ]] && continue + echo "=== ${pod} logs ===" + kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --tail=100 2>/dev/null || true + echo "=== ${pod} previous logs ===" + kubectl_kind -n "${NAMESPACE}" logs "${pod}" --all-containers --previous --tail=100 2>/dev/null || true + done <<< "${pods}" + + dump_all_control_plane_runtime_diagnostics + kubectl_kind -n "${NAMESPACE}" get lease "${component}" -o yaml 2>/dev/null || true +} + +check_component() { + local component="$1" + local selector="component=${component}" + local pods + local initial_restarts + + if ! pods=$(kubectl_kind -n "${NAMESPACE}" get pod -l "${selector}" -o name); then + echo "::error::failed to list ${component} pods in ${NAMESPACE} with selector ${selector}" + kubectl_kind -n "${NAMESPACE}" get pods -o wide || true + exit 1 + fi + if [[ -z "${pods}" ]]; then + echo "::error::no ${component} pods found in ${NAMESPACE} with selector ${selector}" + kubectl_kind -n "${NAMESPACE}" get pods -o wide || true + exit 1 + fi + + if ! 
wait_ready "${component}"; then + echo "::error::${component} pods did not become Ready within ${WAIT_TIMEOUT}" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + initial_restarts=$(restart_total "${component}") + report_restart_baseline "${component}" "${initial_restarts}" + INITIAL_RESTARTS["${component}"]="${initial_restarts}" +} + +verify_stability_window() { + local component + local initial_restarts + local final_restarts + + if [[ "${STABILITY_WINDOW}" == "0s" ]]; then + if [[ -n "${MAX_RESTARTS_LIMIT}" ]]; then + for component in ${COMPONENTS}; do + final_restarts="${INITIAL_RESTARTS[${component}]:-0}" + if (( final_restarts > MAX_RESTARTS_LIMIT )); then + echo "::error::${component} restartCount=${final_restarts} exceeds max_restarts=${MAX_RESTARTS_LIMIT}" + dump_component_diagnostics "${component}" + exit 1 + fi + done + fi + verify_leader_lease_freshness + return + fi + + observe_stability_window "primary" + for component in ${COMPONENTS}; do + initial_restarts="${INITIAL_RESTARTS[${component}]:-}" + if [[ -z "${initial_restarts}" ]]; then + echo "::error::missing initial restart count for ${component}" + exit 1 + fi + if ! 
wait_ready "${component}"; then + echo "::error::${component} pods became unready during ${STABILITY_WINDOW}" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + final_restarts=$(restart_total "${component}") + if (( final_restarts > initial_restarts )); then + echo "::error::${component} restartCount increased from ${initial_restarts} to ${final_restarts} during ${STABILITY_WINDOW}" + dump_component_diagnostics "${component}" + kubectl_kind get --raw='/readyz' || true + exit 1 + fi + INITIAL_RESTARTS["${component}"]="${final_restarts}" + done +} + +for component in ${COMPONENTS}; do + check_component "${component}" +done +verify_stability_window +require_readyz "after stability window" diff --git a/.github/actions/gpu-chainsaw-health/action.yml b/.github/actions/gpu-chainsaw-health/action.yml new file mode 100644 index 000000000..2b13ef645 --- /dev/null +++ b/.github/actions/gpu-chainsaw-health/action.yml @@ -0,0 +1,51 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: 'GPU Chainsaw Health' +description: 'Run Chainsaw runtime health checks for a GPU Kind test cluster.' 
+ +inputs: + cluster_name: + description: 'Kind cluster name' + required: true + chainsaw_path: + description: 'Path to the Chainsaw health-check directory' + required: true + chainsaw_version: + description: 'Chainsaw version' + required: true + chainsaw_sha256: + description: 'Chainsaw SHA256 checksum for linux/amd64' + required: true + test_timeout: + description: 'Outer timeout for the Chainsaw test command' + required: false + default: 15m + +runs: + using: 'composite' + steps: + - name: Install chainsaw + uses: ./.github/actions/setup-build-tools + with: + install_chainsaw: 'true' + chainsaw_version: ${{ inputs.chainsaw_version }} + chainsaw_sha256: ${{ inputs.chainsaw_sha256 }} + + - name: Run chainsaw health checks + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + CHAINSAW_TEST_TIMEOUT: ${{ inputs.test_timeout }} + run: bash "${{ github.workspace }}/.github/scripts/gpu-chainsaw-health.sh" "${{ inputs.chainsaw_path }}" diff --git a/.github/actions/gpu-cluster-setup/action.yml b/.github/actions/gpu-cluster-setup/action.yml index b9bc3060f..8e81aea5a 100644 --- a/.github/actions/gpu-cluster-setup/action.yml +++ b/.github/actions/gpu-cluster-setup/action.yml @@ -15,18 +15,90 @@ name: 'GPU Cluster Setup' description: 'Creates a GPU-enabled kind cluster using nvkind with CDI-mode GPU passthrough.' 
+inputs: + kind_node_image: + description: 'Kind node image for nvkind cluster creation' + required: false + default: '' + min_gpu_count: + description: 'Minimum visible GPU count required before cluster setup' + required: true + gpu_model_pattern: + description: 'Optional grep-compatible GPU model pattern required for visible GPUs' + required: false + default: '' + min_free_disk_gb: + description: 'Minimum free disk space on / required before cluster setup' + required: false + default: '20' + min_available_memory_gb: + description: 'Minimum available system memory required before cluster setup' + required: false + default: '8' + cluster_create_timeout: + description: 'Timeout for nvkind cluster create' + required: false + default: '900s' + control_plane_resource_patches: + description: 'Apply kubeadm patches that raise control-plane static pod resource requests' + required: false + default: 'false' + control_plane_leader_election_tuning: + description: 'Increase kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes' + required: false + default: 'false' + leader_election_lease_duration: + description: 'Leader election lease duration when control_plane_leader_election_tuning is true' + required: false + default: '300s' + leader_election_renew_deadline: + description: 'Leader election renew deadline when control_plane_leader_election_tuning is true' + required: false + default: '240s' + leader_election_retry_period: + description: 'Leader election retry period when control_plane_leader_election_tuning is true' + required: false + default: '10s' + api_server_cpu_request: + description: 'kube-apiserver CPU request when control_plane_resource_patches is true' + required: false + default: '1000m' + api_server_memory_request: + description: 'kube-apiserver memory request when control_plane_resource_patches is true' + required: false + default: '1Gi' + controller_manager_cpu_request: + description: 'kube-controller-manager CPU 
request when control_plane_resource_patches is true' + required: false + default: '1000m' + controller_manager_memory_request: + description: 'kube-controller-manager memory request when control_plane_resource_patches is true' + required: false + default: '512Mi' + scheduler_cpu_request: + description: 'kube-scheduler CPU request when control_plane_resource_patches is true' + required: false + default: '500m' + scheduler_memory_request: + description: 'kube-scheduler memory request when control_plane_resource_patches is true' + required: false + default: '256Mi' + etcd_cpu_request: + description: 'etcd CPU request when control_plane_resource_patches is true' + required: false + default: '1000m' + etcd_memory_request: + description: 'etcd memory request when control_plane_resource_patches is true' + required: false + default: '1Gi' + runs: using: 'composite' steps: - name: Validate environment shell: bash - run: | - if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then - echo "::error::KIND_CLUSTER_NAME environment variable must be set by the calling workflow" - exit 1 - fi - + run: bash "${{ github.action_path }}/validate-env.sh" - name: Load versions id: versions uses: ./.github/actions/load-versions @@ -52,40 +124,61 @@ runs: - name: Install nvkind shell: bash - run: | - go install github.com/NVIDIA/nvkind/cmd/nvkind@latest - nvkind --help - - - name: Verify host GPU + env: + NVKIND_VERSION: ${{ steps.versions.outputs.nvkind }} + run: bash "${{ github.action_path }}/install-nvkind.sh" + - name: Runner preflight shell: bash - run: nvidia-smi -L - + env: + GPU_MODEL_PATTERN: ${{ inputs.gpu_model_pattern }} + MIN_GPU_COUNT: ${{ inputs.min_gpu_count }} + MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }} + MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }} + run: bash "${{ github.action_path }}/runner-preflight.sh" - name: Configure NVIDIA Container Toolkit for kind shell: bash - run: | - sudo nvidia-ctk runtime configure --runtime=docker --set-as-default 
--cdi.enabled - sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place - sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place - sudo systemctl restart docker - + run: bash "${{ github.action_path }}/configure-nvidia-container-toolkit.sh" - name: Validate Docker GPU access shell: bash - run: docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L - + run: bash "${{ github.action_path }}/validate-docker-gpu-access.sh" - name: Increase inotify limits shell: bash - run: | - sudo sysctl -w fs.inotify.max_user_watches=524288 - sudo sysctl -w fs.inotify.max_user_instances=1024 - + run: bash "${{ github.action_path }}/increase-inotify-limits.sh" + - name: Delete stale kind cluster + shell: bash + run: bash "${{ github.action_path }}/delete-stale-kind-cluster.sh" + - name: Check runner capacity + shell: bash + env: + MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }} + MIN_AVAILABLE_MEMORY_GB: ${{ inputs.min_available_memory_gb }} + run: bash "${{ github.action_path }}/check-runner-capacity.sh" + - name: Warm kind node image + if: ${{ inputs.kind_node_image != '' }} + shell: bash + env: + KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} + MIN_FREE_DISK_GB: ${{ inputs.min_free_disk_gb }} + run: bash "${{ github.action_path }}/warm-kind-node-image.sh" - name: Create GPU-enabled kind cluster shell: bash - run: | - nvkind cluster create --name="${KIND_CLUSTER_NAME}" || echo "::warning::nvkind cluster create returned non-zero (umount errors are expected with CDI mode)" - kubectl --context="kind-${KIND_CLUSTER_NAME}" wait --for=condition=Ready nodes --all --timeout=300s - kubectl --context="kind-${KIND_CLUSTER_NAME}" cluster-info - kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide - + env: + KIND_NODE_IMAGE: ${{ inputs.kind_node_image }} + CLUSTER_CREATE_TIMEOUT: ${{ inputs.cluster_create_timeout }} + CONTROL_PLANE_RESOURCE_PATCHES: ${{ 
inputs.control_plane_resource_patches }} + CONTROL_PLANE_LEADER_ELECTION_TUNING: ${{ inputs.control_plane_leader_election_tuning }} + LEADER_ELECTION_LEASE_DURATION: ${{ inputs.leader_election_lease_duration }} + LEADER_ELECTION_RENEW_DEADLINE: ${{ inputs.leader_election_renew_deadline }} + LEADER_ELECTION_RETRY_PERIOD: ${{ inputs.leader_election_retry_period }} + API_SERVER_CPU_REQUEST: ${{ inputs.api_server_cpu_request }} + API_SERVER_MEMORY_REQUEST: ${{ inputs.api_server_memory_request }} + CONTROLLER_MANAGER_CPU_REQUEST: ${{ inputs.controller_manager_cpu_request }} + CONTROLLER_MANAGER_MEMORY_REQUEST: ${{ inputs.controller_manager_memory_request }} + SCHEDULER_CPU_REQUEST: ${{ inputs.scheduler_cpu_request }} + SCHEDULER_MEMORY_REQUEST: ${{ inputs.scheduler_memory_request }} + ETCD_CPU_REQUEST: ${{ inputs.etcd_cpu_request }} + ETCD_MEMORY_REQUEST: ${{ inputs.etcd_memory_request }} + run: bash "${{ github.action_path }}/create-gpu-kind-cluster.sh" - name: Print GPUs (nvkind) shell: bash run: nvkind cluster print-gpus --name="${KIND_CLUSTER_NAME}" diff --git a/.github/actions/gpu-cluster-setup/check-runner-capacity.sh b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh new file mode 100644 index 000000000..ff6c3168e --- /dev/null +++ b/.github/actions/gpu-cluster-setup/check-runner-capacity.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+set -euo pipefail
+
+# Fails the job before cluster creation when the runner is short on disk or
+# memory.
+#
+# Required environment:
+#   MIN_FREE_DISK_GB         minimum free space on / in whole GiB
+#   MIN_AVAILABLE_MEMORY_GB  minimum available memory in whole GiB
+
+# Validate the thresholds before they reach arithmetic context: bash evaluates
+# non-numeric text inside (( )) as an expression, and an unset variable would
+# otherwise abort with an opaque "unbound variable" message under `set -u`.
+for value_name in MIN_FREE_DISK_GB MIN_AVAILABLE_MEMORY_GB; do
+  value="${!value_name:-}"
+  if ! [[ "${value}" =~ ^[0-9]+$ ]]; then
+    echo "::error::${value_name} must be an integer, got '${value}'"
+    exit 1
+  fi
+done
+
+# `tr -dc` keeps only the digits of the last df line (the header is dropped by tail).
+free_disk_bytes=$(df -B1 --output=avail / | tail -1 | tr -dc '0-9')
+if [[ -z "${free_disk_bytes}" ]]; then
+  echo "::error::could not determine free disk space on /"
+  exit 1
+fi
+# 10# forces base 10 so a zero-padded threshold such as 08 is not parsed as octal.
+min_free_disk_bytes=$((10#${MIN_FREE_DISK_GB} * 1024 * 1024 * 1024))
+free_disk_gib=$((free_disk_bytes / 1024 / 1024 / 1024))
+if (( free_disk_bytes < min_free_disk_bytes )); then
+  echo "::error::free disk on / is ${free_disk_bytes} bytes (${free_disk_gib}GiB), need at least ${min_free_disk_bytes} bytes (${MIN_FREE_DISK_GB}GiB)"
+  exit 1
+fi
+
+# Field 7 of the "Mem:" row is read as the available-memory column; `free -g`
+# reports whole GiB, so the comparison is conservative.
+available_memory_gb=$(free -g | awk '/^Mem:/ {print $7}')
+if ! [[ "${available_memory_gb}" =~ ^[0-9]+$ ]]; then
+  echo "::error::could not determine available memory from 'free -g', got '${available_memory_gb}'"
+  exit 1
+fi
+if (( available_memory_gb < 10#${MIN_AVAILABLE_MEMORY_GB} )); then
+  echo "::error::available memory is ${available_memory_gb}GiB, need at least ${MIN_AVAILABLE_MEMORY_GB}GiB"
+  exit 1
+fi
+
+echo "Runner capacity is sufficient: disk=${free_disk_gib}GiB (${free_disk_bytes} bytes) memory=${available_memory_gb}GiB"
diff --git a/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
new file mode 100644
index 000000000..84635a988
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/configure-nvidia-container-toolkit.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Configures Docker to use the NVIDIA runtime with volume-mount based device
+# selection, restarts Docker, and waits until the daemon answers again.
+
+# Succeeds only when the docker unit is active and the daemon responds within 5s.
+docker_is_healthy() {
+  systemctl is-active --quiet docker && timeout 5s docker info >/dev/null 2>&1
+}
+
+sudo nvidia-ctk runtime configure --runtime=docker --set-as-default --cdi.enabled
+sudo nvidia-ctk config --set accept-nvidia-visible-devices-as-volume-mounts=true --in-place
+sudo nvidia-ctk config --set accept-nvidia-visible-devices-envvar-when-unprivileged=false --in-place
+
+# Capture the restart status instead of letting errexit abort, so diagnostics
+# can be printed before the script exits with the same status.
+restart_status=0
+timeout 120s sudo systemctl restart docker || restart_status=$?
+if [[ "${restart_status}" -ne 0 ]]; then
+  echo "::error::Docker restart failed after NVIDIA runtime configuration"
+  sudo systemctl status docker --no-pager || true
+  sudo journalctl -u docker --since "10 minutes ago" --no-pager || true
+  exit "${restart_status}"
+fi
+
+# Poll up to 30 times, 2 seconds apart, for the daemon to come back.
+attempt=1
+while (( attempt <= 30 )); do
+  if docker_is_healthy; then
+    echo "Docker is healthy after NVIDIA runtime configuration."
+    exit 0
+  fi
+  echo "Waiting for Docker to become healthy... (${attempt}/30)"
+  sleep 2
+  attempt=$((attempt + 1))
+done
+
+echo "::error::Docker did not become healthy after NVIDIA runtime configuration"
+sudo systemctl status docker --no-pager || true
+exit 1
diff --git a/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
new file mode 100644
index 000000000..19ef485cb
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/create-gpu-kind-cluster.sh
@@ -0,0 +1,517 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +set -euo pipefail + +validate_duration_input() { + local input_name="$1" + local input_value="$2" + + if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then + echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'" + exit 1 + fi +} + +validate_cpu_quantity_input() { + local input_name="$1" + local input_value="$2" + + if ! [[ "${input_value}" =~ ^([0-9]+m|[0-9]+)$ ]]; then + echo "::error::${input_name} must be a CPU quantity like 500m, 1000m, or 1; got '${input_value}'" + exit 1 + fi +} + +validate_memory_quantity_input() { + local input_name="$1" + local input_value="$2" + + if ! [[ "${input_value}" =~ ^[0-9]+([EPTGMK]i?|[eptgmk])?$ ]]; then + echo "::error::${input_name} must be a memory quantity like 256Mi, 1Gi, or 1024; got '${input_value}'" + exit 1 + fi +} + +validate_bool_input() { + local input_name="$1" + local input_value="$2" + + case "${input_value}" in + true|false) ;; + *) + echo "::error::${input_name} must be true or false, got '${input_value}'" + exit 1 + ;; + esac +} + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +kubectl_kind_wait() { + timeout 330s kubectl --request-timeout=300s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +docker_timeout() { + local limit="$1" + shift + timeout "${limit}" docker "$@" +} + +validate_generated_control_plane_config() { + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + for patch_file in "${patch_dir}"/*.yaml; do + if ! grep -Fxq 'apiVersion: v1' "${patch_file}" || + ! grep -Fxq 'kind: Pod' "${patch_file}" || + ! grep -Eq '^[[:space:]]+resources:$' "${patch_file}"; then + echo "::error::rendered static pod patch ${patch_file} is missing expected top-level YAML" + sed 's/^/ /' "${patch_file}" || true + exit 1 + fi + done + + if ! grep -Eq '^[[:space:]]*extraMounts:$' "${config_template}" || + ! 
grep -Fq 'directory: /patches' "${config_template}"; then + echo "::error::rendered kind config is missing control-plane patch mounts" + sed 's/^/ /' "${config_template}" || true + exit 1 + fi + fi + + if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + for expected in \ + 'apiVersion: kubeadm.k8s.io/v1beta3' \ + 'apiVersion: kubeadm.k8s.io/v1beta4' \ + "leader-elect-lease-duration: \"${LEADER_ELECTION_LEASE_DURATION}\"" \ + "leader-elect-renew-deadline: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \ + "leader-elect-retry-period: \"${LEADER_ELECTION_RETRY_PERIOD}\"" \ + "value: \"${LEADER_ELECTION_LEASE_DURATION}\"" \ + "value: \"${LEADER_ELECTION_RENEW_DEADLINE}\"" \ + "value: \"${LEADER_ELECTION_RETRY_PERIOD}\""; do + if ! grep -Fq "${expected}" "${config_template}"; then + echo "::error::rendered kind config is missing expected leader election setting: ${expected}" + sed 's/^/ /' "${config_template}" || true + exit 1 + fi + done + fi +} + +validate_duration_input cluster_create_timeout "${CLUSTER_CREATE_TIMEOUT}" +validate_duration_input leader_election_lease_duration "${LEADER_ELECTION_LEASE_DURATION}" +validate_duration_input leader_election_renew_deadline "${LEADER_ELECTION_RENEW_DEADLINE}" +validate_duration_input leader_election_retry_period "${LEADER_ELECTION_RETRY_PERIOD}" + +CREATE_ARGS=(--name="${KIND_CLUSTER_NAME}") +if [[ -n "${KIND_NODE_IMAGE}" ]]; then + echo "Using kind node image: ${KIND_NODE_IMAGE}" + CREATE_ARGS+=(--image="${KIND_NODE_IMAGE}") +fi + +CONTROL_PLANE_RESOURCE_PATCHES="${CONTROL_PLANE_RESOURCE_PATCHES:-false}" +CONTROL_PLANE_LEADER_ELECTION_TUNING="${CONTROL_PLANE_LEADER_ELECTION_TUNING:-false}" +validate_bool_input control_plane_resource_patches "${CONTROL_PLANE_RESOURCE_PATCHES}" +validate_bool_input control_plane_leader_election_tuning "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" + +if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" || "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + patch_dir="$(mktemp 
-d)" + config_template="$(mktemp)" + cleanup_generated_config() { + [[ -n "${patch_dir:-}" ]] && rm -rf "${patch_dir}" + [[ -n "${config_template:-}" ]] && rm -f "${config_template}" + } + trap cleanup_generated_config EXIT + + # Keep YAML heredocs at column 0; indentation is literal content. + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + validate_cpu_quantity_input api_server_cpu_request "${API_SERVER_CPU_REQUEST}" + validate_memory_quantity_input api_server_memory_request "${API_SERVER_MEMORY_REQUEST}" + validate_cpu_quantity_input controller_manager_cpu_request "${CONTROLLER_MANAGER_CPU_REQUEST}" + validate_memory_quantity_input controller_manager_memory_request "${CONTROLLER_MANAGER_MEMORY_REQUEST}" + validate_cpu_quantity_input scheduler_cpu_request "${SCHEDULER_CPU_REQUEST}" + validate_memory_quantity_input scheduler_memory_request "${SCHEDULER_MEMORY_REQUEST}" + validate_cpu_quantity_input etcd_cpu_request "${ETCD_CPU_REQUEST}" + validate_memory_quantity_input etcd_memory_request "${ETCD_MEMORY_REQUEST}" + + cat > "${patch_dir}/kube-apiserver+strategic.yaml" < "${patch_dir}/kube-controller-manager+strategic.yaml" < "${patch_dir}/kube-scheduler+strategic.yaml" < "${patch_dir}/etcd+strategic.yaml" < "${config_template}" <<'EOF' +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +{{- if hasKey $ "name" }} +name: {{ $.name }} +{{- end }} +nodes: +- role: control-plane + {{- if hasKey $ "image" }} + image: {{ $.image }} + {{- end }} +EOF + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + cat >> "${config_template}" <> "${config_template}" <<'EOF' + kubeadmConfigPatches: +EOF + fi + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + cat >> "${config_template}" <<'EOF' + - | + kind: InitConfiguration + patches: + directory: /patches +EOF + fi + if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + # kind v0.31 renders kubeadm v1beta3. 
Keep a v1beta4 patch too so + # this remains valid when a future kind image switches API versions. + cat >> "${config_template}" <> "${config_template}" <<'EOF' +{{- range $.workers }} +- role: worker + {{- if hasKey $ "image" }} + image: {{ $.image }} + {{- end }} + + {{- if hasKey . "devices" }} + {{- $devices := .devices }} + {{- if not (kindIs "slice" $devices) }} + {{- $devices = list .devices }} + {{- end }} + extraMounts: + # We inject all NVIDIA GPUs using the nvidia-container-runtime. + # This requires `accept-nvidia-visible-devices-as-volume-mounts = true` be set + # in `/etc/nvidia-container-runtime/config.toml` + {{- range $d := $devices }} + - hostPath: /dev/null + containerPath: /var/run/nvidia-container-devices/{{ $d }} + {{- end }} + {{- end }} +{{- end }} +EOF + if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then + echo "Applying control-plane static pod resource patches from ${patch_dir}:" + for patch_file in "${patch_dir}"/*.yaml; do + echo "--- ${patch_file}" + sed 's/^/ /' "${patch_file}" + done + fi + if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then + echo "Increasing kube-controller-manager and kube-scheduler leader election timeouts for slow CI control planes:" + echo " lease-duration=${LEADER_ELECTION_LEASE_DURATION}" + echo " renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}" + echo " retry-period=${LEADER_ELECTION_RETRY_PERIOD}" + fi + validate_generated_control_plane_config + CREATE_ARGS+=(--config-template="${config_template}") +fi + +set +e +timeout "${CLUSTER_CREATE_TIMEOUT}" nvkind cluster create "${CREATE_ARGS[@]}" +create_status=$? 
+set -e
+# A non-zero or timed-out create is tolerated here; the checks below decide
+# whether the cluster is actually usable.
+case "${create_status}" in
+  0) ;;
+  124)
+    echo "::warning::nvkind cluster create timed out after ${CLUSTER_CREATE_TIMEOUT}; continuing only if post-create checks pass"
+    ;;
+  *)
+    echo "::warning::nvkind cluster create returned status ${create_status}; continuing only if post-create checks pass"
+    ;;
+esac
+
+kubectl_kind_wait wait --for=condition=Ready nodes --all --timeout=300s
+kubectl_kind cluster-info
+kubectl_kind get nodes -o wide
+# Best-effort summary of node capacity; a grep miss must not fail the job.
+kubectl_kind describe nodes | \
+  grep -E "^(Name:|Capacity:|Allocatable:|Allocated resources:| cpu| memory| nvidia.com/gpu)" || true
+
+echo "=== Kind node container resources ==="
+docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+  --format '{{.Names}}' | sort | while read -r node_container; do
+    [[ -z "${node_container}" ]] && continue
+    docker_timeout 30s inspect "${node_container}" \
+      --format '{{.Name}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}'
+  done
+
+echo "=== Control-plane resource requests/limits ==="
+kubectl_kind -n kube-system \
+  get pods -l tier=control-plane -o json | jq -r '
+    .items[] as $pod |
+    $pod.metadata.name,
+    ($pod.spec.containers[] |
+      " " + .name +
+      " requests=" + ((.resources.requests // {}) | tostring) +
+      " limits=" + ((.resources.limits // {}) | tostring))
+  ' || true
+
+# Rewrites whole-core milli values (N000m) as N so that, for example, 1000m
+# and 1 compare equal. Any other value is echoed unchanged.
+normalize_cpu_request() {
+  local cpu="$1"
+
+  if [[ "${cpu}" =~ ^([0-9]+)000m$ ]]; then
+    echo "${BASH_REMATCH[1]}"
+    return
+  fi
+  echo "${cpu}"
+}
+
+# Converts a memory quantity to a plain byte count so that equivalent
+# spellings (for example 1024Mi and 1Gi) compare equal. Values that do not
+# match a supported integer+suffix form, or that could overflow 64-bit
+# arithmetic, are echoed unchanged so the caller falls back to a string compare.
+normalize_memory_request() {
+  local memory="$1"
+  local digits
+  local suffix
+  local multiplier
+  local max_digits=6
+
+  if ! [[ "${memory}" =~ ^([0-9]+)([A-Za-z]*)$ ]]; then
+    echo "${memory}"
+    return
+  fi
+  digits="${BASH_REMATCH[1]}"
+  suffix="${BASH_REMATCH[2]}"
+  case "${suffix}" in
+    "")
+      multiplier=1
+      max_digits=18
+      ;;
+    k) multiplier=1000 ;;
+    M) multiplier=1000000 ;;
+    G) multiplier=1000000000 ;;
+    T) multiplier=1000000000000 ;;
+    Ki) multiplier=1024 ;;
+    Mi) multiplier=1048576 ;;
+    Gi) multiplier=1073741824 ;;
+    Ti) multiplier=1099511627776 ;;
+    *)
+      echo "${memory}"
+      return
+      ;;
+  esac
+  # 999999 * 1024^4 still fits in a signed 64-bit integer; longer inputs do not.
+  if (( ${#digits} > max_digits )); then
+    echo "${memory}"
+    return
+  fi
+  # 10# keeps zero-padded input from being parsed as octal.
+  echo "$(( 10#${digits} * multiplier ))"
+}
+
+# Prints the requested amount of <resource> (cpu or memory) for the first
+# container of the first kube-system pod labelled component=<component>.
+control_plane_request() {
+  local component="$1"
+  local resource="$2"
+
+  kubectl_kind -n kube-system \
+    get pod -l "component=${component}" \
+    -o "jsonpath={.items[0].spec.containers[0].resources.requests.${resource}}"
+}
+
+# Exits 1 unless the live request for <component>/<resource> equals <expected>.
+# CPU values are compared after normalize_cpu_request; memory values are
+# compared by byte count so equivalent spellings are accepted.
+assert_control_plane_request() {
+  local component="$1"
+  local resource="$2"
+  local expected="$3"
+  local actual
+  local expected_cmp
+  local actual_cmp
+
+  actual="$(control_plane_request "${component}" "${resource}")"
+  if [[ "${resource}" == "cpu" ]]; then
+    expected="$(normalize_cpu_request "${expected}")"
+    actual="$(normalize_cpu_request "${actual}")"
+  fi
+  expected_cmp="${expected}"
+  actual_cmp="${actual}"
+  if [[ "${resource}" == "memory" ]]; then
+    expected_cmp="$(normalize_memory_request "${expected}")"
+    actual_cmp="$(normalize_memory_request "${actual}")"
+  fi
+  if [[ "${actual_cmp}" != "${expected_cmp}" ]]; then
+    echo "::error::${component} ${resource} request is '${actual}', expected '${expected}'"
+    exit 1
+  fi
+  echo "${component} ${resource} request verified: ${actual}"
+}
+
+# Prints, one per line, the command and args of the first container of the
+# first kube-system pod labelled component=<component>.
+control_plane_command_args() {
+  local component="$1"
+
+  kubectl_kind -n kube-system \
+    get pod -l "component=${component}" \
+    -o json | jq -r '.items[0].spec.containers[0] | ((.command // []) + (.args // []))[]?'
+}
+
+# Succeeds when the static pod manifest on the control-plane node contains
+# "- <expected>" as a list entry.
+static_pod_manifest_contains_arg() {
+  local component="$1"
+  local expected="$2"
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+
+  docker_timeout 30s exec "${node}" grep -Fq -- "- ${expected}" "/etc/kubernetes/manifests/${component}.yaml"
+}
+
+# Succeeds when any running CRI container matching <component> has <expected>
+# among its process args, checked via jq with a plain-text grep as fallback.
+running_static_pod_container_contains_arg() {
+  local component="$1"
+  local expected="$2"
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+  local container_ids
+  local container_id
+  local inspect_output
+
+  if ! container_ids="$(docker_timeout 30s exec "${node}" crictl ps --name "${component}" -q 2>/dev/null)"; then
+    return 1
+  fi
+  [[ -z "${container_ids}" ]] && return 1
+
+  for container_id in ${container_ids}; do
+    inspect_output="$(docker_timeout 30s exec "${node}" crictl inspect "${container_id}" 2>/dev/null || true)"
+    if jq -e --arg expected "${expected}" '
+      ([.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?] | index($expected)) != null
+    ' >/dev/null 2>&1 <<< "${inspect_output}" || grep -Fq -- "${expected}" <<< "${inspect_output}"; then
+      return 0
+    fi
+  done
+  return 1
+}
+
+# Diagnostic: prints the process args of every running CRI container matching
+# <component>. Never fails.
+dump_running_static_pod_container_args() {
+  local component="$1"
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+  local container_ids
+  local container_id
+
+  echo "Running ${component} CRI container args:"
+  container_ids="$(docker_timeout 30s exec "${node}" crictl ps --name "${component}" -q 2>/dev/null || true)"
+  if [[ -z "${container_ids}" ]]; then
+    echo "(no running ${component} CRI containers found)"
+    return
+  fi
+  for container_id in ${container_ids}; do
+    echo "--- ${container_id} ---"
+    docker_timeout 30s exec "${node}" crictl inspect "${container_id}" 2>/dev/null | jq -r '
+      [.info.runtimeSpec.process.args[]?, .status.info.runtimeSpec.process.args[]?][]?
+    ' || true
+  done
+}
+
+# Diagnostic: prints the first 220 lines of the component's static pod manifest.
+dump_static_pod_manifest() {
+  local component="$1"
+  local node="${KIND_CLUSTER_NAME}-control-plane"
+
+  echo "Static pod manifest /etc/kubernetes/manifests/${component}.yaml:"
+  docker_timeout 30s exec "${node}" sed -n '1,220p' "/etc/kubernetes/manifests/${component}.yaml" || true
+}
+
+# Exits 1 unless <expected> is present in the component's live pod spec or in
+# its running CRI container. While only the static manifest carries the flag,
+# retries up to 12 times, 5 seconds apart, to let the running container catch up.
+assert_control_plane_arg() {
+  local component="$1"
+  local expected="$2"
+  local attempt
+  local command_args
+
+  for attempt in $(seq 1 12); do
+    command_args="$(control_plane_command_args "${component}" || true)"
+    if grep -Fxq -- "${expected}" <<< "${command_args}"; then
+      echo "${component} command/args verified: ${expected}"
+      return
+    fi
+    if running_static_pod_container_contains_arg "${component}" "${expected}"; then
+      echo "${component} running CRI container args verified: ${expected} (live mirror pod omitted it)"
+      return
+    fi
+    if static_pod_manifest_contains_arg "${component}" "${expected}"; then
+      echo "::warning::${component} static pod manifest has ${expected}, but the running container does not yet; waiting for kubelet to converge (${attempt}/12)"
+      sleep 5
+      continue
+    fi
+
+    # Not even the manifest has the flag: retrying cannot help.
+    break
+  done
+
+  echo "::error::${component} running command/args does not contain ${expected}"
+  echo "Observed live command/args:"
+  echo "${command_args:-}"
+  dump_running_static_pod_container_args "${component}"
+  dump_static_pod_manifest "${component}"
+  exit 1
+}
+
+if [[ "${CONTROL_PLANE_RESOURCE_PATCHES}" == "true" ]]; then
+  echo "Verifying control-plane resource patches..."
+  assert_control_plane_request kube-apiserver cpu "${API_SERVER_CPU_REQUEST}"
+  assert_control_plane_request kube-apiserver memory "${API_SERVER_MEMORY_REQUEST}"
+  assert_control_plane_request kube-controller-manager cpu "${CONTROLLER_MANAGER_CPU_REQUEST}"
+  assert_control_plane_request kube-controller-manager memory "${CONTROLLER_MANAGER_MEMORY_REQUEST}"
+  assert_control_plane_request kube-scheduler cpu "${SCHEDULER_CPU_REQUEST}"
+  assert_control_plane_request kube-scheduler memory "${SCHEDULER_MEMORY_REQUEST}"
+  assert_control_plane_request etcd cpu "${ETCD_CPU_REQUEST}"
+  assert_control_plane_request etcd memory "${ETCD_MEMORY_REQUEST}"
+fi
+
+if [[ "${CONTROL_PLANE_LEADER_ELECTION_TUNING}" == "true" ]]; then
+  echo "Verifying control-plane leader election timeout patches..."
+  for component in kube-controller-manager kube-scheduler; do
+    assert_control_plane_arg "${component}" "--leader-elect-lease-duration=${LEADER_ELECTION_LEASE_DURATION}"
+    assert_control_plane_arg "${component}" "--leader-elect-renew-deadline=${LEADER_ELECTION_RENEW_DEADLINE}"
+    assert_control_plane_arg "${component}" "--leader-elect-retry-period=${LEADER_ELECTION_RETRY_PERIOD}"
+  done
+fi
diff --git a/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
new file mode 100644
index 000000000..75d113151
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/delete-stale-kind-cluster.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Removes a leftover kind cluster named KIND_CLUSTER_NAME: first through
+# `kind delete cluster`, then by force-removing any containers that still
+# carry the cluster label. Exits 1 if labelled containers survive both passes.
+KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}"
+
+# Runs docker with a 30-second ceiling so a hung daemon cannot stall the job.
+docker_timeout() {
+  timeout 30s docker "$@"
+}
+
+# Fills the global array remaining_containers with the IDs of all containers
+# (running or stopped) labelled for this cluster. Exits 1 if the query fails.
+read_kind_container_ids() {
+  local output
+
+  # stderr is left attached to the job log rather than captured: anything
+  # docker prints there must not end up in the ID list handed to `docker rm`.
+  if ! output="$(docker_timeout ps -aq --filter "label=${kind_cluster_label}")"; then
+    echo "::error::failed to query stale kind containers for ${KIND_CLUSTER_NAME}"
+    echo "${output}"
+    exit 1
+  fi
+
+  remaining_containers=()
+  if [[ -n "${output}" ]]; then
+    mapfile -t remaining_containers <<< "${output}"
+  fi
+}
+
+# List clusters into a variable first. Piping `kind get clusters` straight into
+# `grep -q` under pipefail reports "not found" whenever kind itself fails or is
+# killed by SIGPIPE after grep exits early on a match.
+existing_clusters=""
+if ! existing_clusters="$(timeout 30s kind get clusters)"; then
+  echo "::warning::kind get clusters timed out or failed; falling back to direct container cleanup"
+  existing_clusters=""
+fi
+
+if grep -Fxq -- "${KIND_CLUSTER_NAME}" <<< "${existing_clusters}"; then
+  echo "Deleting stale kind cluster: ${KIND_CLUSTER_NAME}"
+  if ! timeout 180s kind delete cluster --name "${KIND_CLUSTER_NAME}"; then
+    echo "::warning::kind delete cluster timed out or failed; falling back to direct container cleanup"
+  fi
+else
+  echo "No stale kind cluster named ${KIND_CLUSTER_NAME}"
+fi
+
+# Second pass: force-remove anything kind left behind.
+read_kind_container_ids
+if (( ${#remaining_containers[@]} > 0 )); then
+  echo "Removing stale containers for ${KIND_CLUSTER_NAME}:"
+  docker_timeout ps -a --filter "label=${kind_cluster_label}"
+  docker_timeout rm -f "${remaining_containers[@]}"
+fi
+
+# Final check: refuse to continue onto a runner that still holds old nodes.
+read_kind_container_ids
+if (( ${#remaining_containers[@]} > 0 )); then
+  echo "::error::stale containers still remain for ${KIND_CLUSTER_NAME}:"
+  docker_timeout ps -a --filter "label=${kind_cluster_label}"
+  exit 1
+fi
diff --git a/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh b/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh
new file mode 100644
index 000000000..843496a38
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/increase-inotify-limits.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Raises a sysctl to at least <minimum>. A value that is already high enough
+# is left alone so this step never lowers a limit an administrator raised.
+raise_sysctl() {
+  local key="$1"
+  local minimum="$2"
+  local current
+
+  # An unreadable key is treated as 0, which forces the write below.
+  current="$(sysctl -n "${key}" 2>/dev/null || echo 0)"
+  if [[ "${current}" =~ ^[0-9]+$ ]] && (( current >= minimum )); then
+    echo "${key} = ${current} (already >= ${minimum}, unchanged)"
+    return
+  fi
+  sudo sysctl -w "${key}=${minimum}"
+}
+
+raise_sysctl fs.inotify.max_user_watches 524288
+raise_sysctl fs.inotify.max_user_instances 1024
diff --git a/.github/actions/gpu-cluster-setup/install-nvkind.sh b/.github/actions/gpu-cluster-setup/install-nvkind.sh
new file mode 100644
index 000000000..c2200e078
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/install-nvkind.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Installs nvkind at the version given by NVKIND_VERSION and smoke-tests the
+# installed binary by running `--help` on it.
+if [[ -z "${NVKIND_VERSION:-}" ]]; then
+  echo "::error::NVKIND_VERSION must be set"
+  exit 1
+fi
+
+go install "github.com/NVIDIA/nvkind/cmd/nvkind@${NVKIND_VERSION}"
+
+# Ask the go tool where it installed the binary instead of reading $GOBIN
+# directly: `go env GOBIN` also reflects a value persisted with `go env -w`.
+# When GOBIN is empty the binary lands in the bin directory of the first
+# GOPATH entry, so strip any further colon-separated entries.
+install_dir="$(go env GOBIN)"
+if [[ -z "${install_dir}" ]]; then
+  gopath="$(go env GOPATH)"
+  install_dir="${gopath%%:*}/bin"
+fi
+nvkind_bin="${install_dir}/nvkind"
+"${nvkind_bin}" --help
diff --git a/.github/actions/gpu-cluster-setup/runner-preflight.sh b/.github/actions/gpu-cluster-setup/runner-preflight.sh
new file mode 100644
index 000000000..70d38ecf5
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/runner-preflight.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Runner preflight: prints a baseline of the host, checks that Docker answers,
+# and fails when fewer than MIN_GPU_COUNT GPUs (optionally restricted to those
+# matching GPU_MODEL_PATTERN) are visible.
+#
+# Environment:
+#   KIND_CLUSTER_NAME        required; used to list leftover kind containers
+#   MIN_GPU_COUNT            required integer
+#   MIN_FREE_DISK_GB         required integer (validated here only)
+#   MIN_AVAILABLE_MEMORY_GB  required integer (validated here only)
+#   GPU_MODEL_PATTERN        optional extended regex, matched case-insensitively
+KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+
+echo "=== Runner baseline ==="
+date -u
+hostname
+uptime
+nproc
+free -h
+df -h /
+df -ih /
+
+# `:-` turns an unset variable into the explicit error below instead of an
+# opaque "unbound variable" abort under `set -u`.
+for value_name in MIN_GPU_COUNT MIN_FREE_DISK_GB MIN_AVAILABLE_MEMORY_GB; do
+  value="${!value_name:-}"
+  if ! [[ "${value}" =~ ^[0-9]+$ ]]; then
+    echo "::error::${value_name} must be an integer, got '${value}'"
+    exit 1
+  fi
+done
+
+echo "=== Docker health ==="
+docker info >/dev/null
+docker version
+
+echo "=== Host GPUs ==="
+nvidia-smi -L
+nvidia-smi
+
+# Capture the query output first so a failing nvidia-smi aborts the script; a
+# process substitution would hide its exit status.
+gpu_query_output="$(nvidia-smi --query-gpu=name --format=csv,noheader)"
+gpu_names=()
+if [[ -n "${gpu_query_output}" ]]; then
+  mapfile -t gpu_names <<< "${gpu_query_output}"
+fi
+
+gpu_model_pattern="${GPU_MODEL_PATTERN:-}"
+if (( ${#gpu_names[@]} == 0 )); then
+  # With no GPUs there is nothing to match; feeding printf an empty list would
+  # emit one blank line that a permissive pattern could count as a GPU.
+  gpu_count=0
+  echo "Visible GPUs: ${gpu_count}"
+elif [[ -n "${gpu_model_pattern}" ]]; then
+  # grep -c exits 1 when the count is 0 and 2 on a bad pattern; record the
+  # status instead of letting errexit abort.
+  grep_status=0
+  gpu_count=$(printf '%s\n' "${gpu_names[@]}" | grep -Eic -- "${gpu_model_pattern}") || grep_status=$?
+  if (( grep_status == 2 )); then
+    echo "::error::invalid gpu_model_pattern regex: ${gpu_model_pattern}"
+    exit 1
+  fi
+  if (( grep_status != 0 )); then
+    gpu_count=0
+  fi
+  echo "Visible GPUs matching '${gpu_model_pattern}': ${gpu_count}"
+else
+  gpu_count="${#gpu_names[@]}"
+  echo "Visible GPUs: ${gpu_count}"
+fi
+
+# 10# keeps a zero-padded minimum such as 08 from being parsed as octal.
+if (( gpu_count < 10#${MIN_GPU_COUNT} )); then
+  echo "::error::visible GPU count ${gpu_count} is below required minimum ${MIN_GPU_COUNT}"
+  exit 1
+fi
+
+echo "=== Existing kind state ==="
+kind get clusters || true
+docker ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true
diff --git a/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh b/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh
new file mode 100644
index 000000000..6f01ba156
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/validate-docker-gpu-access.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+# Runs nvidia-smi in a plain ubuntu container, selecting all GPUs through the
+# /var/run/nvidia-container-devices volume-mount path; bounded to 300 seconds.
+timeout 300s docker run --rm -v /dev/null:/var/run/nvidia-container-devices/all ubuntu:22.04 nvidia-smi -L
diff --git a/.github/actions/gpu-cluster-setup/validate-env.sh b/.github/actions/gpu-cluster-setup/validate-env.sh
new file mode 100644
index 000000000..697d077c2
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/validate-env.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.
All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# The calling workflow owns the cluster name; stop early when it is missing.
+if [[ -n "${KIND_CLUSTER_NAME:-}" ]]; then
+  exit 0
+fi
+echo "::error::KIND_CLUSTER_NAME environment variable must be set by the calling workflow"
+exit 1
diff --git a/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh
new file mode 100644
index 000000000..4a0fcf5e3
--- /dev/null
+++ b/.github/actions/gpu-cluster-setup/warm-kind-node-image.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+# Pulls KIND_NODE_IMAGE into the local Docker cache unless it is already
+# there, then re-checks that / still has at least MIN_FREE_DISK_GB GiB free.
+: "${KIND_NODE_IMAGE:?KIND_NODE_IMAGE must be set}"
+: "${MIN_FREE_DISK_GB:?MIN_FREE_DISK_GB must be set}"
+[[ "${MIN_FREE_DISK_GB}" =~ ^[0-9]+$ ]] || {
+  echo "::error::MIN_FREE_DISK_GB must be an integer, got '${MIN_FREE_DISK_GB}'"
+  exit 1
+}
+
+echo "=== Kind node image cache ==="
+if ! docker image inspect "${KIND_NODE_IMAGE}" >/dev/null 2>&1; then
+  echo "Pulling kind node image: ${KIND_NODE_IMAGE}"
+  timeout 600s docker pull "${KIND_NODE_IMAGE}"
+else
+  echo "Kind node image already cached: ${KIND_NODE_IMAGE}"
+fi
+
+# Keep only the digits of the last df line (tail drops the header).
+avail_bytes=$(df -B1 --output=avail / | tail -1 | tr -dc '0-9')
+gib=$((1024 * 1024 * 1024))
+required_bytes=$((MIN_FREE_DISK_GB * gib))
+avail_gib=$((avail_bytes / gib))
+if (( avail_bytes >= required_bytes )); then
+  echo "Runner disk remains sufficient after kind image warm-up: ${avail_gib}GiB (${avail_bytes} bytes)"
+  exit 0
+fi
+echo "::error::free disk on / is ${avail_bytes} bytes (${avail_gib}GiB) after warming ${KIND_NODE_IMAGE}, need at least ${required_bytes} bytes (${MIN_FREE_DISK_GB}GiB)"
+exit 1
diff --git a/.github/actions/gpu-debug-diagnostics/action.yml b/.github/actions/gpu-debug-diagnostics/action.yml
new file mode 100644
index 000000000..42ee4e091
--- /dev/null
+++ b/.github/actions/gpu-debug-diagnostics/action.yml
@@ -0,0 +1,35 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: 'GPU Debug Diagnostics'
+description: 'Print bounded GPU CI diagnostics while the kind cluster is still present.'
+ +inputs: + cluster_name: + description: 'Kind cluster name' + required: true + mode: + description: 'Diagnostic mode: smoke, training, or inference' + required: false + default: 'smoke' + +runs: + using: 'composite' + steps: + - name: Print GPU debug diagnostics + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + GPU_TEST_DIAGNOSTIC_MODE: ${{ inputs.mode }} + run: bash "${{ github.action_path }}/../../scripts/gpu-debug-diagnostics.sh" diff --git a/.github/actions/gpu-operator-install/action.yml b/.github/actions/gpu-operator-install/action.yml deleted file mode 100644 index e2bdb300c..000000000 --- a/.github/actions/gpu-operator-install/action.yml +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name: 'GPU Operator Install' -description: 'Installs the GPU operator via standalone Helm chart or aicr bundle.' - -inputs: - method: - description: 'Installation method: helm (standalone chart) or bundle (aicr recipe+bundle)' - required: true - accelerator: - description: 'Accelerator type for recipe generation (bundle mode only, e.g. h100)' - required: false - default: '' - intent: - description: 'Intent for recipe generation (bundle mode only, e.g. inference, training)' - required: false - default: 'inference' - platform: - description: 'Platform for recipe generation (bundle mode only, e.g. 
dynamo)' - required: false - default: '' - -runs: - using: 'composite' - steps: - - # --- Helm mode: standalone GPU operator chart --- - - - name: Install GPU Operator (helm) - if: inputs.method == 'helm' - shell: bash - run: | - helm repo add nvidia https://helm.ngc.nvidia.com/nvidia - helm repo update - helm upgrade -i \ - --kube-context="kind-${KIND_CLUSTER_NAME}" \ - --namespace gpu-operator \ - --create-namespace \ - --set driver.enabled=false \ - --set toolkit.enabled=false \ - --set dcgmExporter.enabled=false \ - --set nfd.enabled=true \ - --wait --timeout=600s \ - gpu-operator nvidia/gpu-operator - - - name: Wait for GPU operands (helm) - if: inputs.method == 'helm' - shell: bash - run: | - echo "Waiting for device plugin to be ready..." - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s || true - echo "GPU Operator pods:" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods - - # --- Bundle mode: aicr recipe → bundle → deploy --- - - - name: Generate recipe - if: inputs.method == 'bundle' - shell: bash - run: | - PLATFORM_FLAG="" - if [[ -n "${{ inputs.platform }}" ]]; then - PLATFORM_FLAG="--platform ${{ inputs.platform }}" - fi - ./aicr recipe \ - --service kind \ - --accelerator ${{ inputs.accelerator }} \ - --os ubuntu \ - --intent ${{ inputs.intent }} \ - ${PLATFORM_FLAG} \ - --output recipe.yaml - echo "--- Recipe ---" - cat recipe.yaml - - - name: Generate deployment bundle - if: inputs.method == 'bundle' - shell: bash - run: | - ./aicr bundle \ - --recipe recipe.yaml \ - --accelerated-node-toleration nvidia.com/gpu:NoSchedule \ - --output bundle - echo "--- Bundle contents ---" - ls -la bundle/ - - - name: Install bundle into cluster - if: inputs.method == 'bundle' - shell: bash - run: | - cd bundle - # Use --no-wait: several components (gpu-operator ClusterPolicy, - # kai-scheduler SchedulingShard, nvidia-dra-driver-gpu kubelet 
plugin) - # stay InProgress in kind because their CRs/DaemonSets require - # features not available in kind (DRA feature gates, driver modules). - # The explicit "Wait for GPU operands" step below gates on what - # actually matters (device plugin readiness). - # --best-effort: some components (e.g. network-operator) have Helm - # hooks that may time out in Kind; continue deploying remaining - # components so the overall stack is functional. - chmod +x deploy.sh - echo "--- deploy.sh ---" - cat deploy.sh - ./deploy.sh --no-wait --best-effort - - - name: Wait for GPU operands (bundle) - if: inputs.method == 'bundle' - shell: bash - run: | - echo "Waiting for GPU operator controller to deploy operands..." - # The GPU operator controller watches ClusterPolicy and creates - # DaemonSets for device-plugin, NFD, GFD, etc. This happens - # asynchronously after the helm install completes. - for i in $(seq 1 30); do - count=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - get daemonset -l app=nvidia-device-plugin-daemonset --no-headers 2>/dev/null | wc -l) - if [[ "$count" -gt 0 ]]; then - echo "Device plugin DaemonSet found." - break - fi - echo "Waiting for device plugin DaemonSet to be created... (${i}/30)" - sleep 10 - done - echo "Waiting for device plugin rollout..." - # Operands are excluded from control-plane nodes via nodeAffinity in - # the kind overlay, so all scheduled pods should become ready. - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - rollout status daemonset -l app=nvidia-device-plugin-daemonset --timeout=300s - echo "GPU Operator pods:" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods diff --git a/.github/actions/gpu-smoke-nvidia-smi/action.yml b/.github/actions/gpu-smoke-nvidia-smi/action.yml new file mode 100644 index 000000000..cb61b5d0d --- /dev/null +++ b/.github/actions/gpu-smoke-nvidia-smi/action.yml @@ -0,0 +1,36 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: 'GPU Smoke nvidia-smi' +description: 'Run nvidia-smi in a GPU-backed kind pod and print its logs.' + +inputs: + cluster_name: + description: 'Kind cluster name' + required: true + +runs: + using: 'composite' + steps: + - name: Run nvidia-smi in a pod + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: bash "${{ github.workspace }}/.github/scripts/gpu-smoke-run-nvidia-smi.sh" + - name: Show nvidia-smi output + if: always() + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: bash "${{ github.workspace }}/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh" diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml index e1ee3c14b..9f215a3e3 100644 --- a/.github/actions/gpu-snapshot-validate/action.yml +++ b/.github/actions/gpu-snapshot-validate/action.yml @@ -26,60 +26,36 @@ inputs: cluster_name: description: 'Kind cluster name (for kubectl context)' required: true + snapshot_timeout: + description: 'Timeout for aicr snapshot' + required: false + default: '5m' runs: using: composite steps: + - name: Build snapshot agent image + uses: ./.github/actions/aicr-build + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + with: + build_cli: 'false' + build_snapshot_agent: 'true' + validator_phases: 'none' - name: Run aicr snapshot shell: bash - run: | - ./aicr snapshot \ - --kubeconfig="${HOME}/.kube/config" \ - 
--namespace=default \ - --image=ko.local:smoke-test \ - --require-gpu \ - --output=snapshot.yaml - echo "--- Snapshot output ---" - cat snapshot.yaml - + env: + SNAPSHOT_TIMEOUT: ${{ inputs.snapshot_timeout }} + run: bash "${{ github.action_path }}/run-snapshot.sh" - name: Validate snapshot detected GPU shell: bash - run: | - # Query by subtype field (not index) — #502 added a "hardware" subtype before "smi". - GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu.model"]' snapshot.yaml) - GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml) - echo "GPU model: ${GPU_MODEL}" - echo "GPU count: ${GPU_COUNT}" - if [[ "${GPU_MODEL}" != *"${{ inputs.gpu_model }}"* ]]; then - echo "::error::Expected ${{ inputs.gpu_model }} GPU in snapshot, got: ${GPU_MODEL}" - exit 1 - fi - if [[ "${GPU_COUNT}" -lt ${{ inputs.min_gpu_count }} ]]; then - echo "::error::Expected gpu-count >= ${{ inputs.min_gpu_count }}, got: ${GPU_COUNT}" - exit 1 - fi - echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}" - + env: + EXPECTED_GPU_MODEL: ${{ inputs.gpu_model }} + MIN_GPU_COUNT: ${{ inputs.min_gpu_count }} + run: bash "${{ github.action_path }}/validate-snapshot-gpu.sh" - name: Debug snapshot Job if: failure() shell: bash - run: | - echo "=== Snapshot Job ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default get job aicr -o yaml || true - echo "=== Snapshot Pods ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - get pods -l app.kubernetes.io/name=aicr -o wide || true - echo "=== Snapshot Job describe ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default describe job aicr || true - echo "=== Snapshot Pod describe ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - describe pods -l app.kubernetes.io/name=aicr || true - echo "=== Snapshot current logs ===" - 
kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true - echo "=== Snapshot previous logs ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true - echo "=== Snapshot ConfigMap ===" - kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ - get configmap aicr-snapshot -o yaml || true + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: bash "${{ github.action_path }}/debug-snapshot-job.sh" diff --git a/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh b/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh new file mode 100644 index 000000000..2e0f1547f --- /dev/null +++ b/.github/actions/gpu-snapshot-validate/debug-snapshot-job.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} + +echo "=== Snapshot Job ===" +kubectl_kind -n default get job aicr -o yaml || true +echo "=== Snapshot Pods ===" +kubectl_kind -n default get pods -l app.kubernetes.io/name=aicr -o wide || true +echo "=== Snapshot Job describe ===" +kubectl_kind -n default describe job aicr || true +echo "=== Snapshot Pod describe ===" +kubectl_kind -n default describe pods -l app.kubernetes.io/name=aicr || true +echo "=== Snapshot current logs ===" +kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true +echo "=== Snapshot previous logs ===" +kubectl_kind -n default logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true +echo "=== Snapshot ConfigMap ===" +kubectl_kind -n default get configmap aicr-snapshot -o yaml || true diff --git a/.github/actions/gpu-snapshot-validate/run-snapshot.sh b/.github/actions/gpu-snapshot-validate/run-snapshot.sh new file mode 100644 index 000000000..e45b575ef --- /dev/null +++ b/.github/actions/gpu-snapshot-validate/run-snapshot.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +./aicr snapshot \ + --kubeconfig="${HOME}/.kube/config" \ + --namespace=default \ + --image=ko.local:smoke-test \ + --require-gpu \ + --timeout="${SNAPSHOT_TIMEOUT}" \ + --output=snapshot.yaml +echo "--- Snapshot output ---" +cat snapshot.yaml diff --git a/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh new file mode 100644 index 000000000..5a27e6093 --- /dev/null +++ b/.github/actions/gpu-snapshot-validate/validate-snapshot-gpu.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Query by subtype field (not index) — #502 added a "hardware" subtype before "smi". +GPU_MODEL=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu.model"]' snapshot.yaml) +GPU_COUNT=$(yq eval '.measurements[] | select(.type == "GPU") | .subtypes[] | select(.subtype == "smi") | .data["gpu-count"]' snapshot.yaml) +echo "GPU model: ${GPU_MODEL}" +echo "GPU count: ${GPU_COUNT}" +if ! 
[[ "${GPU_COUNT}" =~ ^[0-9]+$ ]]; then + echo "::error::Expected numeric gpu-count in snapshot, got: ${GPU_COUNT}" + exit 1 +fi +if [[ "${GPU_MODEL}" != *"${EXPECTED_GPU_MODEL}"* ]]; then + echo "::error::Expected ${EXPECTED_GPU_MODEL} GPU in snapshot, got: ${GPU_MODEL}" + exit 1 +fi +if [[ "${GPU_COUNT}" -lt ${MIN_GPU_COUNT} ]]; then + echo "::error::Expected gpu-count >= ${MIN_GPU_COUNT}, got: ${GPU_COUNT}" + exit 1 +fi +echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}" diff --git a/.github/actions/gpu-test-cleanup/action.yml b/.github/actions/gpu-test-cleanup/action.yml index 30ac7831f..e58588b1a 100644 --- a/.github/actions/gpu-test-cleanup/action.yml +++ b/.github/actions/gpu-test-cleanup/action.yml @@ -23,48 +23,74 @@ inputs: description: 'Prefix for the uploaded artifact name' required: false default: 'gpu-test-debug' + collect_artifacts: + description: 'Collect and upload debug artifacts before deleting the kind cluster' + required: false + default: 'false' + collect_node_runtime_artifacts: + description: 'Collect expensive kind node runtime artifacts such as journalctl, crictl, and kind export logs' + required: false + default: 'false' + diagnostic_mode: + description: 'Optional gpu-debug-diagnostics mode to run when collect_artifacts is true' + required: false + default: '' + upload_validation_artifacts: + description: 'Upload validation result and evidence artifacts before cleanup' + required: false + default: 'false' + validation_artifact_name: + description: 'Name for uploaded validation artifacts' + required: false + default: 'conformance-evidence' + validation_artifact_paths: + description: 'Newline-separated validation artifact paths' + required: false + default: | + conformance-evidence/ + validation-result.yaml runs: using: 'composite' steps: + - name: Debug diagnostics + if: inputs.collect_artifacts == 'true' && inputs.diagnostic_mode != '' + uses: ./.github/actions/gpu-debug-diagnostics + with: + cluster_name: ${{ inputs.cluster_name 
}} + mode: ${{ inputs.diagnostic_mode }} + - name: Upload validation artifacts + if: always() && inputs.upload_validation_artifacts == 'true' + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: ${{ inputs.validation_artifact_name }} + path: ${{ inputs.validation_artifact_paths }} + if-no-files-found: warn - name: Collect debug artifacts - if: failure() + if: always() && inputs.collect_artifacts == 'true' shell: bash env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} - run: | - mkdir -p /tmp/debug-artifacts - kubectl --context="kind-${KIND_CLUSTER_NAME}" get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true - + COLLECT_NODE_RUNTIME_ARTIFACTS: ${{ inputs.collect_node_runtime_artifacts }} + run: bash "${{ github.action_path }}/collect-debug-artifacts.sh" - name: Export kind logs - if: failure() + if: always() && inputs.collect_artifacts == 'true' && inputs.collect_node_runtime_artifacts == 'true' shell: bash env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} - run: | - mkdir -p /tmp/kind-logs - kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}" || true - + run: bash "${{ github.action_path 
}}/export-kind-logs.sh" + - name: Cleanup + if: always() + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: bash "${{ github.action_path }}/cleanup-kind-cluster.sh" - name: Upload debug artifacts - if: failure() - uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + if: always() && inputs.collect_artifacts == 'true' + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: ${{ inputs.artifact_name_prefix }}-${{ github.run_id }} path: | /tmp/debug-artifacts/ /tmp/kind-logs/ retention-days: 7 - - - name: Cleanup - if: always() - shell: bash - env: - KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} - run: | - kind delete cluster --name "${KIND_CLUSTER_NAME}" || true - docker system prune -f || true diff --git a/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh b/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh new file mode 100644 index 000000000..134aa2589 --- /dev/null +++ b/.github/actions/gpu-test-cleanup/cleanup-kind-cluster.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +timeout 300s kind delete cluster --name "${KIND_CLUSTER_NAME}" || true +docker_timeout() { + local limit="$1" + shift + timeout "${limit}" docker "$@" +} +kind_cluster_label="io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" +mapfile -t remaining_containers < <(docker_timeout 30s ps -aq --filter "label=${kind_cluster_label}" || true) +if (( ${#remaining_containers[@]} > 0 )); then + echo "Removing leftover kind containers for ${KIND_CLUSTER_NAME}:" + docker_timeout 30s ps -a --filter "label=${kind_cluster_label}" || true + docker_timeout 30s rm -f "${remaining_containers[@]}" || true + mapfile -t remaining_containers < <(docker_timeout 30s ps -aq --filter "label=${kind_cluster_label}" || true) + if (( ${#remaining_containers[@]} > 0 )); then + echo "::warning::leftover kind containers still present for ${KIND_CLUSTER_NAME}: ${remaining_containers[*]}" + fi +fi diff --git a/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh new file mode 100644 index 000000000..a77744645 --- /dev/null +++ b/.github/actions/gpu-test-cleanup/collect-debug-artifacts.sh @@ -0,0 +1,179 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Diagnostic artifact collection intentionally omits -e so one broken cluster +# call does not prevent later artifacts from being collected. 
+set -uo pipefail +rm -rf /tmp/debug-artifacts /tmp/kind-logs +mkdir -p /tmp/debug-artifacts +mkdir -p /tmp/kind-logs +CONTROL_PLANE_COMPONENTS="kube-apiserver kube-controller-manager kube-scheduler etcd" +MAX_KIND_NODE_ARTIFACT_SECONDS="${MAX_KIND_NODE_ARTIFACT_SECONDS:-600}" +COLLECT_NODE_RUNTIME_ARTIFACTS="${COLLECT_NODE_RUNTIME_ARTIFACTS:-false}" +if ! [[ "${MAX_KIND_NODE_ARTIFACT_SECONDS}" =~ ^[0-9]+$ ]]; then + echo "::warning::MAX_KIND_NODE_ARTIFACT_SECONDS must be an integer; got '${MAX_KIND_NODE_ARTIFACT_SECONDS}', defaulting to 600" >&2 + MAX_KIND_NODE_ARTIFACT_SECONDS=600 +fi +command_timeout() { + local limit="$1" + shift + timeout "${limit}" "$@" +} +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@" +} +docker_timeout() { + local limit="$1" + shift + timeout "${limit}" docker "$@" +} + +{ + date -u || true + hostname || true + uptime || true + nproc || true + free -h || true + df -h / || true + df -ih / || true +} > /tmp/debug-artifacts/runner-baseline.txt 2>&1 || true +docker_timeout 30s version > /tmp/debug-artifacts/docker-version.txt 2>&1 || true +docker_timeout 30s info > /tmp/debug-artifacts/docker-info.txt 2>&1 || true +command_timeout 30s nvidia-smi -L > /tmp/debug-artifacts/host-gpus.txt 2>&1 || true +command_timeout 30s nvidia-smi >> /tmp/debug-artifacts/host-gpus.txt 2>&1 || true +command_timeout 30s kind get clusters > /tmp/debug-artifacts/kind-clusters.txt 2>&1 || true +docker_timeout 30s ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ + > /tmp/debug-artifacts/kind-node-containers.txt 2>&1 || true + +kubectl_kind get all --all-namespaces > /tmp/debug-artifacts/all-resources.txt || true +kubectl_kind get events --all-namespaces --sort-by='.lastTimestamp' > /tmp/debug-artifacts/events.txt || true +kubectl_kind get --raw='/livez?verbose' > /tmp/debug-artifacts/apiserver-livez.txt 2>&1 || true +kubectl_kind get --raw='/readyz?verbose' > 
/tmp/debug-artifacts/apiserver-readyz.txt 2>&1 || true +kubectl_kind -n kube-system get pods -l tier=control-plane -o wide \ + > /tmp/debug-artifacts/control-plane-pods.txt 2>&1 || true +kubectl_kind -n kube-system get events --sort-by='.lastTimestamp' \ + > /tmp/debug-artifacts/kube-system-events.txt 2>&1 || true +for component in ${CONTROL_PLANE_COMPONENTS}; do + kubectl_kind -n kube-system describe pod -l "component=${component}" \ + > "/tmp/debug-artifacts/${component}-describe.txt" 2>&1 || true + kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --tail=300 \ + > "/tmp/debug-artifacts/${component}-logs.txt" 2>&1 || true + kubectl_kind -n kube-system logs -l "component=${component}" --all-containers --previous --tail=300 \ + > "/tmp/debug-artifacts/${component}-previous-logs.txt" 2>&1 || true + kubectl_kind -n kube-system get lease "${component}" -o yaml \ + > "/tmp/debug-artifacts/${component}-lease.yaml" 2>&1 || true +done +kubectl_kind -n gpu-operator get pods -o wide > /tmp/debug-artifacts/gpu-operator-pods.txt || true +kubectl_kind -n gpu-operator logs -l app=nvidia-device-plugin-daemonset --tail=100 > /tmp/debug-artifacts/device-plugin-logs.txt || true +kubectl_kind -n gpu-operator logs -l app.kubernetes.io/component=gpu-operator --tail=100 > /tmp/debug-artifacts/gpu-operator-logs.txt || true +kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide \ + > /tmp/debug-artifacts/monitoring-workloads.txt 2>&1 || true +kubectl_kind -n monitoring describe deployment kube-prometheus-operator \ + > /tmp/debug-artifacts/kube-prometheus-operator-deployment-describe.txt 2>&1 || true +kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=300 \ + > /tmp/debug-artifacts/kube-prometheus-operator-logs.txt 2>&1 || true +kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=300 \ + > 
/tmp/debug-artifacts/kube-prometheus-operator-previous-logs.txt 2>&1 || true +kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' \ + > /tmp/debug-artifacts/monitoring-events.txt 2>&1 || true +{ + kubectl_kind -n monitoring get pods -o name 2>/dev/null \ + | grep '^pod/kube-prometheus-operator-' \ + | while read -r pod; do + echo "=== ${pod} ===" + kubectl_kind -n monitoring describe "${pod}" 2>&1 || true + done +} > /tmp/debug-artifacts/kube-prometheus-operator-pods-describe.txt 2>&1 || true +kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded > /tmp/debug-artifacts/non-running-pods.txt || true +tar_inputs=() +[[ -f recipe.yaml ]] && tar_inputs+=(recipe.yaml) +[[ -d bundle ]] && tar_inputs+=(bundle) +if [[ "${#tar_inputs[@]}" -gt 0 ]]; then + echo "Archiving runtime bundle inputs: ${tar_inputs[*]}" + tar -czf /tmp/debug-artifacts/aicr-runtime-bundle.tar.gz "${tar_inputs[@]}" || true +else + echo "No recipe.yaml or bundle directory found; skipping runtime bundle archive" +fi + +case "${COLLECT_NODE_RUNTIME_ARTIFACTS}" in + true) + artifact_loop_start="$(date +%s)" + docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \ + --format '{{.Names}}' | sort | while read -r node_container; do + [[ -z "${node_container}" ]] && continue + artifact_loop_elapsed=$(($(date +%s) - artifact_loop_start)) + if (( artifact_loop_elapsed > MAX_KIND_NODE_ARTIFACT_SECONDS )); then + echo "Kind node artifact collection exceeded ${MAX_KIND_NODE_ARTIFACT_SECONDS}s; stopping after partial collection." 
+ break + fi + node_file="${node_container//[^A-Za-z0-9_.-]/_}" + docker_timeout 30s inspect "${node_container}" \ + > "/tmp/debug-artifacts/${node_file}-docker-inspect.json" 2>&1 || true + docker_timeout 30s exec "${node_container}" journalctl -u kubelet \ + --since "90 minutes ago" --no-pager \ + > "/tmp/debug-artifacts/${node_file}-kubelet-journal.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" journalctl -u containerd \ + --since "90 minutes ago" --no-pager \ + > "/tmp/debug-artifacts/${node_file}-containerd-journal.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" crictl ps -a \ + > "/tmp/debug-artifacts/${node_file}-crictl-ps-a.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" crictl pods \ + > "/tmp/debug-artifacts/${node_file}-crictl-pods.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" crictl stats \ + > "/tmp/debug-artifacts/${node_file}-crictl-stats.txt" 2>&1 || true + docker_timeout 30s exec "${node_container}" sh -c ' + date + uptime || true + free -h || true + df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h + echo "--- top cpu/memory processes ---" + ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -40 || true + ' > "/tmp/debug-artifacts/${node_file}-node-pressure.txt" 2>&1 || true + # shellcheck disable=SC2016 # Expanded inside the kind node shell. + docker_timeout 120s exec "${node_container}" sh -c ' + for component in kube-apiserver kube-controller-manager kube-scheduler etcd; do + echo "=== ${component} static pod manifest ===" + sed -n "1,220p" "/etc/kubernetes/manifests/${component}.yaml" 2>/dev/null || true + echo "=== ${component} CRI containers ===" + crictl ps -a --name "${component}" || true + count=0 + for container_id in $(crictl ps -a --name "${component}" -q 2>/dev/null); do + count=$((count + 1)) + if [ "${count}" -gt 8 ]; then + echo "Skipping remaining ${component} CRI containers after first 8 entries." 
+ break + fi + echo "=== crictl inspect ${component} ${container_id} ===" + crictl inspect "${container_id}" || true + echo "=== crictl logs ${component} ${container_id} ===" + crictl logs --tail=300 "${container_id}" || true + done + done + ' > "/tmp/debug-artifacts/${node_file}-control-plane-cri.txt" 2>&1 || true + done || true + ;; + ""|false) + echo "Skipped kind node runtime artifacts. Set collect_node_runtime_artifacts=true to collect journalctl, crictl, and kind export logs." \ + > /tmp/debug-artifacts/node-runtime-artifacts-skipped.txt + echo "Skipped kind log export. Set collect_node_runtime_artifacts=true to export kind logs." \ + > /tmp/kind-logs/kind-logs-skipped.txt + ;; + *) + echo "Unknown COLLECT_NODE_RUNTIME_ARTIFACTS=${COLLECT_NODE_RUNTIME_ARTIFACTS}; skipping kind node runtime artifacts." \ + > /tmp/debug-artifacts/node-runtime-artifacts-skipped.txt + echo "Unknown COLLECT_NODE_RUNTIME_ARTIFACTS=${COLLECT_NODE_RUNTIME_ARTIFACTS}; skipping kind log export." \ + > /tmp/kind-logs/kind-logs-skipped.txt + ;; +esac diff --git a/.github/actions/gpu-test-cleanup/export-kind-logs.sh b/.github/actions/gpu-test-cleanup/export-kind-logs.sh new file mode 100644 index 000000000..a46624f60 --- /dev/null +++ b/.github/actions/gpu-test-cleanup/export-kind-logs.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +mkdir -p /tmp/kind-logs +if ! 
timeout 300s kind export logs /tmp/kind-logs --name "${KIND_CLUSTER_NAME}"; then + echo "::warning::kind log export failed or timed out for ${KIND_CLUSTER_NAME}; continuing cleanup" >&2 +fi diff --git a/.github/actions/gpu-validate-conformance/action.yml b/.github/actions/gpu-validate-conformance/action.yml new file mode 100644 index 000000000..bde5238a9 --- /dev/null +++ b/.github/actions/gpu-validate-conformance/action.yml @@ -0,0 +1,57 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: 'GPU Validate Conformance' +description: 'Run CNCF AI Conformance validation for a GPU Kind test cluster.' 
+ +inputs: + cluster_name: + description: 'Kind cluster name' + required: true + kwok_helm_timeout: + description: 'Timeout for KWOK controller Helm install' + required: false + default: '300s' + ko_build_timeout: + description: 'Timeout for Karpenter KWOK provider ko build' + required: false + default: '900s' + karpenter_helm_timeout: + description: 'Timeout for Karpenter Helm install' + required: false + default: '300s' + +runs: + using: 'composite' + steps: + - name: Install Karpenter + KWOK + uses: ./.github/actions/install-karpenter-kwok + with: + cluster_name: ${{ inputs.cluster_name }} + kwok_helm_timeout: ${{ inputs.kwok_helm_timeout }} + ko_build_timeout: ${{ inputs.ko_build_timeout }} + karpenter_helm_timeout: ${{ inputs.karpenter_helm_timeout }} + - name: Build conformance validator image + uses: ./.github/actions/aicr-build + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + with: + build_cli: 'false' + build_snapshot_agent: 'false' + validator_phases: 'conformance' + - name: Validate CNCF AI Conformance + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + run: bash "${{ github.workspace }}/.github/scripts/gpu-validate-conformance.sh" diff --git a/.github/actions/gpu-workflow-prepare/action.yml b/.github/actions/gpu-workflow-prepare/action.yml new file mode 100644 index 000000000..1faf4496d --- /dev/null +++ b/.github/actions/gpu-workflow-prepare/action.yml @@ -0,0 +1,47 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +name: 'GPU Workflow Prepare' +description: 'Print early runner diagnostics and load GPU workflow tool/image versions.' + +outputs: + chainsaw: + description: 'Chainsaw version' + value: ${{ steps.versions.outputs.chainsaw }} + chainsaw_sha256_linux_amd64: + description: 'Chainsaw SHA256 checksum for linux/amd64' + value: ${{ steps.versions.outputs.chainsaw_sha256_linux_amd64 }} + h100_kind_node_image: + description: 'Kind node image for H100 GPU tests' + value: ${{ steps.versions.outputs.h100_kind_node_image }} + +runs: + using: 'composite' + steps: + - name: Runner preflight snapshot + shell: bash + run: | + echo "::group::Runner preflight snapshot" + echo "hostname: $(hostname)" + echo "kernel: $(uname -a)" + echo "uptime: $(uptime)" + echo "loadavg: $(cat /proc/loadavg 2>/dev/null || echo unavailable)" + echo "nproc: $(nproc 2>/dev/null || echo unavailable)" + free -h 2>/dev/null || true + df -h / 2>/dev/null || true + echo "::endgroup::" + + - name: Load GPU test versions + id: versions + uses: ./.github/actions/load-versions diff --git a/.github/actions/install-karpenter-kwok/action.yml b/.github/actions/install-karpenter-kwok/action.yml index fde7bddde..d3570a43b 100644 --- a/.github/actions/install-karpenter-kwok/action.yml +++ b/.github/actions/install-karpenter-kwok/action.yml @@ -19,6 +19,18 @@ inputs: cluster_name: description: 'Kind cluster name (used for kubectl context)' required: true + kwok_helm_timeout: + description: 'Timeout for KWOK controller Helm install' + required: false + default: '300s' + ko_build_timeout: + description: 'Timeout for Karpenter KWOK provider ko build' + required: false + default: '900s' + karpenter_helm_timeout: + description: 'Timeout for Karpenter Helm install' + required: false + default: '300s' runs: using: 'composite' @@ -26,9 +38,12 @@ runs: - name: Resolve versions id: versions shell: bash - run: | - echo 
"karpenter=$(yq eval '.testing_tools.karpenter' .settings.yaml)" >> "$GITHUB_OUTPUT" - echo "go=$(go env GOVERSION)" >> "$GITHUB_OUTPUT" + run: bash "${{ github.action_path }}/resolve-versions.sh" + - name: Install ko + uses: ./.github/actions/setup-build-tools + with: + install_ko: 'true' + ko_version: ${{ steps.versions.outputs.ko }} - name: Cache Karpenter Go build cache uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 @@ -46,7 +61,7 @@ runs: env: KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} KARPENTER_VERSION: ${{ steps.versions.outputs.karpenter }} - run: | - set -euo pipefail - bash kwok/scripts/install-karpenter-kwok.sh - kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f kwok/manifests/karpenter/nodepool.yaml + KWOK_HELM_TIMEOUT: ${{ inputs.kwok_helm_timeout }} + KO_BUILD_TIMEOUT: ${{ inputs.ko_build_timeout }} + KARPENTER_HELM_TIMEOUT: ${{ inputs.karpenter_helm_timeout }} + run: bash "${{ github.action_path }}/install-karpenter-kwok.sh" diff --git a/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh new file mode 100644 index 000000000..0ec6480d1 --- /dev/null +++ b/.github/actions/install-karpenter-kwok/install-karpenter-kwok.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail +if [[ -z "${KIND_CLUSTER_NAME:-}" ]]; then + echo "::error::KIND_CLUSTER_NAME is required" + exit 1 +fi +KUBE_CONTEXT="${KUBE_CONTEXT:-kind-${KIND_CLUSTER_NAME}}" + +validate_duration_input() { + local input_name="$1" + local input_value="$2" + + if ! [[ "${input_value}" =~ ^[0-9]+[smh]$ ]]; then + echo "::error::${input_name} must be a duration like 300s, 10m, or 1h; got '${input_value}'" + exit 1 + fi +} + +validate_duration_input kwok_helm_timeout "${KWOK_HELM_TIMEOUT}" +validate_duration_input ko_build_timeout "${KO_BUILD_TIMEOUT}" +validate_duration_input karpenter_helm_timeout "${KARPENTER_HELM_TIMEOUT}" +bash kwok/scripts/install-karpenter-kwok.sh +timeout 30s kubectl --request-timeout=10s \ + --context="${KUBE_CONTEXT}" \ + apply -f kwok/manifests/karpenter/nodepool.yaml diff --git a/.github/actions/install-karpenter-kwok/resolve-versions.sh b/.github/actions/install-karpenter-kwok/resolve-versions.sh new file mode 100644 index 000000000..6aeb173a7 --- /dev/null +++ b/.github/actions/install-karpenter-kwok/resolve-versions.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +{ + echo "karpenter=$(yq eval '.testing_tools.karpenter' .settings.yaml)" + echo "ko=$(yq eval '.build_tools.ko' .settings.yaml)" + echo "go=go$(yq eval '.languages.go' .settings.yaml)" +} >> "$GITHUB_OUTPUT" diff --git a/.github/actions/load-versions/action.yml b/.github/actions/load-versions/action.yml index b87e321d1..b3c506d40 100644 --- a/.github/actions/load-versions/action.yml +++ b/.github/actions/load-versions/action.yml @@ -40,6 +40,9 @@ outputs: kind: description: 'Kind version' value: ${{ steps.versions.outputs.kind }} + nvkind: + description: 'nvkind git ref' + value: ${{ steps.versions.outputs.nvkind }} ctlptl: description: 'ctlptl version' value: ${{ steps.versions.outputs.ctlptl }} @@ -91,6 +94,9 @@ outputs: kind_node_image: description: 'Kind node image for testing' value: ${{ steps.versions.outputs.kind_node_image }} + h100_kind_node_image: + description: 'Kind node image for H100 GPU tests' + value: ${{ steps.versions.outputs.h100_kind_node_image }} runs: using: 'composite' @@ -121,6 +127,7 @@ runs: # Testing tools echo "kubectl=$(yq eval '.testing_tools.kubectl' .settings.yaml)" >> $GITHUB_OUTPUT echo "kind=$(yq eval '.testing_tools.kind' .settings.yaml)" >> $GITHUB_OUTPUT + echo "nvkind=$(yq eval '.testing_tools.nvkind' .settings.yaml)" >> $GITHUB_OUTPUT echo "ctlptl=$(yq eval '.testing_tools.ctlptl' .settings.yaml)" >> $GITHUB_OUTPUT echo "tilt=$(yq eval '.testing_tools.tilt' .settings.yaml)" >> $GITHUB_OUTPUT echo "helm=$(yq eval '.testing_tools.helm' .settings.yaml)" >> $GITHUB_OUTPUT @@ -141,6 +148,7 @@ runs: # Testing configuration echo "kind_node_image=$(yq eval '.testing.kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT + echo "h100_kind_node_image=$(yq eval '.testing.h100_kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT - name: Display loaded versions shell: bash @@ -158,6 +166,7 @@ runs: echo " grype: ${{ steps.versions.outputs.grype }}" echo " kubectl: ${{ steps.versions.outputs.kubectl }}" echo " kind: 
${{ steps.versions.outputs.kind }}" + echo " nvkind: ${{ steps.versions.outputs.nvkind }}" echo " ctlptl: ${{ steps.versions.outputs.ctlptl }}" echo " tilt: ${{ steps.versions.outputs.tilt }}" echo " helm: ${{ steps.versions.outputs.helm }}" @@ -172,3 +181,4 @@ runs: echo " lint_timeout: ${{ steps.versions.outputs.lint_timeout }}" echo " test_timeout: ${{ steps.versions.outputs.test_timeout }}" echo " kind_node_image: ${{ steps.versions.outputs.kind_node_image }}" + echo " h100_kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}" diff --git a/.github/actions/runtime-install/action.yml b/.github/actions/runtime-install/action.yml new file mode 100644 index 000000000..1adfea364 --- /dev/null +++ b/.github/actions/runtime-install/action.yml @@ -0,0 +1,104 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: 'Runtime Install' +description: 'Installs the standalone GPU operator for smoke tests or the full AICR runtime bundle.' + +inputs: + method: + description: 'Installation method: helm (standalone chart) or bundle (aicr recipe+bundle)' + required: true + accelerator: + description: 'Accelerator type for recipe generation (bundle mode only, e.g. h100)' + required: false + default: '' + intent: + description: 'Intent for recipe generation (bundle mode only, e.g. 
inference, training)' + required: false + default: 'inference' + platform: + description: 'Platform for recipe generation (bundle mode only, e.g. dynamo)' + required: false + default: '' + wait: + description: 'Wait for bundle Helm resources during deploy' + required: false + default: 'false' + best_effort: + description: 'Continue deploying remaining bundle components after a component failure' + required: false + default: 'true' + +runs: + using: 'composite' + steps: + - name: Validate installation method + shell: bash + env: + RUNTIME_INSTALL_METHOD: ${{ inputs.method }} + run: | + case "${RUNTIME_INSTALL_METHOD}" in + helm|bundle) ;; + *) + echo "::error::unsupported runtime install method: ${RUNTIME_INSTALL_METHOD}" + exit 1 + ;; + esac + + # --- Helm mode: standalone GPU operator chart --- + + - name: Install GPU Operator (helm) + if: inputs.method == 'helm' + shell: bash + run: bash "${{ github.action_path }}/install-gpu-operator-helm.sh" + - name: Wait for GPU operands (helm) + if: inputs.method == 'helm' + shell: bash + run: bash "${{ github.action_path }}/wait-gpu-operands-helm.sh" + # --- Bundle mode: aicr recipe → bundle → deploy --- + + - name: Validate bundle inputs + if: inputs.method == 'bundle' + shell: bash + env: + AICR_ACCELERATOR: ${{ inputs.accelerator }} + run: | + if [[ -z "${AICR_ACCELERATOR}" ]]; then + echo "::error::inputs.accelerator is required when inputs.method is 'bundle'" + exit 1 + fi + + - name: Generate recipe + if: inputs.method == 'bundle' + shell: bash + env: + AICR_ACCELERATOR: ${{ inputs.accelerator }} + AICR_INTENT: ${{ inputs.intent }} + AICR_PLATFORM: ${{ inputs.platform }} + run: bash "${{ github.action_path }}/generate-recipe.sh" + - name: Generate deployment bundle + if: inputs.method == 'bundle' + shell: bash + run: bash "${{ github.action_path }}/generate-bundle.sh" + - name: Install bundle into cluster + if: inputs.method == 'bundle' + shell: bash + env: + AICR_DEPLOY_WAIT: ${{ inputs.wait }} + 
AICR_DEPLOY_BEST_EFFORT: ${{ inputs.best_effort }} + run: bash "${{ github.action_path }}/install-bundle.sh" + - name: Wait for GPU operands (bundle) + if: inputs.method == 'bundle' + shell: bash + run: bash "${{ github.action_path }}/wait-gpu-operands-bundle.sh" diff --git a/.github/actions/runtime-install/generate-bundle.sh b/.github/actions/runtime-install/generate-bundle.sh new file mode 100644 index 000000000..8e3f8436d --- /dev/null +++ b/.github/actions/runtime-install/generate-bundle.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +rm -rf bundle + +BUNDLE_ARGS=( + --recipe recipe.yaml + --accelerated-node-toleration nvidia.com/gpu:NoSchedule + --output bundle +) + +./aicr bundle "${BUNDLE_ARGS[@]}" +echo "--- Bundle contents ---" +ls -la bundle/ diff --git a/.github/actions/runtime-install/generate-recipe.sh b/.github/actions/runtime-install/generate-recipe.sh new file mode 100644 index 000000000..b3555ef78 --- /dev/null +++ b/.github/actions/runtime-install/generate-recipe.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +RECIPE_ARGS=( + --service kind + --accelerator "${AICR_ACCELERATOR}" + --os ubuntu + --intent "${AICR_INTENT}" +) +if [[ -n "${AICR_PLATFORM:-}" ]]; then + RECIPE_ARGS+=(--platform "${AICR_PLATFORM}") +fi + +./aicr recipe "${RECIPE_ARGS[@]}" --output recipe.yaml +echo "Recipe written to recipe.yaml" diff --git a/.github/actions/runtime-install/install-bundle.sh b/.github/actions/runtime-install/install-bundle.sh new file mode 100644 index 000000000..1068cddaa --- /dev/null +++ b/.github/actions/runtime-install/install-bundle.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +cd bundle +# The default keeps legacy bundle-mode behavior: do not wait on every +# Helm resource and keep deploying after component failures. H100 +# qualification jobs override these inputs to hard-fail and wait. 
+chmod +x deploy.sh +AICR_DEPLOY_WAIT="${AICR_DEPLOY_WAIT:-false}" +AICR_DEPLOY_BEST_EFFORT="${AICR_DEPLOY_BEST_EFFORT:-true}" +for deploy_flag_name in AICR_DEPLOY_WAIT AICR_DEPLOY_BEST_EFFORT; do + case "${!deploy_flag_name}" in + true|false) ;; + *) + echo "::error::${deploy_flag_name} must be true or false, got '${!deploy_flag_name}'" + exit 1 + ;; + esac +done + +DEPLOY_ARGS=() +if [[ "${AICR_DEPLOY_WAIT}" != "true" ]]; then + DEPLOY_ARGS+=(--no-wait) +fi +if [[ "${AICR_DEPLOY_BEST_EFFORT}" == "true" ]]; then + DEPLOY_ARGS+=(--best-effort) +fi +if [[ "${#DEPLOY_ARGS[@]}" -gt 0 ]]; then + echo "Deploying bundle with args: ${DEPLOY_ARGS[*]}" +else + echo "Deploying bundle with default args" +fi +./deploy.sh "${DEPLOY_ARGS[@]}" diff --git a/.github/actions/runtime-install/install-gpu-operator-helm.sh b/.github/actions/runtime-install/install-gpu-operator-helm.sh new file mode 100644 index 000000000..f20527ed3 --- /dev/null +++ b/.github/actions/runtime-install/install-gpu-operator-helm.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +if ! 
command -v yq >/dev/null 2>&1; then + echo "::error::yq is required to read testing.gpu_operator_chart_version from .settings.yaml" + exit 1 +fi + +GPU_OPERATOR_CHART_VERSION="$(yq eval '.testing.gpu_operator_chart_version // ""' .settings.yaml)" +if [[ -z "${GPU_OPERATOR_CHART_VERSION}" || "${GPU_OPERATOR_CHART_VERSION}" == "null" ]]; then + echo "::error::testing.gpu_operator_chart_version must be set in .settings.yaml" + exit 1 +fi + +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --force-update +helm repo update +helm upgrade -i \ + --kube-context="kind-${KIND_CLUSTER_NAME}" \ + --namespace gpu-operator \ + --create-namespace \ + --set driver.enabled=false \ + --set toolkit.enabled=false \ + --set dcgmExporter.enabled=false \ + --set nfd.enabled=true \ + --version="${GPU_OPERATOR_CHART_VERSION}" \ + --wait --timeout=600s \ + gpu-operator nvidia/gpu-operator diff --git a/.github/actions/runtime-install/wait-gpu-operands-bundle.sh b/.github/actions/runtime-install/wait-gpu-operands-bundle.sh new file mode 100644 index 000000000..9133ba435 --- /dev/null +++ b/.github/actions/runtime-install/wait-gpu-operands-bundle.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}" +KUBE_CONTEXT="${KUBE_CONTEXT:-kind-${KIND_CLUSTER_NAME}}" +DEVICE_PLUGIN_WAIT_TIMEOUT="${DEVICE_PLUGIN_WAIT_TIMEOUT:-300s}" +KUBECTL_WAIT_OUTER_TIMEOUT="${KUBECTL_WAIT_OUTER_TIMEOUT:-330s}" +KUBECTL_WAIT_REQUEST_TIMEOUT="${KUBECTL_WAIT_REQUEST_TIMEOUT:-${KUBECTL_WAIT_OUTER_TIMEOUT}}" + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="${KUBE_CONTEXT}" "$@" +} + +kubectl_kind_wait() { + timeout "${KUBECTL_WAIT_OUTER_TIMEOUT}" kubectl \ + --request-timeout="${KUBECTL_WAIT_REQUEST_TIMEOUT}" \ + --context="${KUBE_CONTEXT}" "$@" +} + +echo "Waiting for GPU operator controller to deploy operands..." +# The GPU operator controller watches ClusterPolicy and creates +# DaemonSets for device-plugin, NFD, GFD, etc. This happens +# asynchronously after the bundle deploy applies the ClusterPolicy. +if ! kubectl_kind_wait -n gpu-operator wait --for=create \ + daemonset/nvidia-device-plugin-daemonset \ + --timeout="${DEVICE_PLUGIN_WAIT_TIMEOUT}"; then + echo "::error::device plugin DaemonSet was not created within ${DEVICE_PLUGIN_WAIT_TIMEOUT}" + kubectl_kind -n gpu-operator get pods || true + kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' || true + exit 1 +fi +echo "Device plugin DaemonSet found." +echo "Waiting for device plugin rollout..." +# Operands are excluded from control-plane nodes via nodeAffinity in +# the kind overlay, so all scheduled pods should become ready. +if ! 
kubectl_kind_wait -n gpu-operator rollout status daemonset/nvidia-device-plugin-daemonset \ + --timeout="${DEVICE_PLUGIN_WAIT_TIMEOUT}"; then + echo "::error::device plugin DaemonSet did not roll out within ${DEVICE_PLUGIN_WAIT_TIMEOUT}" + kubectl_kind -n gpu-operator get pods -o wide || true + kubectl_kind -n gpu-operator describe daemonset/nvidia-device-plugin-daemonset || true + kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' || true + exit 1 +fi +echo "GPU Operator pods:" +kubectl_kind -n gpu-operator get pods diff --git a/.github/actions/runtime-install/wait-gpu-operands-helm.sh b/.github/actions/runtime-install/wait-gpu-operands-helm.sh new file mode 100644 index 000000000..ccd47670d --- /dev/null +++ b/.github/actions/runtime-install/wait-gpu-operands-helm.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -euo pipefail + +KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}" +KUBE_CONTEXT="${KUBE_CONTEXT:-kind-${KIND_CLUSTER_NAME}}" +DEVICE_PLUGIN_WAIT_TIMEOUT="${DEVICE_PLUGIN_WAIT_TIMEOUT:-300s}" +KUBECTL_WAIT_OUTER_TIMEOUT="${KUBECTL_WAIT_OUTER_TIMEOUT:-330s}" +KUBECTL_WAIT_REQUEST_TIMEOUT="${KUBECTL_WAIT_REQUEST_TIMEOUT:-${KUBECTL_WAIT_OUTER_TIMEOUT}}" + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="${KUBE_CONTEXT}" "$@" +} + +kubectl_kind_wait() { + timeout "${KUBECTL_WAIT_OUTER_TIMEOUT}" kubectl \ + --request-timeout="${KUBECTL_WAIT_REQUEST_TIMEOUT}" \ + --context="${KUBE_CONTEXT}" "$@" +} + +echo "Waiting for device plugin to be ready..." +if ! kubectl_kind_wait -n gpu-operator wait --for=create \ + daemonset/nvidia-device-plugin-daemonset \ + --timeout="${DEVICE_PLUGIN_WAIT_TIMEOUT}"; then + echo "::error::device plugin DaemonSet was not created within ${DEVICE_PLUGIN_WAIT_TIMEOUT}" + kubectl_kind -n gpu-operator get pods || true + exit 1 +fi +echo "Device plugin DaemonSet found." + +if ! kubectl_kind_wait -n gpu-operator rollout status daemonset/nvidia-device-plugin-daemonset \ + --timeout="${DEVICE_PLUGIN_WAIT_TIMEOUT}"; then + echo "::error::device plugin DaemonSet did not roll out within ${DEVICE_PLUGIN_WAIT_TIMEOUT}" + kubectl_kind -n gpu-operator get pods -o wide || true + kubectl_kind -n gpu-operator describe daemonset/nvidia-device-plugin-daemonset || true + kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' || true + exit 1 +fi +echo "GPU Operator pods:" +kubectl_kind -n gpu-operator get pods diff --git a/.github/scripts/gpu-chainsaw-health.sh b/.github/scripts/gpu-chainsaw-health.sh new file mode 100644 index 000000000..5b9b4c9c7 --- /dev/null +++ b/.github/scripts/gpu-chainsaw-health.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +if [[ $# -ne 1 ]]; then + echo "::error::Usage: $0 " + exit 2 +fi +test_dir="$1" +if [[ ! -d "${test_dir}" ]]; then + echo "::error::Test directory not found: ${test_dir}" + exit 1 +fi + +CHAINSAW_TEST_TIMEOUT="${CHAINSAW_TEST_TIMEOUT:-30m}" +if ! [[ "${CHAINSAW_TEST_TIMEOUT}" =~ ^[0-9]+[smh]$ ]]; then + echo "::error::CHAINSAW_TEST_TIMEOUT must be a duration like 30m, 180s, or 1h; got '${CHAINSAW_TEST_TIMEOUT}'" + exit 1 +fi +MONITORING_READY_TIMEOUT="${MONITORING_READY_TIMEOUT:-180s}" +KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}" +KUBE_CONTEXT="${KUBE_CONTEXT:-kind-${KIND_CLUSTER_NAME}}" +KUBECTL_WAIT_GRACE_SECONDS="${KUBECTL_WAIT_GRACE_SECONDS:-30}" + +if ! [[ "${MONITORING_READY_TIMEOUT}" =~ ^[0-9]+[smh]$ ]]; then + echo "::error::MONITORING_READY_TIMEOUT must be a duration like 180s, 5m, or 1h; got '${MONITORING_READY_TIMEOUT}'" + exit 1 +fi + +duration_seconds() { + local input_value="$1" + local number="${input_value%[smh]}" + local unit="${input_value: -1}" + + case "${unit}" in + s) echo "$((10#${number}))" ;; + m) echo "$((10#${number} * 60))" ;; + h) echo "$((10#${number} * 3600))" ;; + *) + echo "::error::unsupported duration '${input_value}'" >&2 + exit 1 + ;; + esac +} + +if ! 
[[ "${KUBECTL_WAIT_GRACE_SECONDS}" =~ ^[0-9]+$ ]]; then + echo "::error::KUBECTL_WAIT_GRACE_SECONDS must be a non-negative integer, got '${KUBECTL_WAIT_GRACE_SECONDS}'" + exit 1 +fi +monitoring_ready_timeout_seconds="$(duration_seconds "${MONITORING_READY_TIMEOUT}")" +KUBECTL_WAIT_OUTER_TIMEOUT="${KUBECTL_WAIT_OUTER_TIMEOUT:-$((monitoring_ready_timeout_seconds + KUBECTL_WAIT_GRACE_SECONDS))s}" +KUBECTL_WAIT_REQUEST_TIMEOUT="${KUBECTL_WAIT_REQUEST_TIMEOUT:-${KUBECTL_WAIT_OUTER_TIMEOUT}}" + +kubectl_kind() { + timeout 30s kubectl --request-timeout=10s --context="${KUBE_CONTEXT}" "$@" +} + +kubectl_kind_wait() { + timeout "${KUBECTL_WAIT_OUTER_TIMEOUT}" kubectl \ + --request-timeout="${KUBECTL_WAIT_REQUEST_TIMEOUT}" \ + --context="${KUBE_CONTEXT}" "$@" +} + +print_monitoring_diagnostics() { + echo "=== Monitoring workloads ===" + kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide 2>/dev/null || true + echo "=== kube-prometheus-operator deployment ===" + kubectl_kind -n monitoring get deployment kube-prometheus-operator -o wide 2>/dev/null || true + echo "=== kube-prometheus-operator deployment describe ===" + kubectl_kind -n monitoring describe deployment kube-prometheus-operator 2>/dev/null || true + echo "=== kube-prometheus-operator pods ===" + kubectl_kind -n monitoring get pods -o wide 2>/dev/null \ + | grep -E '(^NAME|^kube-prometheus-operator-)' || true + echo "=== kube-prometheus-operator logs ===" + kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=200 2>/dev/null || true + echo "=== kube-prometheus-operator previous logs ===" + kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=200 2>/dev/null || true + echo "=== Recent events (monitoring) ===" + kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' 2>/dev/null | tail -100 || true +} + +wait_for_monitoring_operator() { + echo "Waiting for monitoring/kube-prometheus-operator 
before Chainsaw..." + if kubectl_kind_wait -n monitoring rollout status deployment/kube-prometheus-operator \ + --timeout="${MONITORING_READY_TIMEOUT}"; then + echo "monitoring/kube-prometheus-operator is rolled out." + return 0 + fi + + echo "::error::monitoring/kube-prometheus-operator did not become available within ${MONITORING_READY_TIMEOUT}" + print_monitoring_diagnostics + return 1 +} + +wait_for_monitoring_operator + +# --skip-delete: these tests assert the already-deployed runtime bundle. Letting +# Chainsaw delete asserted resources would tear down the system under test. +timeout "${CHAINSAW_TEST_TIMEOUT}" chainsaw test \ + --test-dir "${test_dir}" \ + --config tests/chainsaw/chainsaw-config.yaml \ + --skip-delete diff --git a/.github/scripts/gpu-debug-diagnostics.sh b/.github/scripts/gpu-debug-diagnostics.sh new file mode 100644 index 000000000..c1d7c7b3c --- /dev/null +++ b/.github/scripts/gpu-debug-diagnostics.sh @@ -0,0 +1,291 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Diagnostic script: intentionally omits -e so each mode can keep collecting +# partial failure data. Keep -u and pipefail to catch script bugs and pipeline +# failures while individual kubectl_kind calls tolerate cluster errors. 
+set -uo pipefail
+
+mode="${GPU_TEST_DIAGNOSTIC_MODE:-smoke}"
+KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+
+# Runs kubectl against the kind-${KIND_CLUSTER_NAME} context. Each call is
+# bounded twice: a 10s API request timeout and a 30s wall-clock limit, so a
+# wedged API server cannot stall diagnostics collection.
+kubectl_kind() {
+  timeout 30s kubectl --request-timeout=10s --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+# Runs `docker <args...>` under a wall-clock limit.
+#   $1      timeout duration (e.g. 30s)
+#   $2...   docker subcommand and arguments
+docker_timeout() {
+  local limit="$1"
+  shift
+  timeout "${limit}" docker "$@"
+}
+
+# Runs an arbitrary command under a wall-clock limit.
+#   $1      timeout duration (e.g. 30s)
+#   $2...   command and arguments
+command_timeout() {
+  local limit="$1"
+  shift
+  timeout "${limit}" "$@"
+}
+
+# Prints the sorted names of the running containers labelled as nodes of the
+# kind cluster, one per line.
+list_kind_node_containers() {
+  docker_timeout 30s ps --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" \
+    --format '{{.Names}}' | sort
+}
+
+# Prints host-level state: runner load/memory/disk, Docker health, host GPUs,
+# kind clusters and the kind node containers with their resource limits.
+print_setup_diagnostics() {
+  local node_container
+
+  echo "=== Runner baseline ==="
+  date -u || true
+  hostname || true
+  uptime || true
+  cat /proc/loadavg || true
+  nproc || true
+  free -h || true
+  df -h / || true
+  df -ih / || true
+  echo "=== Docker health ==="
+  # Report an unhealthy daemon explicitly instead of leaving the section empty.
+  if docker_timeout 30s info >/dev/null 2>&1; then
+    docker_timeout 30s version || true
+  else
+    echo "docker info failed or timed out (exit $?)"
+  fi
+  echo "=== Host GPUs ==="
+  command_timeout 30s nvidia-smi -L || true
+  command_timeout 30s nvidia-smi || true
+  echo "=== Kind clusters ==="
+  command_timeout 30s kind get clusters || true
+  echo "=== Kind node containers ==="
+  docker_timeout 30s ps -a --filter "label=io.x-k8s.kind.cluster=${KIND_CLUSTER_NAME}" || true
+  echo "=== Kind node container resources ==="
+  list_kind_node_containers | while read -r node_container; do
+    [[ -z "${node_container}" ]] && continue
+    docker_timeout 30s inspect "${node_container}" \
+      --format '{{.Name}} State={{.State.Status}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}} Memory={{.HostConfig.Memory}} MemoryReservation={{.HostConfig.MemoryReservation}}' || true
+  done || true
+  print_kind_node_pressure
+}
+
+# For every running kind node container, prints a one-shot `docker stats`
+# sample followed by load, memory, disk and top-CPU processes read from inside
+# the container.
+print_kind_node_pressure() {
+  local node_container
+
+  echo "=== Kind node pressure snapshots ==="
+  list_kind_node_containers | while read -r node_container; do
+    [[ -z "${node_container}" ]] && continue
+    echo "--- ${node_container} docker stats ---"
+    docker_timeout 30s stats --no-stream \
+      --format 'table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}\t{{.PIDs}}' \
+      "${node_container}" || true
+    echo "--- ${node_container} node pressure ---"
+    docker_timeout 30s exec "${node_container}" sh -c '
+      date
+      hostname || true
+      uptime || true
+      cat /proc/loadavg || true
+      nproc || true
+      free -h || true
+      df -h / /var/lib/containerd /var/lib/kubelet 2>/dev/null || df -h
+      echo "--- top cpu/memory processes ---"
+      ps -eo pid,ppid,stat,etime,%cpu,%mem,comm,args --sort=-%cpu | head -25 || true
+    ' || true
+  done || true
+}
+
+# Prints one TSV row per Deployment/DaemonSet/StatefulSet in namespace $1:
+# kind, namespace/name, and the de-duplicated list of container and
+# initContainer images.
+print_workload_images() {
+  local ns="$1"
+  kubectl_kind -n "${ns}" get deployment,daemonset,statefulset -o json 2>/dev/null \
+    | jq -r '
+      .items[] |
+      [
+        .kind,
+        .metadata.namespace + "/" + .metadata.name,
+        (([.spec.template.spec.containers[]?.image] +
+          [.spec.template.spec.initContainers[]?.image]) | unique | join(","))
+      ] | @tsv
+    ' || true
+}
+
+# Prints the workload image inventory for every namespace given as argument.
+print_workload_inventory() {
+  local ns
+  echo "=== Workload image inventory ==="
+  for ns in "$@"; do
+    echo "--- ${ns} ---"
+    print_workload_images "${ns}"
+  done
+}
+
+# Prints cluster-wide workload status, replica counters per workload, and the
+# pods that are neither Running nor Succeeded.
+print_component_status_summary() {
+  echo "=== Component workload status ==="
+  kubectl_kind get deployments,statefulsets,daemonsets,pods -A -o wide 2>/dev/null || true
+  echo "=== Component rollout conditions ==="
+  kubectl_kind get deployments,statefulsets,daemonsets -A \
+    -o custom-columns='KIND:.kind,NAMESPACE:.metadata.namespace,NAME:.metadata.name,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas,DESIRED:.status.replicas,UPDATED:.status.updatedReplicas,AGE:.metadata.creationTimestamp' \
+    2>/dev/null || true
+  echo "=== Non-ready pods ==="
+  kubectl_kind get pods -A \
+    --field-selector=status.phase!=Running,status.phase!=Succeeded \
+    -o wide 2>/dev/null || true
+}
+
+# Prints state, pod descriptions, current/previous logs and recent events for
+# the kube-prometheus-operator deployment in the monitoring namespace.
+print_kube_prometheus_operator_diagnostics() {
+  local pod
+
+  echo "=== Monitoring workloads ==="
+  kubectl_kind -n monitoring get deployment,statefulset,daemonset,pods -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment ==="
+  kubectl_kind -n monitoring get deployment kube-prometheus-operator -o wide 2>/dev/null || true
+  echo "=== kube-prometheus-operator deployment describe ==="
+  kubectl_kind -n monitoring describe deployment kube-prometheus-operator 2>/dev/null || true
+  echo "=== kube-prometheus-operator pod describe ==="
+  kubectl_kind -n monitoring get pods -o name 2>/dev/null \
+    | grep '^pod/kube-prometheus-operator-' \
+    | while read -r pod; do
+      echo "--- ${pod} ---"
+      kubectl_kind -n monitoring describe "${pod}" 2>/dev/null || true
+    done || true
+  echo "=== kube-prometheus-operator logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --tail=200 2>/dev/null || true
+  echo "=== kube-prometheus-operator previous logs ==="
+  kubectl_kind -n monitoring logs deployment/kube-prometheus-operator --all-containers --previous --tail=200 2>/dev/null || true
+  echo "=== Recent events (monitoring) ==="
+  kubectl_kind -n monitoring get events --sort-by='.lastTimestamp' 2>/dev/null | tail -80 || true
+}
+
+# Prints KAI scheduler pods, admission deployment state and logs, scheduler
+# logs, queues, podgroups and recent events from the kai-scheduler namespace.
+print_kai_diagnostics() {
+  local pod
+
+  echo "=== KAI scheduler pods ==="
+  kubectl_kind -n kai-scheduler get pods -o wide 2>/dev/null || true
+  echo "=== KAI admission deployment ==="
+  kubectl_kind -n kai-scheduler get deployment admission -o wide 2>/dev/null || true
+  echo "=== KAI admission deployment describe ==="
+  kubectl_kind -n kai-scheduler describe deployment admission 2>/dev/null || true
+  echo "=== KAI admission pod describe ==="
+  kubectl_kind -n kai-scheduler get pods -o name 2>/dev/null \
+    | grep '^pod/admission-' \
+    | while read -r pod; do
+      # Per-pod header, matching the kube-prometheus-operator describe loop.
+      echo "--- ${pod} ---"
+      kubectl_kind -n kai-scheduler describe "${pod}" 2>/dev/null || true
+    done || true
+  echo "=== KAI admission logs ==="
+  kubectl_kind -n kai-scheduler logs deployment/admission --all-containers --tail=200 2>/dev/null || true
+  echo "=== KAI scheduler logs ==="
+  kubectl_kind -n kai-scheduler logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true
+  echo "=== KAI scheduler queues ==="
+  kubectl_kind get queues -A 2>/dev/null || true
+  echo "=== KAI scheduler podgroups ==="
+  kubectl_kind get podgroups -A 2>/dev/null || true
+  echo "=== Recent events (kai-scheduler) ==="
+  kubectl_kind -n kai-scheduler get events --sort-by='.lastTimestamp' 2>/dev/null | tail -50 || true
+}
+
+# Queries the custom metrics API for gpu_utilization, gpu_memory_used and
+# gpu_power_usage across all pods of every namespace given as argument.
+print_custom_metrics() {
+  local metric
+  local ns
+  local namespaces=("$@")
+
+  echo "=== Custom metrics API ==="
+  for metric in gpu_utilization gpu_memory_used gpu_power_usage; do
+    for ns in "${namespaces[@]}"; do
+      echo "--- ${ns}/${metric} ---"
+      kubectl_kind get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${ns}/pods/*/${metric}" 2>/dev/null \
+        | jq . || true
+    done
+  done
+}
+
+# Prints the pods behind the GPU metrics path (prometheus-adapter, DCGM
+# exporter, monitoring) together with DRA ResourceSlices and node status.
+print_metrics_pipeline_diagnostics() {
+  echo "=== prometheus-adapter pods ==="
+  kubectl_kind -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
+  echo "=== DCGM Exporter pods ==="
+  kubectl_kind -n gpu-operator get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
+  echo "=== Monitoring pods ==="
+  kubectl_kind -n monitoring get pods -o wide 2>/dev/null || true
+  echo "=== DRA ResourceSlices ==="
+  kubectl_kind get resourceslices -o wide 2>/dev/null || true
+  echo "=== Node status ==="
+  kubectl_kind get nodes -o wide 2>/dev/null || true
+}
+
+# Prints GPU Operator state shared by every mode: ClusterPolicy, operator
+# pods, non-running pods cluster-wide and recent gpu-operator events.
+print_common_gpu_diagnostics() {
+  echo "=== ClusterPolicy status ==="
+  kubectl_kind get clusterpolicy -o yaml 2>/dev/null || true
+  echo "=== GPU Operator pods ==="
+  kubectl_kind -n gpu-operator get pods -o wide 2>/dev/null || true
+  echo "=== Non-running pods (all namespaces) ==="
+  kubectl_kind get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
+  echo "=== Recent events (gpu-operator) ==="
+  kubectl_kind -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+}
+
+# Runs the diagnostics shared by the training and inference modes. Arguments
+# are the mode-specific namespaces; they are added to the workload inventory
+# and to the custom-metrics queries (gpu-operator is always queried).
+print_h100_common_diagnostics() {
+  local metric_namespaces=("$@")
+  local common_namespaces=(
+    cert-manager
+    gpu-operator
+    monitoring
+    skyhook
+    nvsentinel
+    nvidia-dra-driver
+    nvidia-network-operator
+    kai-scheduler
+  )
+
+  print_setup_diagnostics
+  print_component_status_summary
+  print_workload_inventory "${common_namespaces[@]}" "${metric_namespaces[@]}"
+  print_common_gpu_diagnostics
+  print_kube_prometheus_operator_diagnostics
+  print_kai_diagnostics
+  print_custom_metrics gpu-operator "${metric_namespaces[@]}"
+  print_metrics_pipeline_diagnostics
+  echo "=== Node resources ==="
+  kubectl_kind describe nodes 2>/dev/null | grep -A 20 "Allocated resources" || true
+}
+
+# Prints the Kubeflow Trainer deployment, pods, validating webhook
+# configuration and TrainJob CRD.
+print_kubeflow_diagnostics() {
+  echo "=== Kubeflow Trainer deployment ==="
+  kubectl_kind -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true
+  echo "=== Kubeflow pods ==="
+  kubectl_kind -n kubeflow get pods -o wide 2>/dev/null || true
+  echo "=== Kubeflow validating webhooks ==="
+  kubectl_kind get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true
+  echo "=== Kubeflow Trainer CRD ==="
+  kubectl_kind get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true
+}
+
+# Prints Dynamo pods, operator manager logs and recent dynamo-system events.
+print_dynamo_diagnostics() {
+  echo "=== Dynamo pods ==="
+  kubectl_kind -n dynamo-system get pods -o wide 2>/dev/null || true
+  echo "=== Dynamo operator logs ==="
+  kubectl_kind -n dynamo-system logs deployment/dynamo-platform-dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
+  echo "=== Recent events (dynamo-system) ==="
+  kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+}
+
+# Prints kgateway pods plus GatewayClass and Gateway objects.
+print_kgateway_diagnostics() {
+  echo "=== kgateway pods ==="
+  kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true
+  echo "=== GatewayClass status ==="
+  kubectl_kind get gatewayclass -o yaml 2>/dev/null || true
+  echo "=== Gateway status ==="
+  kubectl_kind get gateways -A -o yaml 2>/dev/null || true
+}
+
+# Dispatch on GPU_TEST_DIAGNOSTIC_MODE (default: smoke). An unknown mode is a
+# caller error and is the only path that exits non-zero on purpose.
+case "${mode}" in
+  smoke)
+    print_setup_diagnostics
+    print_common_gpu_diagnostics
+    echo "=== Node status ==="
+    kubectl_kind get nodes -o wide 2>/dev/null || true
+    ;;
+  training)
+    print_h100_common_diagnostics kubeflow
+    print_kubeflow_diagnostics
+    ;;
+  inference)
+    print_h100_common_diagnostics dynamo-system kgateway-system
+    print_dynamo_diagnostics
+    print_kgateway_diagnostics
+    ;;
+  *)
+    echo "::error::unknown GPU_TEST_DIAGNOSTIC_MODE: ${mode}"
+    exit 1
+    ;;
+esac
diff --git a/.github/scripts/gpu-smoke-run-nvidia-smi.sh b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
new file mode 100644
index 000000000..8751bb513
--- /dev/null
+++ b/.github/scripts/gpu-smoke-run-nvidia-smi.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Creates a one-shot pod that runs nvidia-smi with one nvidia.com/gpu limit in
+# the kind cluster, records the generated pod name, and waits for the pod to
+# reach the Succeeded phase.
+#
+# Environment:
+#   KIND_CLUSTER_NAME             required; the kubectl context used is
+#                                 kind-${KIND_CLUSTER_NAME}.
+#   KUBECTL_REQUEST_TIMEOUT       request timeout for short calls (default 10s).
+#   KUBECTL_OUTER_TIMEOUT         wall-clock limit for short calls (default 30s).
+#   KUBECTL_WAIT_REQUEST_TIMEOUT  request timeout for the wait call (default 130s).
+#   POD_NAME_FILE                 file that receives the generated pod name.
+
+set -euo pipefail
+
+: "${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+KUBECTL_REQUEST_TIMEOUT="${KUBECTL_REQUEST_TIMEOUT:-10s}"
+KUBECTL_OUTER_TIMEOUT="${KUBECTL_OUTER_TIMEOUT:-30s}"
+KUBECTL_WAIT_REQUEST_TIMEOUT="${KUBECTL_WAIT_REQUEST_TIMEOUT:-130s}"
+POD_NAME_FILE="${POD_NAME_FILE:-/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}"
+
+# Short kubectl call against the kind context. The outer `timeout` is a
+# wall-clock backstop in addition to the API request timeout.
+kubectl_kind() {
+  timeout "${KUBECTL_OUTER_TIMEOUT}" kubectl \
+    --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" \
+    --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+# Long-running kubectl call (wait). Limits are layered so the innermost fires
+# first: wait --timeout (120s) < request timeout (130s) < wall clock (150s).
+kubectl_kind_wait() {
+  timeout 150s kubectl --request-timeout="${KUBECTL_WAIT_REQUEST_TIMEOUT}" --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+# generateName lets repeated runs coexist; the server-assigned name is read
+# back through the jsonpath output.
+pod_name=$(kubectl_kind create -f - -o jsonpath='{.metadata.name}' <<'EOF'
+apiVersion: v1
+kind: Pod
+metadata:
+  generateName: gpu-smoke-test-
+  labels:
+    app: gpu-smoke-test
+spec:
+  restartPolicy: Never
+  containers:
+    - name: nvidia-smi
+      # Intentionally use a small base image: NVIDIA Container Toolkit should
+      # inject nvidia-smi into GPU containers. This smoke test should fail if it
+      # does not.
+      image: ubuntu:22.04
+      command: ["nvidia-smi"]
+      resources:
+        limits:
+          nvidia.com/gpu: 1
+EOF
+)
+
+# Without a name there is nothing to wait on; fail instead of waiting on "pod/".
+if [[ -z "${pod_name}" ]]; then
+  echo "::error::kubectl create returned no pod name for gpu-smoke-test"
+  exit 1
+fi
+
+mkdir -p "$(dirname "${POD_NAME_FILE}")"
+printf '%s\n' "${pod_name}" > "${POD_NAME_FILE}"
+
+echo "Waiting for ${pod_name} pod to complete..."
+if ! kubectl_kind_wait wait "pod/${pod_name}" \
+  --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s; then
+  # Surface the last observed phase so a Failed pod is distinguishable from a
+  # pod that never got scheduled.
+  pod_phase="$(kubectl_kind get pod "${pod_name}" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
+  echo "::error::pod ${pod_name} did not reach Succeeded (phase: ${pod_phase:-unknown})"
+  exit 1
+fi
diff --git a/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
new file mode 100644
index 000000000..2510e1742
--- /dev/null
+++ b/.github/scripts/gpu-smoke-show-nvidia-smi-log.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Prints the logs of the GPU smoke-test pod. The pod name is read from
+# POD_NAME_FILE when that file names a pod that still exists; otherwise the
+# newest pod labelled app=gpu-smoke-test is used. POD_NAME_FILE is removed on
+# exit in every case.
+#
+# Environment:
+#   KIND_CLUSTER_NAME        required; the kubectl context used is
+#                            kind-${KIND_CLUSTER_NAME}.
+#   KUBECTL_REQUEST_TIMEOUT  request timeout per kubectl call (default 10s).
+#   KUBECTL_OUTER_TIMEOUT    wall-clock limit per kubectl call (default 30s).
+#   POD_NAME_FILE            file holding the pod name to look up first.
+
+set -euo pipefail
+
+KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}"
+KUBECTL_REQUEST_TIMEOUT="${KUBECTL_REQUEST_TIMEOUT:-10s}"
+KUBECTL_OUTER_TIMEOUT="${KUBECTL_OUTER_TIMEOUT:-30s}"
+POD_NAME_FILE="${POD_NAME_FILE:-/tmp/aicr-gpu-smoke-pod-name-${KIND_CLUSTER_NAME}}"
+trap 'rm -f "${POD_NAME_FILE}"' EXIT
+
+# kubectl against the kind context. The outer `timeout` is a wall-clock
+# backstop in addition to the API request timeout.
+kubectl_kind() {
+  timeout "${KUBECTL_OUTER_TIMEOUT}" kubectl \
+    --request-timeout="${KUBECTL_REQUEST_TIMEOUT}" \
+    --context="kind-${KIND_CLUSTER_NAME}" "$@"
+}
+
+# Preferred source: the recorded name, but only if that pod still exists.
+pod_name=""
+if [[ -f "${POD_NAME_FILE}" ]]; then
+  pod_name="$(cat "${POD_NAME_FILE}")"
+  if [[ -n "${pod_name}" ]] && ! kubectl_kind get pod "${pod_name}" >/dev/null 2>&1; then
+    pod_name=""
+  fi
+fi
+
+# Fallback: newest pod carrying the smoke-test label. A failed lookup is
+# tolerated here so the explicit error below is reached instead of `set -e`
+# aborting without an annotation.
+if [[ -z "${pod_name}" ]]; then
+  pod_name=$(kubectl_kind get pods \
+    -l app=gpu-smoke-test \
+    --sort-by=.metadata.creationTimestamp \
+    -o jsonpath='{.items[-1:].metadata.name}' || true)
+fi
+
+if [[ -z "${pod_name}" ]]; then
+  echo "::error::no gpu-smoke-test pod found"
+  exit 1
+fi
+
+kubectl_kind logs "${pod_name}"
diff --git a/.github/scripts/gpu-validate-conformance.sh b/.github/scripts/gpu-validate-conformance.sh
new file mode 100644
index 000000000..76c354249
--- /dev/null
+++ b/.github/scripts/gpu-validate-conformance.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \ +./aicr validate \ + --recipe recipe.yaml \ + --snapshot snapshot.yaml \ + --phase conformance \ + --namespace gpu-operator \ + --kubeconfig="${HOME}/.kube/config" \ + --require-gpu \ + --image=ko.local:smoke-test \ + --timeout=10m \ + --toleration '*' \ + --output=validation-result.yaml \ + --evidence-dir=conformance-evidence diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index c5e1882d4..07e8c6c74 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -12,17 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-name: GPU Inference Test (nvkind + H100 x2) +name: GPU Inference Test (nvkind + H100 x1) on: schedule: - - cron: '15 6,18 * * *' # Every 12 hours (2x daily), offset from T4 smoke test + - cron: '15 6 * * *' # Daily, 6h offset from training test push: branches: - "pull-request/[0-9]+" - pull_request: - types: [labeled] - workflow_dispatch: {} # Allow manual runs + workflow_dispatch: + inputs: + run_full_validation: + description: 'Run snapshot and CNCF AI Conformance validation' + required: false + type: boolean + default: false permissions: contents: read @@ -40,6 +44,8 @@ jobs: should-run: ${{ steps.filter.outputs.matched }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 id: filter with: @@ -49,16 +55,24 @@ jobs: - '.github/workflows/gpu-h100-inference-test.yaml' - '.settings.yaml' - '.github/actions/gpu-cluster-setup/**' - - '.github/actions/gpu-operator-install/**' + - '.github/actions/runtime-install/**' + - '.github/actions/check-control-plane-health/**' - '.github/actions/aicr-build/**' - '.github/actions/setup-build-tools/**' - - '.github/actions/install-karpenter-kwok/**' - 'validators/*/Dockerfile' - - 'pkg/evidence/**' + - 'validators/conformance/**' + - 'recipes/validators/catalog.yaml' + - '.github/workflows/gpu-h100-kind-runtime-test.yaml' + - '.github/actions/gpu-workflow-prepare/**' + - '.github/actions/gpu-debug-diagnostics/**' + - '.github/actions/gpu-chainsaw-health/**' + - '.github/actions/gpu-validate-conformance/**' - '.github/actions/gpu-test-cleanup/**' - '.github/actions/load-versions/**' + - '.github/scripts/gpu-chainsaw-health.sh' + - '.github/scripts/gpu-debug-diagnostics.sh' + - 'pkg/bundler/deployer/helm/**' - 'tests/chainsaw/chainsaw-config.yaml' - - 'tests/chainsaw/ai-conformance/main.go' - 'tests/chainsaw/ai-conformance/common/**' - 'tests/chainsaw/ai-conformance/kind-common/**' - 
'tests/chainsaw/ai-conformance/kind-inference-dynamo/**' @@ -72,213 +86,28 @@ jobs: - 'recipes/overlays/kind-inference.yaml' - 'recipes/overlays/h100-kind-inference.yaml' - 'recipes/overlays/h100-kind-inference-dynamo.yaml' - - 'kwok/manifests/karpenter/**' - - 'kwok/scripts/install-karpenter-kwok.sh' - 'pkg/collector/**' - 'pkg/snapshotter/**' - '.github/actions/gpu-snapshot-validate/**' - - 'pkg/validator/job/**' - - 'pkg/validator/catalog/**' - - 'pkg/defaults/timeouts.go' - - 'validators/conformance/**' + # NVIDIA self-hosted GPU runners reject pull_request event jobs before + # checkout. PR GPU coverage runs through the pull-request/<N> push + # mirror after ok-to-test approval. gpu-inference-test: needs: [check-paths] if: > always() && ( github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') || (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true') ) - name: GPU Inference Test (nvkind + H100 x2) - concurrency: - group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} - cancel-in-progress: true - runs-on: linux-amd64-gpu-h100-latest-2 - timeout-minutes: 120 - - env: - KIND_CLUSTER_NAME: gpu-inference-test - - steps: - - - name: Checkout Code - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Set up GPU cluster - uses: ./.github/actions/gpu-cluster-setup - - - name: Build aicr - uses: ./.github/actions/aicr-build - with: - validator_phases: 'conformance' - - - name: Install runtime bundle - id: bundle-install - uses: ./.github/actions/gpu-operator-install - with: - method: bundle - accelerator: h100 - platform: dynamo - - # --- Snapshot and GPU validation --- - - - name: Snapshot and validate GPU - uses: ./.github/actions/gpu-snapshot-validate - with: - gpu_model: H100 - min_gpu_count: '2' - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - - # --- 
Install Karpenter + KWOK early to give monitoring stack settle time --- - - - name: Install Karpenter + KWOK - uses: ./.github/actions/install-karpenter-kwok - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - - # --- Health checks --- - - - name: Prepare chainsaw - id: versions - uses: ./.github/actions/load-versions - - - name: Install chainsaw - uses: ./.github/actions/setup-build-tools - with: - install_chainsaw: 'true' - chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' - - - name: Run chainsaw health checks - run: | - chainsaw test \ - --test-dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --config tests/chainsaw/chainsaw-config.yaml - - # --- CNCF AI Conformance validation --- - # Runs after the stack health checks so gateway and metrics validators - # see a settled inference stack. - - - name: Verify expected resources exist - run: | - go run ./tests/chainsaw/ai-conformance/ \ - --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --dir tests/chainsaw/ai-conformance/common \ - --dir tests/chainsaw/ai-conformance/kind-common \ - --kubeconfig="${HOME}/.kube/config" \ - --debug - - - name: Validate CNCF AI Conformance - id: validate-conformance - run: | - AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \ - ./aicr validate \ - --recipe recipe.yaml \ - --phase conformance \ - --namespace gpu-operator \ - --kubeconfig="${HOME}/.kube/config" \ - --require-gpu \ - --image=ko.local:smoke-test \ - --timeout=10m \ - --toleration '*' \ - --output=validation-result.yaml \ - --evidence-dir=conformance-evidence - - # Dynamo smoke is intentionally disabled for now. The vLLM runtime image - # adds significant latency and flakiness in Kind CI, and training has no - # matching smoke path yet. Reintroduce it later alongside a symmetric - # training smoke test if needed. - # --- Validation artifacts --- - - # Collect a post-run resource snapshot regardless of whether conformance - # validation ran, so triage always has a cluster-state artifact. 
- - name: Collect validation artifacts - if: >- - always() - && !cancelled() - && steps.bundle-install.outcome == 'success' - continue-on-error: true - shell: bash - run: | - set -o pipefail - mkdir -p conformance-evidence - go run ./tests/chainsaw/ai-conformance/ \ - --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --dir tests/chainsaw/ai-conformance/common \ - --dir tests/chainsaw/ai-conformance/kind-common \ - --kubeconfig="${HOME}/.kube/config" \ - --debug | tee conformance-evidence/resource-existence-post.txt - - - name: Upload validation artifacts - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: conformance-evidence - path: | - conformance-evidence/ - validation-result.yaml - if-no-files-found: warn - - - name: Debug diagnostics - if: failure() - run: | - echo "=== ClusterPolicy status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true - echo "=== GPU Operator pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true - echo "=== Non-running pods (all namespaces) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true - echo "=== Recent events (gpu-operator) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true - echo "=== Dynamo pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods -o wide 2>/dev/null || true - echo "=== Dynamo operator logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true - echo "=== Recent events (dynamo-system) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true 
- echo "=== Custom metrics API ===" - for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do - echo "--- ${METRIC} ---" - for NS in gpu-operator dynamo-system; do - kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \ - "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true - done - done - echo "=== Grafana deployment ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get deployment grafana -o wide 2>/dev/null || true - echo "=== Grafana pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods \ - -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true - echo "=== Grafana deployment describe ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe deployment grafana 2>/dev/null || true - echo "=== Grafana pod describe ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe pods \ - -l app.kubernetes.io/name=grafana 2>/dev/null || true - echo "=== prometheus-adapter pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true - echo "=== kgateway pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kgateway-system get pods -o wide 2>/dev/null || true - echo "=== GatewayClass status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get gatewayclass -o yaml 2>/dev/null || true - echo "=== Gateway status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get gateways -A -o yaml 2>/dev/null || true - echo "=== DCGM Exporter pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ - get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true - echo "=== Monitoring pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -o wide 2>/dev/null || true - echo "=== DRA ResourceSlices ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get resourceslices -o wide 2>/dev/null || true - 
echo "=== Node status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true - - - name: GPU Test Cleanup - if: always() - uses: ./.github/actions/gpu-test-cleanup - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - artifact_name_prefix: gpu-inference-test-debug + name: GPU Inference Test (nvkind + H100 x1) + uses: ./.github/workflows/gpu-h100-kind-runtime-test.yaml + with: + job_name: GPU Inference Test (nvkind + H100 x1) + cluster_name: gpu-inference-test + intent: inference + platform: dynamo + chainsaw_path: tests/chainsaw/ai-conformance/kind-inference-dynamo + artifact_name_prefix: gpu-inference-test-debug + run_full_validation: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.run_full_validation) }} diff --git a/.github/workflows/gpu-h100-kind-runtime-test.yaml b/.github/workflows/gpu-h100-kind-runtime-test.yaml new file mode 100644 index 000000000..c2140082f --- /dev/null +++ b/.github/workflows/gpu-h100-kind-runtime-test.yaml @@ -0,0 +1,146 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: GPU H100 Kind Runtime Test + +on: + workflow_call: + inputs: + job_name: + description: 'Display name for the H100 runtime job' + required: true + type: string + cluster_name: + description: 'Kind cluster name' + required: true + type: string + intent: + description: 'Runtime intent passed to the bundle installer' + required: true + type: string + platform: + description: 'Runtime platform passed to the bundle installer' + required: true + type: string + chainsaw_path: + description: 'Chainsaw health-check directory' + required: true + type: string + artifact_name_prefix: + description: 'Prefix for uploaded debug artifacts' + required: true + type: string + run_full_validation: + description: 'Run snapshot validation and CNCF AI Conformance validation' + required: false + type: boolean + default: false + +permissions: + contents: read + +jobs: + gpu-h100-kind-runtime-test: + name: ${{ inputs.job_name }} + runs-on: linux-amd64-gpu-h100-latest-1 + timeout-minutes: ${{ inputs.run_full_validation && 180 || 150 }} + concurrency: + group: gpu-h100-${{ github.event_name }}-${{ github.ref }}-${{ inputs.intent }} + cancel-in-progress: true + + env: + KIND_CLUSTER_NAME: ${{ inputs.cluster_name }} + + steps: + - name: Checkout Code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Prepare GPU workflow + id: prepare + uses: ./.github/actions/gpu-workflow-prepare + + - name: Set up GPU cluster + timeout-minutes: 25 + uses: ./.github/actions/gpu-cluster-setup + with: + kind_node_image: ${{ steps.prepare.outputs.h100_kind_node_image }} + min_gpu_count: '1' + gpu_model_pattern: H100 + min_free_disk_gb: '50' + min_available_memory_gb: '16' + cluster_create_timeout: 900s + control_plane_resource_patches: 'true' + control_plane_leader_election_tuning: 'true' + + - name: Build AICR CLI + timeout-minutes: 10 + uses: ./.github/actions/aicr-build + with: + build_snapshot_agent: 'false' + validator_phases: 
'none' + + - name: Install runtime bundle + id: bundle-install + timeout-minutes: 80 + uses: ./.github/actions/runtime-install + with: + method: bundle + accelerator: h100 + intent: ${{ inputs.intent }} + platform: ${{ inputs.platform }} + wait: 'true' + best_effort: 'false' + + - name: Snapshot and validate GPU + if: inputs.run_full_validation + timeout-minutes: 30 + uses: ./.github/actions/gpu-snapshot-validate + with: + gpu_model: H100 + min_gpu_count: '1' + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + snapshot_timeout: 10m + + - name: Run chainsaw health checks + timeout-minutes: 20 + uses: ./.github/actions/gpu-chainsaw-health + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + chainsaw_path: ${{ inputs.chainsaw_path }} + chainsaw_version: ${{ steps.prepare.outputs.chainsaw }} + chainsaw_sha256: ${{ steps.prepare.outputs.chainsaw_sha256_linux_amd64 }} + + - name: Run CNCF AI Conformance + if: inputs.run_full_validation + timeout-minutes: 60 + id: validate-conformance + uses: ./.github/actions/gpu-validate-conformance + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + kwok_helm_timeout: 600s + ko_build_timeout: 1200s + karpenter_helm_timeout: 600s + + - name: GPU Test Cleanup + if: always() + timeout-minutes: 15 + uses: ./.github/actions/gpu-test-cleanup + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + artifact_name_prefix: ${{ inputs.artifact_name_prefix }} + collect_artifacts: ${{ job.status != 'success' }} + diagnostic_mode: ${{ inputs.intent }} + upload_validation_artifacts: ${{ inputs.run_full_validation && steps.validate-conformance.outcome != 'skipped' }} diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index d3a04de03..4dd30f872 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -12,17 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-name: GPU Training Test (nvkind + H100 x2) +name: GPU Training Test (nvkind + H100 x1) on: schedule: - - cron: '30 6,18 * * *' # Every 12 hours (2x daily), offset from inference test + - cron: '15 0 * * *' # Daily, 6h offset from inference test push: branches: - "pull-request/[0-9]+" - pull_request: - types: [labeled] - workflow_dispatch: {} # Allow manual runs + workflow_dispatch: + inputs: + run_full_validation: + description: 'Run snapshot and CNCF AI Conformance validation' + required: false + type: boolean + default: false permissions: contents: read @@ -40,6 +44,8 @@ jobs: should-run: ${{ steps.filter.outputs.matched }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 id: filter with: @@ -49,16 +55,24 @@ jobs: - '.github/workflows/gpu-h100-training-test.yaml' - '.settings.yaml' - '.github/actions/gpu-cluster-setup/**' - - '.github/actions/gpu-operator-install/**' + - '.github/actions/runtime-install/**' + - '.github/actions/check-control-plane-health/**' - '.github/actions/aicr-build/**' - '.github/actions/setup-build-tools/**' - - '.github/actions/install-karpenter-kwok/**' - 'validators/*/Dockerfile' - - 'pkg/evidence/**' + - 'validators/conformance/**' + - 'recipes/validators/catalog.yaml' + - '.github/workflows/gpu-h100-kind-runtime-test.yaml' + - '.github/actions/gpu-workflow-prepare/**' + - '.github/actions/gpu-debug-diagnostics/**' + - '.github/actions/gpu-chainsaw-health/**' + - '.github/actions/gpu-validate-conformance/**' - '.github/actions/gpu-test-cleanup/**' - '.github/actions/load-versions/**' + - '.github/scripts/gpu-chainsaw-health.sh' + - '.github/scripts/gpu-debug-diagnostics.sh' + - 'pkg/bundler/deployer/helm/**' - 'tests/chainsaw/chainsaw-config.yaml' - - 'tests/chainsaw/ai-conformance/main.go' - 'tests/chainsaw/ai-conformance/common/**' - 'tests/chainsaw/ai-conformance/kind-common/**' - 
'tests/chainsaw/ai-conformance/kind-training-kubeflow/**' @@ -67,198 +81,29 @@ jobs: - 'recipes/overlays/h100-kind-training-kubeflow.yaml' - 'recipes/mixins/platform-kubeflow.yaml' - 'recipes/components/kubeflow-trainer/**' - - 'kwok/manifests/karpenter/**' - - 'kwok/scripts/install-karpenter-kwok.sh' - 'recipes/components/prometheus-adapter/**' - 'pkg/collector/**' - 'pkg/snapshotter/**' - '.github/actions/gpu-snapshot-validate/**' - - 'pkg/validator/job/**' - - 'pkg/validator/catalog/**' - - 'pkg/defaults/timeouts.go' - - 'validators/conformance/**' + # NVIDIA self-hosted GPU runners reject pull_request event jobs before + # checkout. PR GPU coverage runs through the pull-request/ push + # mirror after ok-to-test approval. gpu-training-test: needs: [check-paths] if: > always() && ( github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') || (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true') ) - name: GPU Training Test (nvkind + H100 x2) - concurrency: - group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} - cancel-in-progress: true - runs-on: linux-amd64-gpu-h100-latest-2 - timeout-minutes: 120 - - env: - KIND_CLUSTER_NAME: gpu-training-test - - steps: - - - name: Checkout Code - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - persist-credentials: false - - - name: Set up GPU cluster - uses: ./.github/actions/gpu-cluster-setup - - - name: Build aicr - uses: ./.github/actions/aicr-build - with: - validator_phases: 'conformance' - - - name: Install runtime bundle - id: bundle-install - uses: ./.github/actions/gpu-operator-install - with: - method: bundle - accelerator: h100 - intent: training - platform: kubeflow - - # --- Snapshot and GPU validation --- - - - name: Snapshot and validate GPU - uses: ./.github/actions/gpu-snapshot-validate - with: - gpu_model: H100 - 
min_gpu_count: '2' - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - - # --- Install Karpenter + KWOK early to give monitoring stack settle time --- - - - name: Install Karpenter + KWOK - uses: ./.github/actions/install-karpenter-kwok - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - - # --- Health checks --- - - - name: Prepare chainsaw - id: versions - uses: ./.github/actions/load-versions - - - name: Install chainsaw - uses: ./.github/actions/setup-build-tools - with: - install_chainsaw: 'true' - chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' - - - name: Run chainsaw health checks - run: | - chainsaw test \ - --test-dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --config tests/chainsaw/chainsaw-config.yaml - - # --- CNCF AI Conformance validation --- - # Runs last to ensure the DCGM → Prometheus → adapter pipeline - # has had time to bootstrap (pod-autoscaling check needs live metric data). - - - name: Verify expected resources exist - run: | - go run ./tests/chainsaw/ai-conformance/ \ - --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --dir tests/chainsaw/ai-conformance/common \ - --dir tests/chainsaw/ai-conformance/kind-common \ - --kubeconfig="${HOME}/.kube/config" \ - --debug - - - name: Validate CNCF AI Conformance - id: validate-conformance - run: | - AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \ - ./aicr validate \ - --recipe recipe.yaml \ - --phase conformance \ - --namespace gpu-operator \ - --kubeconfig="${HOME}/.kube/config" \ - --require-gpu \ - --image=ko.local:smoke-test \ - --timeout=10m \ - --toleration '*' \ - --output=validation-result.yaml \ - --evidence-dir=conformance-evidence - - # --- Validation artifacts --- - - # Collect a post-run resource snapshot regardless of whether conformance - # validation ran, so triage always has a cluster-state artifact. 
- - name: Collect validation artifacts - if: >- - always() - && !cancelled() - && steps.bundle-install.outcome == 'success' - continue-on-error: true - shell: bash - run: | - set -o pipefail - mkdir -p conformance-evidence - go run ./tests/chainsaw/ai-conformance/ \ - --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --dir tests/chainsaw/ai-conformance/common \ - --dir tests/chainsaw/ai-conformance/kind-common \ - --kubeconfig="${HOME}/.kube/config" \ - --debug | tee conformance-evidence/resource-existence-post.txt - - - name: Upload validation artifacts - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: conformance-evidence - path: | - conformance-evidence/ - validation-result.yaml - if-no-files-found: warn - - # --- Debug diagnostics (before cleanup so resources still exist) --- - - - name: Debug diagnostics - if: failure() - run: | - echo "=== Grafana deployment ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get deployment grafana -o wide 2>/dev/null || true - echo "=== Grafana pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods \ - -l app.kubernetes.io/name=grafana -o wide 2>/dev/null || true - echo "=== Grafana deployment describe ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe deployment grafana 2>/dev/null || true - echo "=== Grafana pod describe ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring describe pods \ - -l app.kubernetes.io/name=grafana 2>/dev/null || true - echo "=== KAI scheduler pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler get pods -o wide 2>/dev/null || true - echo "=== KAI scheduler logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kai-scheduler \ - logs deployment/kai-scheduler-default --tail=100 2>/dev/null || true - echo "=== KAI scheduler queues ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get queues -A 2>/dev/null || true - 
echo "=== KAI scheduler podgroups ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get podgroups -A 2>/dev/null || true - echo "=== Kubeflow Trainer deployment ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get deployment kubeflow-trainer-controller-manager -o wide 2>/dev/null || true - echo "=== Kubeflow pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kubeflow get pods -o wide 2>/dev/null || true - echo "=== Kubeflow validating webhooks ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get validatingwebhookconfigurations validator.trainer.kubeflow.org -o yaml 2>/dev/null || true - echo "=== Kubeflow Trainer CRD ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get crd trainjobs.trainer.kubeflow.org -o yaml 2>/dev/null || true - echo "=== Non-running pods (all namespaces) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A \ - --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true - echo "=== GPU Operator pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true - echo "=== Node resources ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" describe nodes 2>/dev/null | \ - grep -A 20 "Allocated resources" || true - - - name: GPU Test Cleanup - if: always() - uses: ./.github/actions/gpu-test-cleanup - with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - artifact_name_prefix: gpu-training-test-debug + name: GPU Training Test (nvkind + H100 x1) + uses: ./.github/workflows/gpu-h100-kind-runtime-test.yaml + with: + job_name: GPU Training Test (nvkind + H100 x1) + cluster_name: gpu-training-test + intent: training + platform: kubeflow + chainsaw_path: tests/chainsaw/ai-conformance/kind-training-kubeflow + artifact_name_prefix: gpu-training-test-debug + run_full_validation: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.run_full_validation) }} diff --git a/.github/workflows/gpu-smoke-test.yaml 
b/.github/workflows/gpu-smoke-test.yaml index d5b8c5c74..b2b609962 100644 --- a/.github/workflows/gpu-smoke-test.yaml +++ b/.github/workflows/gpu-smoke-test.yaml @@ -20,8 +20,6 @@ on: push: branches: - "pull-request/[0-9]+" - pull_request: - types: [labeled] workflow_dispatch: {} # Allow manual runs permissions: @@ -40,6 +38,8 @@ jobs: should-run: ${{ steps.filter.outputs.matched }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 id: filter with: @@ -47,11 +47,17 @@ jobs: filters: | matched: - '.github/workflows/gpu-smoke-test.yaml' + - '.settings.yaml' - '.github/actions/gpu-cluster-setup/**' - - '.github/actions/gpu-operator-install/**' + - '.github/actions/runtime-install/**' - '.github/actions/aicr-build/**' + - '.github/actions/gpu-debug-diagnostics/**' - '.github/actions/gpu-test-cleanup/**' + - '.github/actions/gpu-smoke-nvidia-smi/**' - '.github/actions/load-versions/**' + - '.github/scripts/gpu-debug-diagnostics.sh' + - '.github/scripts/gpu-smoke-run-nvidia-smi.sh' + - '.github/scripts/gpu-smoke-show-nvidia-smi-log.sh' - 'pkg/collector/**' - 'pkg/snapshotter/**' - '.github/actions/gpu-snapshot-validate/**' @@ -62,11 +68,13 @@ jobs: gpu-smoke-test: needs: [check-paths] + # NVIDIA self-hosted GPU runners reject pull_request event jobs before + # checkout. PR GPU coverage runs through the pull-request/ push + # mirror after ok-to-test approval. 
if: > always() && ( github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && github.event.label.name == 'run-gpu-tests') || (github.event_name == 'push' && needs.check-paths.outputs.should-run == 'true') ) name: GPU Smoke Test (nvkind + L40G) @@ -86,45 +94,43 @@ jobs: with: persist-credentials: false + - name: Runner preflight snapshot + shell: bash + run: | + echo "::group::Runner preflight snapshot" + echo "hostname: $(hostname)" + echo "kernel: $(uname -a)" + echo "uptime: $(uptime)" + echo "loadavg: $(cat /proc/loadavg 2>/dev/null || echo unavailable)" + echo "nproc: $(nproc 2>/dev/null || echo unavailable)" + free -h 2>/dev/null || true + df -h / 2>/dev/null || true + echo "::endgroup::" + - name: Set up GPU cluster uses: ./.github/actions/gpu-cluster-setup + with: + # Keep smoke runner preflight explicit so action default changes do not + # silently alter L40G coverage. + min_gpu_count: '1' + min_free_disk_gb: '20' + min_available_memory_gb: '8' - name: Build aicr uses: ./.github/actions/aicr-build with: + build_snapshot_agent: 'false' validator_phases: 'none' - name: Install GPU operator (helm) - uses: ./.github/actions/gpu-operator-install + uses: ./.github/actions/runtime-install with: method: helm - name: Run nvidia-smi in a pod - run: | - cat <<'EOF' | kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - - apiVersion: v1 - kind: Pod - metadata: - name: gpu-smoke-test - spec: - restartPolicy: Never - containers: - - name: nvidia-smi - image: ubuntu:22.04 - command: ["nvidia-smi"] - resources: - limits: - nvidia.com/gpu: 1 - EOF - - echo "Waiting for gpu-smoke-test pod to complete..." 
- kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \ - --for=condition=Ready --timeout=120s || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" wait pod/gpu-smoke-test \ - --for=jsonpath='{.status.phase}'=Succeeded --timeout=120s - - - name: Show nvidia-smi output - run: kubectl --context="kind-${KIND_CLUSTER_NAME}" logs gpu-smoke-test + uses: ./.github/actions/gpu-smoke-nvidia-smi + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} # --- Snapshot and validation --- @@ -135,22 +141,10 @@ jobs: min_gpu_count: '1' cluster_name: ${{ env.KIND_CLUSTER_NAME }} - - name: Debug diagnostics - if: failure() - run: | - echo "=== ClusterPolicy status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true - echo "=== GPU Operator pods ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true - echo "=== Non-running pods (all namespaces) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true - echo "=== Recent events (gpu-operator) ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true - echo "=== Node status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true - - name: GPU Test Cleanup if: always() uses: ./.github/actions/gpu-test-cleanup with: cluster_name: ${{ env.KIND_CLUSTER_NAME }} + collect_artifacts: ${{ job.status != 'success' }} + diagnostic_mode: smoke diff --git a/.settings.yaml b/.settings.yaml index 75b4559b1..43da8de37 100644 --- a/.settings.yaml +++ b/.settings.yaml @@ -40,6 +40,7 @@ security_tools: testing_tools: kubectl: 'v1.35.0' kind: '0.31.0' + nvkind: '78a0a514c41c3e77ac0d935f38d971d3b4455138' ctlptl: '0.9.0' tilt: '0.37.0' helm: 'v4.1.1' @@ -71,6 +72,9 @@ docs_tools: # Testing Configuration testing: kind_node_image: 
'kindest/node:v1.32.0' + h100_kind_node_image: 'kindest/node:v1.35.0' + gpu_operator_chart_version: 'v25.10.1' + snapshot_agent_cuda_image: 'nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04' # Component test harness configuration # Used by tools/component-test/ scripts to validate individual components diff --git a/kwok/scripts/install-karpenter-kwok.sh b/kwok/scripts/install-karpenter-kwok.sh index 72b64dae1..53e70fabd 100755 --- a/kwok/scripts/install-karpenter-kwok.sh +++ b/kwok/scripts/install-karpenter-kwok.sh @@ -41,7 +41,10 @@ KARPENTER_VERSION="${KARPENTER_VERSION:-v1.8.0}" KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME:?KIND_CLUSTER_NAME must be set}" KARPENTER_NAMESPACE="${KARPENTER_NAMESPACE:-karpenter}" KARPENTER_CLONE_DIR="${KARPENTER_CLONE_DIR:-/tmp/karpenter}" -KO_BUILD_TIMEOUT="${KO_BUILD_TIMEOUT:-900}" # 15 minutes +KWOK_HELM_TIMEOUT="${KWOK_HELM_TIMEOUT:-300s}" +KO_BUILD_TIMEOUT="${KO_BUILD_TIMEOUT:-900s}" # 15 minutes +KARPENTER_HELM_TIMEOUT="${KARPENTER_HELM_TIMEOUT:-300s}" +KUBE_CONTEXT="${KUBE_CONTEXT:-kind-${KIND_CLUSTER_NAME}}" RED='\033[0;31m' GREEN='\033[0;32m' @@ -52,6 +55,14 @@ log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } log_error() { echo -e "${RED}[ERROR]${NC} $*"; } +kubectl_kind() { + kubectl --context="${KUBE_CONTEXT}" "$@" +} + +helm_kind() { + helm --kube-context "${KUBE_CONTEXT}" "$@" +} + # ------------------------------------------------------------------- # Step 1: Install KWOK controller # Uses the same approach as kwok/scripts/run-all-recipes.sh @@ -59,19 +70,20 @@ log_error() { echo -e "${RED}[ERROR]${NC} $*"; } install_kwok() { log_info "Installing KWOK controller..." 
- if kubectl get deployment -n kube-system kwok-controller &>/dev/null; then + if kubectl_kind get deployment -n kube-system kwok-controller &>/dev/null; then log_info "KWOK controller already installed, skipping" return 0 fi helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update - helm upgrade --install kwok-controller kwok/kwok \ + helm_kind upgrade --install kwok-controller kwok/kwok \ --namespace kube-system \ --set hostNetwork=true \ - --wait --timeout 300s + --wait --timeout "${KWOK_HELM_TIMEOUT}" - helm upgrade --install kwok-stage-fast kwok/stage-fast \ - --namespace kube-system + helm_kind upgrade --install kwok-stage-fast kwok/stage-fast \ + --namespace kube-system \ + --wait --timeout "${KWOK_HELM_TIMEOUT}" log_info "KWOK controller installed" } @@ -98,11 +110,16 @@ build_karpenter() { # Redirect stderr to avoid Go compilation warnings corrupting the image reference. # Output format: kind.local/: # Hard timeout prevents a slow/stuck compilation from consuming the entire job. + local ko_stderr="${KARPENTER_CLONE_DIR}/ko-build.stderr" CONTROLLER_IMG=$(timeout "${KO_BUILD_TIMEOUT}" \ env KO_DOCKER_REPO=kind.local \ KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME}" \ - ko build sigs.k8s.io/karpenter/kwok 2>/dev/null) || { - log_error "ko build failed or timed out after ${KO_BUILD_TIMEOUT}s" + ko build sigs.k8s.io/karpenter/kwok 2>"${ko_stderr}") || { + log_error "ko build failed or timed out after ${KO_BUILD_TIMEOUT}" + if [[ -s "${ko_stderr}" ]]; then + log_error "ko build stderr:" + sed 's/^/ /' "${ko_stderr}" || true + fi exit 1 } @@ -141,20 +158,20 @@ deploy_karpenter() { log_info "Deploying Karpenter to namespace ${KARPENTER_NAMESPACE}..." # Apply CRDs first - kubectl apply -f "${KARPENTER_CLONE_DIR}/kwok/charts/crds" + kubectl_kind apply -f "${KARPENTER_CLONE_DIR}/kwok/charts/crds" # Create namespace and instance types ConfigMap before Helm install # so the volume mount can reference it immediately. 
- kubectl create namespace "${KARPENTER_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - + kubectl_kind create namespace "${KARPENTER_NAMESPACE}" --dry-run=client -o yaml | kubectl_kind apply -f - local instance_types_file="${MANIFESTS_DIR}/instance-types.json" if [[ ! -f "${instance_types_file}" ]]; then log_error "Instance types file not found: ${instance_types_file}" exit 1 fi - kubectl create configmap -n "${KARPENTER_NAMESPACE}" karpenter-instance-types \ + kubectl_kind create configmap -n "${KARPENTER_NAMESPACE}" karpenter-instance-types \ --from-file=instance-types.json="${instance_types_file}" \ - --dry-run=client -o yaml | kubectl apply -f - + --dry-run=client -o yaml | kubectl_kind apply -f - # Build the image tag argument. If ko provided a tag, use it. # If not, omit it and let the chart default to its AppVersion. @@ -169,7 +186,7 @@ deploy_karpenter() { # - extraVolumes + extraVolumeMounts: mount the instance types ConfigMap # - controller.env: set INSTANCE_TYPES_FILE_PATH for the KWOK provider # shellcheck disable=SC2086 - helm upgrade --install karpenter "${KARPENTER_CLONE_DIR}/kwok/charts" \ + helm_kind upgrade --install karpenter "${KARPENTER_CLONE_DIR}/kwok/charts" \ --namespace "${KARPENTER_NAMESPACE}" --create-namespace \ --set controller.image.repository="${IMG_REPOSITORY}" \ ${tag_arg} \ @@ -187,17 +204,17 @@ deploy_karpenter() { --set 'controller.extraVolumeMounts[0].readOnly=true' \ --set 'controller.env[0].name=INSTANCE_TYPES_FILE_PATH' \ --set 'controller.env[0].value=/etc/karpenter/instance-types/instance-types.json' \ - --wait --timeout 300s \ + --wait --timeout "${KARPENTER_HELM_TIMEOUT}" \ || { log_error "Helm install failed. 
Diagnostics:" - kubectl -n "${KARPENTER_NAMESPACE}" get pods -o wide 2>/dev/null || true - kubectl -n "${KARPENTER_NAMESPACE}" describe deployment karpenter 2>/dev/null || true + kubectl_kind -n "${KARPENTER_NAMESPACE}" get pods -o wide 2>/dev/null || true + kubectl_kind -n "${KARPENTER_NAMESPACE}" describe deployment karpenter 2>/dev/null || true local POD - POD=$(kubectl -n "${KARPENTER_NAMESPACE}" get pods -l app.kubernetes.io/name=karpenter \ + POD=$(kubectl_kind -n "${KARPENTER_NAMESPACE}" get pods -l app.kubernetes.io/name=karpenter \ -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) if [[ -n "${POD}" ]]; then - kubectl -n "${KARPENTER_NAMESPACE}" describe pod "${POD}" 2>/dev/null || true - kubectl -n "${KARPENTER_NAMESPACE}" logs "${POD}" --tail=50 2>/dev/null || true + kubectl_kind -n "${KARPENTER_NAMESPACE}" describe pod "${POD}" 2>/dev/null || true + kubectl_kind -n "${KARPENTER_NAMESPACE}" logs "${POD}" --tail=50 2>/dev/null || true fi exit 1 } @@ -212,7 +229,9 @@ main() { log_info "=== Karpenter KWOK Provider Installation ===" log_info "Karpenter version: ${KARPENTER_VERSION}" log_info "Kind cluster: ${KIND_CLUSTER_NAME}" + log_info "Kube context: ${KUBE_CONTEXT}" log_info "Namespace: ${KARPENTER_NAMESPACE}" + log_info "Timeouts: kwok=${KWOK_HELM_TIMEOUT} ko-build=${KO_BUILD_TIMEOUT} karpenter=${KARPENTER_HELM_TIMEOUT}" install_kwok build_karpenter diff --git a/kwok/scripts/run-all-recipes.sh b/kwok/scripts/run-all-recipes.sh index 459b054b5..6b4af1549 100755 --- a/kwok/scripts/run-all-recipes.sh +++ b/kwok/scripts/run-all-recipes.sh @@ -37,6 +37,31 @@ log_info() { echo -e "${GREEN}[INFO]${NC} $*"; } log_warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } log_error() { echo -e "${RED}[ERROR]${NC} $*"; } +retry_command() { + local description="$1" + shift + + local max_attempts="${KWOK_COMMAND_RETRIES:-3}" + local delay="${KWOK_COMMAND_RETRY_DELAY:-5}" + local attempt=1 + + while true; do + if "$@"; then + return 0 + fi + + if ((attempt >= max_attempts)); 
then + log_error "${description} failed after ${attempt} attempt(s)" + return 1 + fi + + log_warn "${description} failed (attempt ${attempt}/${max_attempts}); retrying in ${delay}s..." + sleep "${delay}" + attempt=$((attempt + 1)) + delay=$((delay * 2)) + done +} + # Find recipes with service criteria (testable cloud configurations) get_recipes() { for overlay in "${OVERLAYS_DIR}"/*.yaml; do @@ -68,10 +93,13 @@ ensure_cluster() { if ! kubectl get deployment -n kube-system kwok-controller &>/dev/null; then log_info "Installing KWOK controller..." - helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update - helm upgrade --install kwok-controller kwok/kwok \ + retry_command "Adding KWOK Helm repository" \ + helm repo add kwok https://kwok.sigs.k8s.io/charts/ --force-update + retry_command "Installing KWOK controller" \ + helm upgrade --install kwok-controller kwok/kwok \ --namespace kube-system --set hostNetwork=true --wait - helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system + retry_command "Installing KWOK stage-fast" \ + helm upgrade --install kwok-stage-fast kwok/stage-fast --namespace kube-system fi # Patch kindnet to exclude KWOK nodes diff --git a/recipes/overlays/kind.yaml b/recipes/overlays/kind.yaml index b0d8dbd76..f38462580 100644 --- a/recipes/overlays/kind.yaml +++ b/recipes/overlays/kind.yaml @@ -115,6 +115,11 @@ spec: - name: kube-prometheus-stack type: Helm overrides: + # CI only needs component health, not the full upstream alerting rule + # set. Skipping default rules reduces PrometheusRule churn during + # install on small kind control planes. + defaultRules: + create: false prometheus: prometheusSpec: # Smaller storage for local testing @@ -132,14 +137,35 @@ spec: memory: 1Gi # Shorter retention for local testing retention: 7d - grafana: + prometheusOperator: + # Keep operator-owned monitoring custom resources in the monitoring + # namespace for kind. 
Do not scope ServiceMonitor discovery here; + # GPU, Kubeflow, and Dynamo monitors may live in their own namespaces. + alertmanagerInstanceNamespaces: + - monitoring + alertmanagerConfigNamespaces: + - monitoring + prometheusInstanceNamespaces: + - monitoring + thanosRulerInstanceNamespaces: + - monitoring + # CI kind control planes can be slow under image pulls and controller + # startup. Avoid restarting the operator on short health probe stalls. + livenessProbe: + timeoutSeconds: 10 + failureThreshold: 10 + readinessProbe: + timeoutSeconds: 10 + failureThreshold: 6 resources: requests: cpu: 100m - memory: 128Mi + memory: 256Mi limits: cpu: 500m memory: 512Mi + grafana: + enabled: false alertmanager: alertmanagerSpec: resources: diff --git a/recipes/validators/README.md b/recipes/validators/README.md index e4a7bfa47..b45808a4a 100644 --- a/recipes/validators/README.md +++ b/recipes/validators/README.md @@ -55,7 +55,7 @@ Applied by `catalog.Load` in order: | Name | Description | Timeout | |------|-------------|---------| | `dra-support` | Verify Dynamic Resource Allocation support | 5m | -| `gang-scheduling` | Verify gang scheduling with KAI scheduler | 10m | +| `gang-scheduling` | Verify gang scheduling with KAI scheduler using CPU-only workers | 10m | | `accelerator-metrics` | Verify accelerator metrics from DCGM exporter | 5m | | `ai-service-metrics` | Verify AI service metrics via Prometheus | 5m | | `inference-gateway` | Verify inference gateway (kgateway) is operational | 5m | diff --git a/recipes/validators/catalog.yaml b/recipes/validators/catalog.yaml index 6f50bc695..d322ac011 100644 --- a/recipes/validators/catalog.yaml +++ b/recipes/validators/catalog.yaml @@ -88,7 +88,7 @@ validators: env: [] - name: gang-scheduling phase: conformance - description: "Verify gang scheduling with KAI scheduler" + description: "Verify gang scheduling with KAI scheduler using CPU-only workers" image: ghcr.io/nvidia/aicr-validators/conformance:latest timeout: 10m args: 
["gang-scheduling"] diff --git a/tests/chainsaw/ai-conformance/README.md b/tests/chainsaw/ai-conformance/README.md index b1a88e9d4..a69b88f13 100644 --- a/tests/chainsaw/ai-conformance/README.md +++ b/tests/chainsaw/ai-conformance/README.md @@ -73,10 +73,11 @@ tests/chainsaw/ai-conformance/ │ ├── assert-cert-manager.yaml # cert-manager healthy │ ├── assert-dra-driver.yaml # DRA driver healthy │ ├── assert-kai-scheduler.yaml # KAI scheduler healthy -│ ├── assert-monitoring.yaml # Prometheus stack healthy +│ ├── assert-monitoring.yaml # Prometheus stack healthy with Grafana │ └── assert-skyhook.yaml # Skyhook operator healthy ├── kind-common/ # Shared Kind-only assertions │ ├── assert-gpu-operator.yaml # GPU operator healthy on kind +│ ├── assert-monitoring.yaml # Prometheus stack healthy without Grafana │ ├── assert-network-operator.yaml # Network operator healthy on kind │ └── assert-nvsentinel.yaml # NVSentinel healthy on kind ├── kind-inference-dynamo/ # Kind + H100 + inference + dynamo leaf suite diff --git a/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml b/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml new file mode 100644 index 000000000..868be3fea --- /dev/null +++ b/tests/chainsaw/ai-conformance/kind-common/assert-monitoring.yaml @@ -0,0 +1,85 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Assert kind monitoring stack components required by H100 CI are healthy. +# Grafana is intentionally not asserted here because conformance metrics use +# Prometheus, DCGM exporter, and prometheus-adapter directly. + +# Prometheus Operator - manages Prometheus, Alertmanager, and ServiceMonitor CRs +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-prometheus-operator + namespace: monitoring +status: + (conditions[?type == 'Available']): + - status: "True" +--- +# kube-state-metrics - Kubernetes object state metrics +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: monitoring +status: + (conditions[?type == 'Available']): + - status: "True" +--- +# Prometheus StatefulSet - time series database +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus-kube-prometheus-prometheus + namespace: monitoring +status: + (readyReplicas > `0`): true +--- +# Alertmanager StatefulSet - alert routing and silencing +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: alertmanager-kube-prometheus-alertmanager + namespace: monitoring +status: + (readyReplicas > `0`): true +--- +# Prometheus Node Exporter DaemonSet - node-level hardware/OS metrics +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: prometheus-node-exporter + namespace: monitoring +status: + (numberReady > `0`): true + (desiredNumberScheduled > `0`): true +--- +# k8s-ephemeral-storage-metrics - ephemeral storage usage metrics +apiVersion: apps/v1 +kind: Deployment +metadata: + name: k8s-ephemeral-storage-metrics + namespace: monitoring +status: + (conditions[?type == 'Available']): + - status: "True" +--- +# Prometheus Adapter - custom metrics API for HPA +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-adapter + namespace: monitoring +status: + (conditions[?type == 'Available']): + - status: "True" diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml 
b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml index 1b1f701ad..51c7af093 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml @@ -65,10 +65,10 @@ spec: # ── Monitoring ───────────────────────────────────────────────────── - name: assert-monitoring - description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter. + description: Verify kind monitoring stack components. try: - assert: - file: ../common/assert-monitoring.yaml + file: ../kind-common/assert-monitoring.yaml # ── kgateway ─────────────────────────────────────────────────────── - name: assert-kgateway @@ -110,6 +110,8 @@ spec: # ── KAI Scheduler ────────────────────────────────────────────────── - name: assert-kai-scheduler description: Verify KAI scheduler is available. + timeouts: + assert: 600s try: - assert: file: ../common/assert-kai-scheduler.yaml diff --git a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml index 382d99104..d16d3ad38 100644 --- a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml @@ -60,10 +60,10 @@ spec: file: ../kind-common/assert-gpu-operator.yaml - name: assert-monitoring - description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter. + description: Verify kind monitoring stack components. try: - assert: - file: ../common/assert-monitoring.yaml + file: ../kind-common/assert-monitoring.yaml - name: assert-skyhook description: Verify Skyhook operator controller-manager is available. @@ -73,6 +73,8 @@ spec: - name: assert-kubeflow-trainer description: Verify Kubeflow Trainer controller, validating webhook, and TrainJob CRD are available. 
+ timeouts: + assert: 600s try: - assert: file: assert-kubeflow-trainer.yaml @@ -99,6 +101,8 @@ spec: - name: assert-kai-scheduler description: Verify KAI scheduler is available. + timeouts: + assert: 600s try: - assert: file: ../common/assert-kai-scheduler.yaml diff --git a/validators/conformance/gang_scheduling_check.go b/validators/conformance/gang_scheduling_check.go index 2bfda2612..e29e20de7 100644 --- a/validators/conformance/gang_scheduling_check.go +++ b/validators/conformance/gang_scheduling_check.go @@ -26,7 +26,6 @@ import ( "github.com/NVIDIA/aicr/pkg/errors" "github.com/NVIDIA/aicr/pkg/k8s" "github.com/NVIDIA/aicr/validators" - "github.com/NVIDIA/aicr/validators/helper" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" @@ -40,7 +39,6 @@ const ( gangTestNamespace = "gang-scheduling-test" gangTestPrefix = "gang-test-" gangPodPrefix = "gang-worker-" - gangClaimPrefix = "gang-gpu-claim-" gangGroupPrefix = "gang-group-" gangMinMembers = 2 ) @@ -60,12 +58,15 @@ var podGroupGVR = schema.GroupVersionResource{ Group: "scheduling.run.ai", Version: "v2alpha2", Resource: "podgroups", } +// Gang scheduling scope: this check validates KAI PodGroup co-scheduling only. +// GPU access and DRA allocation are covered by the DRA support and secure +// accelerator access checks so full conformance can run on one H100. + // gangTestRun holds per-invocation resource names to avoid collisions. type gangTestRun struct { suffix string groupName string pods [gangMinMembers]string - claims [gangMinMembers]string } type gangSchedulingReport struct { @@ -86,15 +87,16 @@ func newGangTestRun() (*gangTestRun, error) { } for i := range gangMinMembers { run.pods[i] = fmt.Sprintf("%s%s-%d", gangPodPrefix, suffix, i) - run.claims[i] = fmt.Sprintf("%s%s-%d", gangClaimPrefix, suffix, i) } return run, nil } // CheckGangScheduling validates CNCF requirement #7: Gang Scheduling. 
// Verifies KAI scheduler deployments are running, required CRDs exist, and -// exercises gang scheduling by creating a PodGroup with 2 GPU pods that must -// be co-scheduled via the KAI scheduler. +// exercises gang scheduling by creating a PodGroup with 2 CPU-only pods that +// must be co-scheduled via the KAI scheduler. GPU access and DRA isolation are +// validated separately by the DRA and secure accelerator access checks; keeping +// this workload CPU-only lets one-GPU CI clusters run the full conformance phase. func CheckGangScheduling(ctx *validators.Context) error { if ctx.Clientset == nil { return errors.New(errors.ErrCodeInvalidRequest, "kubernetes client is not available") @@ -162,20 +164,7 @@ func CheckGangScheduling(ctx *validators.Context) error { "kubectl get crd queues.scheduling.run.ai podgroups.scheduling.run.ai", crdSummary.String()) - // 3. Pre-flight: ensure enough free GPUs for the gang test. - total, free, gpuErr := countAvailableGPUs(ctx.Ctx, dynClient) - if gpuErr != nil { - return gpuErr - } - recordArtifact(ctx, "GPU Availability", - fmt.Sprintf("Total GPUs: %d\nFree GPUs: %d\nRequired: %d", total, free, gangMinMembers)) - if free < gangMinMembers { - return errors.New(errors.ErrCodeUnavailable, - fmt.Sprintf("insufficient free GPUs for gang scheduling test: %d free of %d total (need %d)", - free, total, gangMinMembers)) - } - - // 4. Functional test: create PodGroup with 2 GPU pods, verify co-scheduling. + // 3. Functional test: create PodGroup with 2 CPU-only pods, verify co-scheduling. 
run, err := newGangTestRun() if err != nil { return err @@ -187,13 +176,13 @@ func CheckGangScheduling(ctx *validators.Context) error { cleanupGangTestResources(cleanupCtx, ctx.Clientset, dynClient, run) recordRawTextArtifact(ctx, "Delete test namespace", "kubectl delete namespace gang-scheduling-test --ignore-not-found", - "Deleted gang test pods, claims, and PodGroup; namespace retained intentionally to avoid DRA finalizer stalls.") + "Deleted gang test pods and PodGroup; namespace retained intentionally to keep cleanup bounded.") }() recordRawTextArtifact(ctx, "Apply test manifest", - "kubectl apply -f docs/conformance/cncf/manifests/gang-scheduling-test.yaml", - fmt.Sprintf("Created PodGroup=%s ResourceClaims=%s,%s Pods=%s,%s in namespace=%s", - run.groupName, run.claims[0], run.claims[1], run.pods[0], run.pods[1], gangTestNamespace)) + "kubectl apply generated CPU-only PodGroup test resources", + fmt.Sprintf("Created PodGroup=%s Pods=%s,%s in namespace=%s", + run.groupName, run.pods[0], run.pods[1], gangTestNamespace)) if err = deployGangTestResources(ctx.Ctx, ctx.Clientset, dynClient, run, ctx.Tolerations); err != nil { return err @@ -274,7 +263,7 @@ func collectGangTestArtifacts(ctx *validators.Context, dynClient dynamic.Interfa } } -// deployGangTestResources creates the namespace, PodGroup, ResourceClaims, and Pods. +// deployGangTestResources creates the namespace, PodGroup, and worker Pods. // tolerations, when non-nil, replace the default tolerate-all policy on test pods. func deployGangTestResources(ctx context.Context, clientset kubernetes.Interface, dynClient dynamic.Interface, run *gangTestRun, tolerations []corev1.Toleration) error { // 1. Create namespace (idempotent). @@ -292,15 +281,8 @@ func deployGangTestResources(ctx context.Context, clientset kubernetes.Interface return errors.Wrap(errors.ErrCodeInternal, "failed to create PodGroup", err) } - // 3. Create ResourceClaims and Pods. + // 3. Create Pods. 
for i := range gangMinMembers { - claim := buildGangResourceClaim(run, i) - if _, err := dynClient.Resource(claimGVR).Namespace(gangTestNamespace).Create( - ctx, claim, metav1.CreateOptions{}); err != nil { - return errors.Wrap(errors.ErrCodeInternal, - fmt.Sprintf("failed to create ResourceClaim %s", run.claims[i]), err) - } - pod := buildGangTestPod(run, i, tolerations) if _, err := clientset.CoreV1().Pods(gangTestNamespace).Create(ctx, pod, metav1.CreateOptions{}); err != nil { return errors.Wrap(errors.ErrCodeInternal, @@ -380,10 +362,11 @@ func validateGangPatterns(pods [gangMinMembers]*corev1.Pod, run *gangTestRun) (* run.pods[i], run.groupName)) } - // Pod must use DRA (resourceClaims, not device plugin). - if len(pod.Spec.ResourceClaims) == 0 { + // Gang scheduling is intentionally CPU-only. DRA behavior is validated + // separately by dra-support and secure-accelerator-access. + if len(pod.Spec.ResourceClaims) != 0 { return nil, errors.New(errors.ErrCodeInternal, - fmt.Sprintf("gang test pod %s does not use DRA resourceClaims", run.pods[i])) + fmt.Sprintf("gang test pod %s unexpectedly uses resourceClaims", run.pods[i])) } } @@ -445,11 +428,6 @@ func cleanupGangTestResources(ctx context.Context, clientset kubernetes.Interfac return err }) } - // Delete claims. - for i := range gangMinMembers { - _ = k8s.IgnoreNotFound(dynClient.Resource(claimGVR).Namespace(gangTestNamespace).Delete( - ctx, run.claims[i], metav1.DeleteOptions{})) - } // Delete PodGroup. _ = k8s.IgnoreNotFound(dynClient.Resource(podGroupGVR).Namespace(gangTestNamespace).Delete( ctx, run.groupName, metav1.DeleteOptions{})) @@ -473,38 +451,6 @@ func buildPodGroup(run *gangTestRun) *unstructured.Unstructured { } } -// buildGangResourceClaim returns the unstructured ResourceClaim for a gang test pod. -// The kai.scheduler/queue label is required by KAI v0.13.0+ for DRA claims. 
-func buildGangResourceClaim(run *gangTestRun, index int) *unstructured.Unstructured { - return &unstructured.Unstructured{ - Object: map[string]interface{}{ - "apiVersion": "resource.k8s.io/v1", - "kind": "ResourceClaim", - "metadata": map[string]interface{}{ - "name": run.claims[index], - "namespace": gangTestNamespace, - "labels": map[string]interface{}{ - "kai.scheduler/queue": "default-queue", - }, - }, - "spec": map[string]interface{}{ - "devices": map[string]interface{}{ - "requests": []interface{}{ - map[string]interface{}{ - "name": "gpu", - "exactly": map[string]interface{}{ - "deviceClassName": "gpu.nvidia.com", - "allocationMode": "ExactCount", - "count": int64(1), - }, - }, - }, - }, - }, - }, - } -} - // buildGangTestPod returns the Pod spec for a gang scheduling test worker. // tolerations, when non-nil, replace the default tolerate-all policy. func buildGangTestPod(run *gangTestRun, index int, tolerations []corev1.Toleration) *corev1.Pod { @@ -524,22 +470,11 @@ func buildGangTestPod(run *gangTestRun, index int, tolerations []corev1.Tolerati SchedulerName: "kai-scheduler", RestartPolicy: corev1.RestartPolicyNever, Tolerations: tolerations, - ResourceClaims: []corev1.PodResourceClaim{ - { - Name: "gpu", - ResourceClaimName: helper.StrPtr(run.claims[index]), - }, - }, Containers: []corev1.Container{ { Name: "worker", - Image: "nvidia/cuda:12.9.0-base-ubuntu24.04", - Command: []string{"bash", "-c", fmt.Sprintf("nvidia-smi && echo 'Gang worker %d completed successfully'", index)}, - Resources: corev1.ResourceRequirements{ - Claims: []corev1.ResourceClaim{ - {Name: "gpu"}, - }, - }, + Image: defaults.ProbeImage, + Command: []string{"sh", "-c", fmt.Sprintf("echo 'Gang worker %d completed successfully'", index)}, }, }, }, diff --git a/validators/conformance/helpers.go b/validators/conformance/helpers.go index 22ccc8452..5b78d849c 100644 --- a/validators/conformance/helpers.go +++ b/validators/conformance/helpers.go @@ -292,51 +292,3 @@ func 
waitForDeletion(ctx context.Context, getFunc func() error) { }, ) } - -// gpuDriverName is the DRA driver name for NVIDIA GPUs. -const gpuDriverName = "gpu.nvidia.com" - -// countAvailableGPUs counts total GPU devices from ResourceSlices and subtracts -// allocated devices from ResourceClaims to determine how many are free. -func countAvailableGPUs(ctx context.Context, dynClient dynamic.Interface) (total, free int, err error) { - // Count total GPU devices from ResourceSlices. - // Uses package-level resourceSliceGVR defined in secure_access_check.go. - slices, err := dynClient.Resource(resourceSliceGVR).List(ctx, metav1.ListOptions{}) - if err != nil { - return 0, 0, errors.Wrap(errors.ErrCodeInternal, "failed to list ResourceSlices", err) - } - for _, slice := range slices.Items { - driver, _, _ := unstructured.NestedString(slice.Object, "spec", "driver") - if driver != gpuDriverName { - continue - } - devices, found, _ := unstructured.NestedSlice(slice.Object, "spec", "devices") - if found { - total += len(devices) - } - } - - // Count allocated GPU devices from ResourceClaims. 
- var allocated int - claims, err := dynClient.Resource(claimGVR).List(ctx, metav1.ListOptions{}) - if err != nil { - return 0, 0, errors.Wrap(errors.ErrCodeInternal, "failed to list ResourceClaims", err) - } - for _, claim := range claims.Items { - results, found, _ := unstructured.NestedSlice(claim.Object, "status", "allocation", "devices", "results") - if !found { - continue - } - for _, r := range results { - result, ok := r.(map[string]interface{}) - if !ok { - continue - } - if result["driver"] == gpuDriverName { - allocated++ - } - } - } - - return total, total - allocated, nil -} diff --git a/validators/conformance/helpers_test.go b/validators/conformance/helpers_test.go index 1097ccc8b..5b2566b67 100644 --- a/validators/conformance/helpers_test.go +++ b/validators/conformance/helpers_test.go @@ -18,6 +18,7 @@ import ( "strings" "testing" + "github.com/NVIDIA/aicr/pkg/defaults" corev1 "k8s.io/api/core/v1" ) @@ -397,12 +398,6 @@ func TestNewGangTestRun(t *testing.T) { if !strings.HasPrefix(run.pods[i], gangPodPrefix) { t.Errorf("newGangTestRun() pods[%d] = %q, want prefix %q", i, run.pods[i], gangPodPrefix) } - if run.claims[i] == "" { - t.Errorf("newGangTestRun() claims[%d] is empty", i) - } - if !strings.HasPrefix(run.claims[i], gangClaimPrefix) { - t.Errorf("newGangTestRun() claims[%d] = %q, want prefix %q", i, run.claims[i], gangClaimPrefix) - } } // Two calls should produce different suffixes. 
@@ -414,3 +409,31 @@ func TestNewGangTestRun(t *testing.T) { t.Error("newGangTestRun() two calls produced identical suffixes") } } + +func TestBuildGangTestPodUsesCPUOnlyWorkload(t *testing.T) { + run, err := newGangTestRun() + if err != nil { + t.Fatalf("newGangTestRun() error = %v", err) + } + + pod := buildGangTestPod(run, 0, nil) + if pod.Spec.SchedulerName != "kai-scheduler" { + t.Errorf("SchedulerName = %q, want kai-scheduler", pod.Spec.SchedulerName) + } + if got := pod.Labels["pod-group.scheduling.run.ai/name"]; got != run.groupName { + t.Errorf("pod group label = %q, want %q", got, run.groupName) + } + if len(pod.Spec.ResourceClaims) != 0 { + t.Fatalf("ResourceClaims length = %d, want 0", len(pod.Spec.ResourceClaims)) + } + if len(pod.Spec.Containers) != 1 { + t.Fatalf("containers length = %d, want 1", len(pod.Spec.Containers)) + } + container := pod.Spec.Containers[0] + if container.Image != defaults.ProbeImage { + t.Errorf("container image = %q, want %q", container.Image, defaults.ProbeImage) + } + if len(container.Resources.Claims) != 0 { + t.Errorf("container resource claims length = %d, want 0", len(container.Resources.Claims)) + } +} diff --git a/validators/conformance/testdata/README.md b/validators/conformance/testdata/README.md new file mode 100644 index 000000000..efd9e8043 --- /dev/null +++ b/validators/conformance/testdata/README.md @@ -0,0 +1,6 @@ +# Conformance Validator Test Data + +The conformance validator currently does not require static fixtures, but the +CI image build copies each validator phase's `testdata` directory into the +image. Keep this directory tracked so the image build fails only when a phase's +fixture directory is genuinely missing.