From 376af148006f6423845c6b88945391bb671d3c9c Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Tue, 28 Apr 2026 12:29:32 -0700 Subject: [PATCH] ci: centralize GPU CI runtime pins --- .github/actions/aicr-build/action.yml | 11 +++++++++++ .github/actions/aicr-build/build-snapshot-agent.sh | 10 ++-------- .github/actions/load-versions/action.yml | 10 ++++++++++ .github/actions/runtime-install/action.yml | 10 ++++++++++ .../runtime-install/install-gpu-operator-helm.sh | 10 ++-------- .settings.yaml | 2 ++ 6 files changed, 37 insertions(+), 16 deletions(-) diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml index 6bbd6a0ab..b40b656ab 100644 --- a/.github/actions/aicr-build/action.yml +++ b/.github/actions/aicr-build/action.yml @@ -32,6 +32,10 @@ inputs: description: 'Comma-separated validator phases to build (e.g., "conformance,deployment"), or "none" to skip all. Takes precedence over build_validators.' required: false default: '' + snapshot_agent_cuda_image: + description: 'CUDA base image for the snapshot agent image; defaults to .settings.yaml via load-versions' + required: false + default: '' runs: using: 'composite' @@ -50,9 +54,16 @@ runs: GOFLAGS: -mod=vendor run: bash "${{ github.action_path }}/build-cli.sh" + - name: Load snapshot agent image version + if: inputs.build_snapshot_agent == 'true' && inputs.snapshot_agent_cuda_image == '' + id: snapshot-versions + uses: ./.github/actions/load-versions + - name: Build snapshot agent image and load into kind if: inputs.build_snapshot_agent == 'true' shell: bash + env: + SNAPSHOT_AGENT_CUDA_IMAGE: ${{ inputs.snapshot_agent_cuda_image || steps.snapshot-versions.outputs.snapshot_agent_cuda_image }} run: bash "${{ github.action_path }}/build-snapshot-agent.sh" - name: Build validator images and load into kind diff --git a/.github/actions/aicr-build/build-snapshot-agent.sh b/.github/actions/aicr-build/build-snapshot-agent.sh index a48d99615..6e5703985 100644 --- a/.github/actions/aicr-build/build-snapshot-agent.sh +++ b/.github/actions/aicr-build/build-snapshot-agent.sh @@ -15,14 +15,8 @@ set -euo pipefail -if ! command -v yq >/dev/null 2>&1; then - echo "::error::yq is required to read testing.snapshot_agent_cuda_image from .settings.yaml" - exit 1 -fi - -SNAPSHOT_AGENT_CUDA_IMAGE="$(yq eval '.testing.snapshot_agent_cuda_image // ""' .settings.yaml)" -if [[ -z "${SNAPSHOT_AGENT_CUDA_IMAGE}" || "${SNAPSHOT_AGENT_CUDA_IMAGE}" == "null" ]]; then - echo "::error::testing.snapshot_agent_cuda_image must be set in .settings.yaml" +if [[ -z "${SNAPSHOT_AGENT_CUDA_IMAGE:-}" || "${SNAPSHOT_AGENT_CUDA_IMAGE}" == "null" ]]; then + echo "::error::SNAPSHOT_AGENT_CUDA_IMAGE must be provided by the aicr-build action" exit 1 fi diff --git a/.github/actions/load-versions/action.yml b/.github/actions/load-versions/action.yml index b3c506d40..edbe087db 100644 --- a/.github/actions/load-versions/action.yml +++ b/.github/actions/load-versions/action.yml @@ -97,6 +97,12 @@ outputs: h100_kind_node_image: description: 'Kind node image for H100 GPU tests' value: ${{ steps.versions.outputs.h100_kind_node_image }} + gpu_operator_chart_version: + description: 'GPU Operator Helm chart version for GPU smoke tests' + value: ${{ steps.versions.outputs.gpu_operator_chart_version }} + snapshot_agent_cuda_image: + description: 'CUDA base image for the snapshot agent image' + value: ${{ steps.versions.outputs.snapshot_agent_cuda_image }} runs: using: 'composite' @@ -149,6 +155,8 @@ runs: # Testing configuration echo "kind_node_image=$(yq eval '.testing.kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT echo "h100_kind_node_image=$(yq eval '.testing.h100_kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT + echo "gpu_operator_chart_version=$(yq eval '.testing.gpu_operator_chart_version' .settings.yaml)" >> $GITHUB_OUTPUT + echo "snapshot_agent_cuda_image=$(yq eval '.testing.snapshot_agent_cuda_image' .settings.yaml)" >> $GITHUB_OUTPUT - name: Display loaded versions shell: bash @@ -182,3 +190,5 @@ runs: echo " test_timeout: ${{ steps.versions.outputs.test_timeout }}" echo " kind_node_image: ${{ steps.versions.outputs.kind_node_image }}" echo " h100_kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}" + echo " gpu_operator_chart_version: ${{ steps.versions.outputs.gpu_operator_chart_version }}" + echo " snapshot_agent_cuda_image: ${{ steps.versions.outputs.snapshot_agent_cuda_image }}" diff --git a/.github/actions/runtime-install/action.yml b/.github/actions/runtime-install/action.yml index 1adfea364..3d5239b8b 100644 --- a/.github/actions/runtime-install/action.yml +++ b/.github/actions/runtime-install/action.yml @@ -39,6 +39,10 @@ inputs: description: 'Continue deploying remaining bundle components after a component failure' required: false default: 'true' + gpu_operator_chart_version: + description: 'GPU Operator Helm chart version for helm mode; defaults to .settings.yaml via load-versions' + required: false + default: '' runs: using: 'composite' @@ -58,9 +62,15 @@ runs: # --- Helm mode: standalone GPU operator chart --- + - name: Load GPU Operator chart version + if: inputs.method == 'helm' && inputs.gpu_operator_chart_version == '' + id: helm-versions + uses: ./.github/actions/load-versions - name: Install GPU Operator (helm) if: inputs.method == 'helm' shell: bash + env: + GPU_OPERATOR_CHART_VERSION: ${{ inputs.gpu_operator_chart_version || steps.helm-versions.outputs.gpu_operator_chart_version }} run: bash "${{ github.action_path }}/install-gpu-operator-helm.sh" - name: Wait for GPU operands (helm) if: inputs.method == 'helm' diff --git a/.github/actions/runtime-install/install-gpu-operator-helm.sh b/.github/actions/runtime-install/install-gpu-operator-helm.sh index f20527ed3..542c04afa 100644 --- a/.github/actions/runtime-install/install-gpu-operator-helm.sh +++ b/.github/actions/runtime-install/install-gpu-operator-helm.sh @@ -15,14 +15,8 @@ set -euo pipefail -if ! command -v yq >/dev/null 2>&1; then - echo "::error::yq is required to read testing.gpu_operator_chart_version from .settings.yaml" - exit 1 -fi - -GPU_OPERATOR_CHART_VERSION="$(yq eval '.testing.gpu_operator_chart_version // ""' .settings.yaml)" -if [[ -z "${GPU_OPERATOR_CHART_VERSION}" || "${GPU_OPERATOR_CHART_VERSION}" == "null" ]]; then - echo "::error::testing.gpu_operator_chart_version must be set in .settings.yaml" +if [[ -z "${GPU_OPERATOR_CHART_VERSION:-}" || "${GPU_OPERATOR_CHART_VERSION}" == "null" ]]; then + echo "::error::GPU_OPERATOR_CHART_VERSION must be provided by the runtime-install action" exit 1 fi diff --git a/.settings.yaml b/.settings.yaml index 43da8de37..2dc253a88 100644 --- a/.settings.yaml +++ b/.settings.yaml @@ -73,6 +73,8 @@ docs_tools: testing: kind_node_image: 'kindest/node:v1.32.0' h100_kind_node_image: 'kindest/node:v1.35.0' + + # GPU CI runtime pins consumed through .github/actions/load-versions. gpu_operator_chart_version: 'v25.10.1' snapshot_agent_cuda_image: 'nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04'