Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .github/actions/aicr-build/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ inputs:
description: 'Comma-separated validator phases to build (e.g., "conformance,deployment"), or "none" to skip all. Takes precedence over build_validators.'
required: false
default: ''
snapshot_agent_cuda_image:
description: 'CUDA base image for the snapshot agent image; defaults to .settings.yaml via load-versions'
required: false
default: ''

runs:
using: 'composite'
Expand All @@ -50,9 +54,16 @@ runs:
GOFLAGS: -mod=vendor
run: bash "${{ github.action_path }}/build-cli.sh"

- name: Load snapshot agent image version
if: inputs.build_snapshot_agent == 'true' && inputs.snapshot_agent_cuda_image == ''
id: snapshot-versions
uses: ./.github/actions/load-versions

- name: Build snapshot agent image and load into kind
if: inputs.build_snapshot_agent == 'true'
shell: bash
env:
SNAPSHOT_AGENT_CUDA_IMAGE: ${{ inputs.snapshot_agent_cuda_image || steps.snapshot-versions.outputs.snapshot_agent_cuda_image }}
run: bash "${{ github.action_path }}/build-snapshot-agent.sh"

- name: Build validator images and load into kind
Expand Down
10 changes: 2 additions & 8 deletions .github/actions/aicr-build/build-snapshot-agent.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,8 @@

set -euo pipefail

if ! command -v yq >/dev/null 2>&1; then
echo "::error::yq is required to read testing.snapshot_agent_cuda_image from .settings.yaml"
exit 1
fi

SNAPSHOT_AGENT_CUDA_IMAGE="$(yq eval '.testing.snapshot_agent_cuda_image // ""' .settings.yaml)"
if [[ -z "${SNAPSHOT_AGENT_CUDA_IMAGE}" || "${SNAPSHOT_AGENT_CUDA_IMAGE}" == "null" ]]; then
echo "::error::testing.snapshot_agent_cuda_image must be set in .settings.yaml"
if [[ -z "${SNAPSHOT_AGENT_CUDA_IMAGE:-}" || "${SNAPSHOT_AGENT_CUDA_IMAGE}" == "null" ]]; then
echo "::error::SNAPSHOT_AGENT_CUDA_IMAGE must be provided by the aicr-build action"
exit 1
fi

Expand Down
10 changes: 10 additions & 0 deletions .github/actions/load-versions/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,12 @@ outputs:
h100_kind_node_image:
description: 'Kind node image for H100 GPU tests'
value: ${{ steps.versions.outputs.h100_kind_node_image }}
gpu_operator_chart_version:
description: 'GPU Operator Helm chart version for GPU smoke tests'
value: ${{ steps.versions.outputs.gpu_operator_chart_version }}
snapshot_agent_cuda_image:
description: 'CUDA base image for the snapshot agent image'
value: ${{ steps.versions.outputs.snapshot_agent_cuda_image }}

runs:
using: 'composite'
Expand Down Expand Up @@ -149,6 +155,8 @@ runs:
# Testing configuration
echo "kind_node_image=$(yq eval '.testing.kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT
echo "h100_kind_node_image=$(yq eval '.testing.h100_kind_node_image' .settings.yaml)" >> $GITHUB_OUTPUT
echo "gpu_operator_chart_version=$(yq eval '.testing.gpu_operator_chart_version' .settings.yaml)" >> $GITHUB_OUTPUT
echo "snapshot_agent_cuda_image=$(yq eval '.testing.snapshot_agent_cuda_image' .settings.yaml)" >> $GITHUB_OUTPUT

- name: Display loaded versions
shell: bash
Expand Down Expand Up @@ -182,3 +190,5 @@ runs:
echo " test_timeout: ${{ steps.versions.outputs.test_timeout }}"
echo " kind_node_image: ${{ steps.versions.outputs.kind_node_image }}"
echo " h100_kind_node_image: ${{ steps.versions.outputs.h100_kind_node_image }}"
echo " gpu_operator_chart_version: ${{ steps.versions.outputs.gpu_operator_chart_version }}"
echo " snapshot_agent_cuda_image: ${{ steps.versions.outputs.snapshot_agent_cuda_image }}"
10 changes: 10 additions & 0 deletions .github/actions/runtime-install/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ inputs:
description: 'Continue deploying remaining bundle components after a component failure'
required: false
default: 'true'
gpu_operator_chart_version:
description: 'GPU Operator Helm chart version for helm mode; defaults to .settings.yaml via load-versions'
required: false
default: ''

runs:
using: 'composite'
Expand All @@ -58,9 +62,15 @@ runs:

# --- Helm mode: standalone GPU operator chart ---

- name: Load GPU Operator chart version
if: inputs.method == 'helm' && inputs.gpu_operator_chart_version == ''
id: helm-versions
uses: ./.github/actions/load-versions
- name: Install GPU Operator (helm)
if: inputs.method == 'helm'
shell: bash
env:
GPU_OPERATOR_CHART_VERSION: ${{ inputs.gpu_operator_chart_version || steps.helm-versions.outputs.gpu_operator_chart_version }}
run: bash "${{ github.action_path }}/install-gpu-operator-helm.sh"
- name: Wait for GPU operands (helm)
if: inputs.method == 'helm'
Expand Down
10 changes: 2 additions & 8 deletions .github/actions/runtime-install/install-gpu-operator-helm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,8 @@

set -euo pipefail

if ! command -v yq >/dev/null 2>&1; then
echo "::error::yq is required to read testing.gpu_operator_chart_version from .settings.yaml"
exit 1
fi

GPU_OPERATOR_CHART_VERSION="$(yq eval '.testing.gpu_operator_chart_version // ""' .settings.yaml)"
if [[ -z "${GPU_OPERATOR_CHART_VERSION}" || "${GPU_OPERATOR_CHART_VERSION}" == "null" ]]; then
echo "::error::testing.gpu_operator_chart_version must be set in .settings.yaml"
if [[ -z "${GPU_OPERATOR_CHART_VERSION:-}" || "${GPU_OPERATOR_CHART_VERSION}" == "null" ]]; then
echo "::error::GPU_OPERATOR_CHART_VERSION must be provided by the runtime-install action"
exit 1
fi

Expand Down
2 changes: 2 additions & 0 deletions .settings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ docs_tools:
testing:
kind_node_image: 'kindest/node:v1.32.0'
h100_kind_node_image: 'kindest/node:v1.35.0'

# GPU CI runtime pins consumed through .github/actions/load-versions.
gpu_operator_chart_version: 'v25.10.1'
snapshot_agent_cuda_image: 'nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04'

Expand Down
Loading