From 3f110f1d182fcd3bbde4ebee7aec2a60195d3fc1 Mon Sep 17 00:00:00 2001 From: Shai Kapon Date: Thu, 7 May 2026 14:34:24 +0300 Subject: [PATCH 1/8] docs(design): add ADR-007 for OpenShift OLM integration Signed-off-by: Shai Kapon --- docs/design/007-openshift-integration.md | 395 +++++++++++++++++++++++ 1 file changed, 395 insertions(+) create mode 100644 docs/design/007-openshift-integration.md diff --git a/docs/design/007-openshift-integration.md b/docs/design/007-openshift-integration.md new file mode 100644 index 000000000..17616abf1 --- /dev/null +++ b/docs/design/007-openshift-integration.md @@ -0,0 +1,395 @@ +# ADR-007: OpenShift Integration + +## Status + +**Proposed** — 2026-05-07 + +## Context + +AICR generates GPU-accelerated Kubernetes configurations through a Helm-centric +pipeline: snapshot → recipe → bundle. The bundler outputs Helm charts with +values files that can be deployed via `helm install` or other GitOps tools. + +OpenShift Container Platform (OCP) is a Kubernetes distribution with additional +security, networking, and operational capabilities. While OpenShift supports +Helm installations, the **preferred and recommended installation mechanism** for +operators is the **Operator Lifecycle Manager (OLM)**. OLM is the OpenShift-native +way to install, manage, and upgrade operators. + +The current AICR architecture produces Helm artifacts but does not produce +OLM-compatible operator bundles. + +This ADR explores how AICR can support OpenShift users by generating +OLM-compatible operator bundles alongside (or instead of) Helm charts for +OpenShift-targeted recipes. + +## OLM Lifecycle + +The Operator Lifecycle Manager (OLM) is a declarative framework for managing +operator installation, upgrades, and dependency resolution in OpenShift. + +### Key OLM Resources + +**ClusterServiceVersion (CSV)** — Defines a single version of an operator +with metadata, RBAC, CRDs, and install strategy (Deployment spec). + +**Bundle** — Immutable container image containing CSVs, CRDs, and metadata +annotations. The deployment unit; never executed, only unpacked by OLM. + +**CatalogSource** — Points to an index image organizing bundles into +channels (`stable`, `fast`, `candidate`). + +**Subscription** — User-facing resource declaring intent to install an +operator from a catalog, specifying package name and channel. + +**InstallPlan** — Generated by OLM with resolved dependencies; may require +manual approval. + +### Lifecycle Flow + +``` +Subscription created → CatalogSource queried → InstallPlan generated +→ CSV resources applied → Operator deployed → Auto-upgrade on new bundle +``` + +### Key Differences from Helm + +- **Automatic dependency resolution** vs manual installation +- **Channel-based auto-upgrades** vs explicit `helm upgrade` +- **Immutable bundle images** vs mutable chart versions +- **RBAC pre-validation** vs runtime failures +- **OpenShift Console UI integration** vs no visibility + +## Proposed Architecture + +### Dual-Component Pattern + +Each OpenShift operator requires **two AICR components**: + +1. **Operator component** (`-olm`) — Installs the operator via OLM + - Contains Subscription, CatalogSource, Namespace, OperatorGroup + - Uses `direct` deployment type + - Example: `gpu-operator-olm` + +2. **CR component** (``) — Installs the custom resource(s) + - Contains operator-specific CRs (e.g., ClusterPolicy for GPU operator) + - Uses `direct` deployment type + - Example: `gpu-operator` + - Depends on `-olm` to ensure operator is installed first + +### Direct Deployment Type + +A new deployment type `direct` is introduced alongside `helm` and `kustomize`. +Unlike Helm (which references external charts) or Kustomize (which references +external repos), `direct` points to **static YAML manifests embedded in AICR**. + +**Registry entry:** +```yaml +- name: gpu-operator-olm + displayName: GPU Operator (OLM) + direct: + sourceFile: recipes/components/gpu-operator-olm/direct/olm.yaml +``` + +**File structure:** +``` +recipes/components/ +├── gpu-operator-olm/ +│ └── direct/ +│ └── olm.yaml # All OLM resources in one file +└── gpu-operator/ + └── direct/ + └── clusterpolicy.yaml # CR definition +``` + +**Characteristics:** +- No external dependencies (chart repos, git repos) +- All manifests embedded at build time +- Bundler copies files directly to output (no templating, no value merging) +- Simple `kubectl apply -f` deployment model + +### Component Dependencies + +The existing AICR dependency mechanism ensures correct installation order: + +```yaml +# In registry.yaml or recipe overlay +- name: gpu-operator + displayName: GPU Operator + direct: + sourceFile: recipes/components/gpu-operator/direct/clusterpolicy.yaml + dependsOn: + - gpu-operator-olm # Operator must be installed first +``` + +### Bundler Behavior + +For `direct` components, the bundler: + +1. Reads the file at `sourceFile` +2. Copies it to the bundle output directory unchanged +3. Generates `install.sh` wrapper script that applies the YAML +4. If `waitScript` is specified, templates and includes wait.sh execution +5. No value merging, no templating on YAML — direct passthrough + +**Bundle output:** +``` +bundles/ +├── gpu-operator-olm/ +│ ├── olm.yaml # Copied from recipes/components/ +│ └── install.sh # Generated wrapper script +└── gpu-operator/ + ├── clusterpolicy.yaml # Copied from recipes/components/ + └── install.sh # Generated wrapper script +``` + +### Post-Install Wait Scripts + +Components can optionally include a wait script that runs after resource +installation to verify deployment readiness. This is especially important for +OLM components where the Subscription apply returns immediately but the operator +may take time to install. + +**Registry configuration:** +```yaml +- name: gpu-operator-olm + displayName: GPU Operator (OLM) + direct: + sourceFile: recipes/components/gpu-operator-olm/direct/olm.yaml + waitScript: recipes/components/gpu-operator-olm/direct/wait.sh.tmpl +``` + +**Source structure:** +``` +recipes/components/gpu-operator-olm/direct/ +├── olm.yaml # OLM resources +└── wait.sh.tmpl # Wait script template +``` + +**Template variables:** +- `{{.Namespace}}` — Target namespace for the component +- `{{.ComponentName}}` — Component name from registry + +**Bundler behavior:** +1. Generates `install.sh` wrapper script that runs `kubectl apply -f olm.yaml` +2. When `direct.waitScript` is specified: + - Reads the template file at `waitScript` path + - Templates it with namespace and component name + - Outputs `wait.sh` in bundle + - Updates `install.sh` to run `wait.sh` after applying resources +3. Makes all scripts executable + +**Wait script contract:** +- Runs after `kubectl apply -f olm.yaml` +- Polls for readiness condition in a loop +- Has internal timeout configuration +- Exits 0 on success, non-zero on timeout or failure +- For OLM components: waits for CSV `status.phase: Succeeded` +- For CR components: waits for custom resource status conditions + +**Example bundler output:** +``` +bundles/ +├── gpu-operator-olm/ +│ ├── olm.yaml +│ ├── install.sh # Generated wrapper script +│ └── wait.sh # Generated from wait.sh.tmpl +└── gpu-operator/ + ├── clusterpolicy.yaml + ├── install.sh # Generated wrapper script + └── wait.sh # Generated from wait.sh.tmpl +``` + +**Deployment flow with wait scripts:** +```bash +# Install OLM resources +./bundles/gpu-operator-olm/install.sh # Applies olm.yaml + runs wait.sh + +# Install CR (depends on operator being ready) +./bundles/gpu-operator/install.sh # Applies clusterpolicy.yaml + runs wait.sh +``` + +### Example: GPU Operator on OpenShift + +**Registry entries:** +```yaml +# Operator installation via OLM +- name: gpu-operator-olm + displayName: GPU Operator (OLM) + direct: + sourceFile: recipes/components/gpu-operator-olm/direct/olm.yaml + waitScript: recipes/components/gpu-operator-olm/direct/wait.sh.tmpl + +# ClusterPolicy CR +- name: gpu-operator + displayName: GPU Operator + direct: + sourceFile: recipes/components/gpu-operator/direct/clusterpolicy.yaml + waitScript: recipes/components/gpu-operator/direct/wait.sh.tmpl + dependsOn: + - gpu-operator-olm +``` + +**Component files:** + +`recipes/components/gpu-operator-olm/direct/olm.yaml`: +```yaml +--- +apiVersion: v1 +kind: Namespace +metadata: + name: nvidia-gpu-operator +--- +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + name: nvidia-gpu-operator-group + namespace: nvidia-gpu-operator +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: CatalogSource +metadata: + name: certified-operators + namespace: openshift-marketplace +spec: + sourceType: grpc + image: registry.redhat.io/redhat/certified-operator-index:v4.14 +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: gpu-operator-certified + namespace: nvidia-gpu-operator +spec: + channel: stable + name: gpu-operator-certified + source: certified-operators + sourceNamespace: openshift-marketplace +``` + +`recipes/components/gpu-operator/direct/clusterpolicy.yaml`: +```yaml +--- +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + operator: + defaultRuntime: crio + driver: + enabled: true + version: "550.54.15" + # ... rest of ClusterPolicy spec +``` + +### Workflow + +``` +User: aicr bundle --service ocp --accelerator h100 --output ./bundles + ↓ +Recipe resolver selects overlay with gpu-operator-olm + gpu-operator + ↓ +Bundler processes direct components: + - Copies recipes/components/gpu-operator-olm/direct/olm.yaml → bundles/ + - Templates recipes/components/gpu-operator-olm/direct/wait.sh.tmpl → bundles/gpu-operator-olm/wait.sh + - Copies recipes/components/gpu-operator/direct/clusterpolicy.yaml → bundles/ + - Templates recipes/components/gpu-operator/direct/wait.sh.tmpl → bundles/gpu-operator/wait.sh + ↓ +User deployment: + 1. ./bundles/gpu-operator-olm/install.sh # Applies resources + waits for CSV ready + 2. ./bundles/gpu-operator/install.sh # Applies resources + waits for ClusterPolicy ready +``` + +## Consequences + +### Positive + +- **OpenShift-native installation** — Uses OLM (the recommended OpenShift + mechanism) instead of forcing Helm on OCP users. Operators appear in + OpenShift Console OperatorHub with proper lifecycle management. +- **No bundle image management** — Unlike full OLM integration, AICR does not + need to build, push, or maintain operator bundle container images. All + manifests are static YAML embedded in AICR. +- **Reuses existing patterns** — The `direct` deployment type fits naturally + alongside `helm` and `kustomize`. Component registry, dependencies, and + bundler architecture remain unchanged. +- **Simple deployment model** — Users run `install.sh` scripts; no need to + understand OLM mechanics or manually sequence kubectl commands. +- **Deployment verification** — Wait scripts provide immediate feedback on + installation success/failure instead of leaving users to manually check + operator readiness. +- **Clean separation of concerns** — Dual-component pattern (operator vs CR) + separates operator installation from configuration, making it easy to swap + CR configurations without touching operator installation. +- **Zero external dependencies** — All manifests embedded at build time; + no runtime dependencies on external chart repos, git repos, or catalog sources + (beyond the OpenShift-provided certified operator index). + +### Negative + +- **Dual-component overhead** — Each operator requires two registry entries, + two component directories, and explicit dependency wiring. This doubles the + per-operator maintenance surface. +- **Manual wait script implementation** — Each component needs a custom + `wait.sh.tmpl`. No shared templates or centralized wait logic; every + component reimplements polling. +- **Timeout configuration scattered** — Timeout values live inside individual + wait scripts, not in a central configuration. Changing timeouts requires + editing templates across multiple components. + +### Neutral + +- **New deployment type** — `direct` is a third deployment mechanism alongside + `helm` and `kustomize`. Bundler complexity increases, but the pattern is + consistent with existing types. +- **Component count growth** — Logical operators (gpu-operator, network-operator) + become two components each. Registry size increases, but dependency graph + remains manageable. +- **Static manifests trade-off** — Embedding YAML in AICR guarantees + reproducibility and eliminates runtime dependencies, but means updates require + AICR releases instead of just pointing to new chart versions. +- **Wait scripts are optional** — Components can omit `waitScript` if they + don't need deployment verification. The pattern supports both simple (apply + only) and verified (apply + wait) deployments. + +## Alternatives Considered + +### Temporary Helm Chart Generation + +Instead of introducing a new `direct` deployment type, generate temporary Helm +charts from the static YAML files and reuse the existing Helm bundler flow. + +**Approach:** +1. Component has static YAML in `recipes/components//direct/olm.yaml` +2. Bundler generates a minimal Helm chart structure on-the-fly: + ``` + temp-chart/ + ├── Chart.yaml + └── templates/ + └── olm.yaml + ``` +3. Bundle output references this temporary chart +4. Deployment uses `helm install` instead of `kubectl apply` + +**Why rejected:** + +- **Unnecessary indirection** — Adds Helm packaging overhead (Chart.yaml, + templates directory) for manifests that need no templating or value merging. + The Helm chart becomes an empty wrapper. +- **Confusing abstraction** — Users expect Helm charts to have values files, + templating, and chart dependencies. A chart with none of these is misleading. +- **Bundler complexity** — Requires bundler to generate Chart.yaml metadata, + manage temporary directories, and package charts for manifests that are + already complete. +- **No Helm benefits** — The OLM manifests don't use Helm features (templating, + hooks, dependencies). Using Helm provides no value over `kubectl apply`. +- **Wait script integration** — Helm has its own hook system; integrating + custom wait scripts with Helm hooks is more complex than standalone scripts. + +The `direct` deployment type is simpler: copy YAML, generate wrapper script, +done. No need to force-fit static manifests into the Helm model. + +## References + +- [Understanding the Operator Lifecycle Manager (OLM)](https://docs.redhat.com/en/documentation/openshift_container_platform/4.2/html/operators/understanding-the-operator-lifecycle-manager-olm) From eb58dcb68c93ed53788807289987b5d9ab1f85e5 Mon Sep 17 00:00:00 2001 From: Shai Kapon Date: Mon, 11 May 2026 13:23:20 +0300 Subject: [PATCH 2/8] feat(recipes): add OpenShift support with OLM integration (#566) Add OpenShift (ocp) as a new service type with Operator Lifecycle Manager support. Changes: - Add CriteriaServiceOCP to service enumeration - Implement ComponentTypeDirect for static Kubernetes manifests - Add recipes/overlays/ocp.yaml with dual-component pattern: * nfd-olm + nfd (OLM Subscription + NodeFeatureDiscovery CR) * gpu-operator-olm + gpu-operator (OLM Subscription + ClusterPolicy CR) - Create OLM installation manifests and CR configuration files - Register gpu-operator-olm and nfd-olm components in registry.yaml - Update API spec and documentation to include "ocp" service type Signed-off-by: Shai Kapon --- .github/ISSUE_TEMPLATE/bug_report.yml | 4 +- api/aicr/v1/server.yaml | 6 +- docs/README.md | 2 +- docs/contributor/api-server.md | 33 ++++- docs/contributor/cli.md | 2 +- docs/contributor/data.md | 2 +- docs/contributor/validations.md | 2 +- docs/integrator/data-flow.md | 2 +- docs/user/api-reference.md | 2 +- docs/user/cli-reference.md | 2 +- pkg/recipe/criteria.go | 5 +- pkg/recipe/criteria_test.go | 2 +- pkg/recipe/metadata.go | 13 ++ pkg/recipe/yaml_test.go | 3 +- .../gpu-operator-olm/direct/olm.yaml | 57 +++++++++ .../gpu-operator/direct/clusterpolicy.yaml | 51 ++++++++ recipes/components/nfd-olm/direct/olm.yaml | 43 +++++++ .../nfd/direct/nodefeaturediscovery.yaml | 39 ++++++ recipes/overlays/ocp.yaml | 113 ++++++++++++++++++ recipes/registry.yaml | 14 +++ .../nccl_all_reduce_bw_constraint.go | 2 +- 21 files changed, 381 insertions(+), 18 deletions(-) create mode 100644 recipes/components/gpu-operator-olm/direct/olm.yaml create mode 100644 recipes/components/gpu-operator/direct/clusterpolicy.yaml create mode 100644 recipes/components/nfd-olm/direct/olm.yaml create mode 100644 recipes/components/nfd/direct/nodefeaturediscovery.yaml create mode 100644 recipes/overlays/ocp.yaml diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 13a0bbc54..5281a1e84 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -113,7 +113,7 @@ body: placeholder: | - AICR version (CLI `aicr version`, API image tag, or commit SHA): - Install method (release binary / build from source / container image): - - Platform (eks/gke/aks/oke/kind/lke/other): + - Platform (eks/gke/aks/oke/ocp/kind/lke/other): - Kubernetes version: - OS (ubuntu/cos/other) + version: - Kernel version: @@ -122,7 +122,7 @@ body: value: | - AICR version (CLI `aicr version`, API image tag, or commit SHA): - Install method (release binary / build from source / container image): - - Platform (eks/gke/aks/oke/kind/lke/other): + - Platform (eks/gke/aks/oke/ocp/kind/lke/other): - Kubernetes version: - OS (ubuntu/cos/other) + version: - Kernel version: diff --git a/api/aicr/v1/server.yaml b/api/aicr/v1/server.yaml index 09403fe54..455f4f8d9 100644 --- a/api/aicr/v1/server.yaml +++ b/api/aicr/v1/server.yaml @@ -112,7 +112,7 @@ paths: description: Kubernetes service/environment type. If omitted, treated as "any" (wildcard). schema: type: string - enum: [eks, gke, aks, oke, kind, lke, any] + enum: [eks, gke, aks, oke, ocp, kind, lke, any] default: any - name: accelerator in: query @@ -479,7 +479,7 @@ paths: description: Kubernetes service/environment type. If omitted, treated as "any" (wildcard). schema: type: string - enum: [eks, gke, aks, oke, kind, lke, any] + enum: [eks, gke, aks, oke, ocp, kind, lke, any] default: any - name: accelerator in: query @@ -1240,7 +1240,7 @@ components: service: type: string description: Kubernetes service type - enum: [eks, gke, aks, oke, kind, lke, any] + enum: [eks, gke, aks, oke, ocp, kind, lke, any] example: eks accelerator: type: string diff --git a/docs/README.md b/docs/README.md index 7dd0bfabe..ec8c93282 100644 --- a/docs/README.md +++ b/docs/README.md @@ -8,7 +8,7 @@ NVIDIA AI Cluster Runtime (AICR) is a suite of tooling designed to automate the |------|-------------| | **Snapshot** | A captured state of a system including OS, kernel, Kubernetes, GPU, and SystemD configuration. Created by `aicr snapshot` or the Kubernetes agent. | | **Recipe** | A generated configuration recommendation containing component references, constraints, and deployment order. Created by `aicr recipe` based on criteria or snapshot analysis. | -| **Criteria** | Query parameters that define the target environment: `service` (eks/gke/aks/oke/kind/lke), `accelerator` (h100/gb200/b200/a100/l40/rtx-pro-6000), `intent` (training/inference), `os` (ubuntu/rhel/cos/amazonlinux/talos), `platform` (kubeflow), and `nodes`. | +| **Criteria** | Query parameters that define the target environment: `service` (eks/gke/aks/oke/ocp/kind/lke), `accelerator` (h100/gb200/b200/a100/l40/rtx-pro-6000), `intent` (training/inference), `os` (ubuntu/rhel/cos/amazonlinux/talos), `platform` (kubeflow), and `nodes`. | | **Overlay** | A recipe metadata file that extends the base recipe for specific environments. Overlays are matched against criteria using asymmetric matching. | | **Mixin** | A composable recipe fragment (`kind: RecipeMixin`) that carries only `constraints` and `componentRefs`. Mixins live in `recipes/mixins/`, are excluded from overlay discovery, and are referenced by leaf overlays via `spec.mixins` to share orthogonal content (e.g., OS constraints, platform components) without duplication. See [ADR-005](design/005-overlay-refactoring.md). | | **Bundle** | Deployment artifacts generated from a recipe: Helm values files, Kubernetes manifests, installation scripts, and checksums. | diff --git a/docs/contributor/api-server.md b/docs/contributor/api-server.md index 14d7019e2..dfd17cba0 100644 --- a/docs/contributor/api-server.md +++ b/docs/contributor/api-server.md @@ -262,7 +262,7 @@ Supported content types: | Parameter | Type | Validation | Example | |-----------|------|------------|--------| -| `service` | ServiceType | Enum: eks, gke, aks, oke, kind, lke, any | `service=eks` | +| `service` | ServiceType | Enum: eks, gke, aks, oke, ocp, kind, lke, any | `service=eks` | | `accelerator` | AcceleratorType | Enum: h100, gb200, b200, a100, l40, rtx-pro-6000, any | `accelerator=h100` | | `gpu` | AcceleratorType | Alias for accelerator | `gpu=h100` | | `intent` | IntentType | Enum: training, inference, any | `intent=training` | @@ -277,7 +277,36 @@ Shared with CLI - same logic as described in CLI architecture. ### Recipe Generation -Endpoints `GET /v1/recipe` (query parameters) and `POST /v1/recipe` (criteria body, `application/json` or `application/x-yaml`). See [Query Parameter Parsing](#query-parameter-parsing) above for the GET parameter table and [POST Request Body Format](#post-request-body-format) above for the body schema. +**Endpoints**: +- `GET /v1/recipe` - Generate recipe from query parameters +- `POST /v1/recipe` - Generate recipe from criteria body + +#### GET Method + +**Query Parameters**: +- `service` - Kubernetes service type (eks, gke, aks, oke, ocp, kind, lke) +- `accelerator` - GPU/accelerator type (h100, gb200, b200, a100, l40, rtx-pro-6000) +- `gpu` - Alias for accelerator (backwards compatibility) +- `intent` - Workload intent (training, inference) +- `os` - Operating system family (ubuntu, rhel, cos, amazonlinux, talos) +- `nodes` - Number of GPU nodes (0 = any/unspecified) + +#### POST Method + +**Content Types**: `application/json`, `application/x-yaml` + +**Request Body**: `RecipeCriteria` resource with kind, apiVersion, metadata, and spec fields. + +```yaml +kind: RecipeCriteria +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: my-criteria +spec: + service: eks + accelerator: h100 + intent: training +``` **Response**: 200 OK diff --git a/docs/contributor/cli.md b/docs/contributor/cli.md index 598a5dbea..a6248fb0a 100644 --- a/docs/contributor/cli.md +++ b/docs/contributor/cli.md @@ -1701,7 +1701,7 @@ mkdir -p "$OUTPUT_DIR" GPU_TYPES=("h100" "gb200" "b200" "a100" "l40" "rtx-pro-6000") # Kubernetes services -K8S_SERVICES=("eks" "gke" "aks" "oke" "kind" "lke") +K8S_SERVICES=("eks" "gke" "aks" "oke" "ocp" "kind" "lke") # OS distributions OS_TYPES=("ubuntu" "rhel" "cos") diff --git a/docs/contributor/data.md b/docs/contributor/data.md index fc89986f3..83cfa478c 100644 --- a/docs/contributor/data.md +++ b/docs/contributor/data.md @@ -123,7 +123,7 @@ Criteria define when a recipe matches a user query: | Field | Type | Description | Example Values | |-------|------|-------------|----------------| -| `service` | String | Kubernetes platform | `eks`, `gke`, `aks`, `oke`, `kind`, `lke` | +| `service` | String | Kubernetes platform | `eks`, `gke`, `aks`, `oke`, `ocp`, `kind`, `lke` | | `accelerator` | String | GPU hardware type | `h100`, `gb200`, `b200`, `a100`, `l40`, `rtx-pro-6000` | | `os` | String | Operating system | `ubuntu`, `rhel`, `cos`, `amazonlinux` | | `intent` | String | Workload purpose | `training`, `inference` | diff --git a/docs/contributor/validations.md b/docs/contributor/validations.md index 2fbc9956c..fe1350cb8 100644 --- a/docs/contributor/validations.md +++ b/docs/contributor/validations.md @@ -93,7 +93,7 @@ conditions: **Supported Condition Keys:** - `intent`: Workload intent (training, inference) -- `service`: Kubernetes service (eks, gke, aks, oke, kind, lke) +- `service`: Kubernetes service (eks, gke, aks, oke, ocp, kind, lke) - `accelerator`: GPU type (h100, gb200, b200, a100, l40, rtx-pro-6000) - `os`: Operating system (ubuntu, rhel, cos, amazonlinux, talos) - `platform`: Platform/framework (kubeflow) diff --git a/docs/integrator/data-flow.md b/docs/integrator/data-flow.md index f48538571..c12194158 100644 --- a/docs/integrator/data-flow.md +++ b/docs/integrator/data-flow.md @@ -898,7 +898,7 @@ X-RateLimit-Reset: 1735650000 **Query Parameters:** - Type checking (string, int, bool) -- Enum validation (eks, gke, aks, etc.) +- Enum validation (eks, gke, aks, ocp, etc.) - Version format validation (regex) - Range validation (if applicable) diff --git a/docs/user/api-reference.md b/docs/user/api-reference.md index dfb1e513d..6f34d7978 100644 --- a/docs/user/api-reference.md +++ b/docs/user/api-reference.md @@ -116,7 +116,7 @@ Generate an optimized configuration recipe based on environment parameters. | Parameter | Type | Default | Description | |-----------|------|---------|-------------| -| `service` | string | any | K8s service: `eks`, `gke`, `aks`, `oke`, `kind`, `lke`, `any` | +| `service` | string | any | K8s service: `eks`, `gke`, `aks`, `oke`, `ocp`, `kind`, `lke`, `any` | | `accelerator` | string | any | GPU type: `h100`, `gb200`, `b200`, `a100`, `l40`, `rtx-pro-6000`, `any` | | `gpu` | string | any | Alias for `accelerator` | | `intent` | string | any | Workload: `training`, `inference`, `any` | diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index d0a775f08..ba1f4afb2 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -329,7 +329,7 @@ Generate recipes using direct system parameters: **Flags:** | Flag | Short | Type | Description | |------|-------|------|-------------| -| `--service` | | string | K8s service: eks, gke, aks, oke, kind, lke | +| `--service` | | string | K8s service: eks, gke, aks, oke, ocp, kind, lke | | `--accelerator` | `--gpu` | string | Accelerator/GPU type: h100, gb200, b200, a100, l40, rtx-pro-6000 | | `--intent` | | string | Workload intent: training, inference | | `--os` | | string | OS family: ubuntu, rhel, cos, amazonlinux, talos | diff --git a/pkg/recipe/criteria.go b/pkg/recipe/criteria.go index c673fd1ff..98e699d77 100644 --- a/pkg/recipe/criteria.go +++ b/pkg/recipe/criteria.go @@ -49,6 +49,7 @@ const ( CriteriaServiceGKE CriteriaServiceType = "gke" CriteriaServiceAKS CriteriaServiceType = "aks" CriteriaServiceOKE CriteriaServiceType = "oke" + CriteriaServiceOCP CriteriaServiceType = "ocp" CriteriaServiceKind CriteriaServiceType = "kind" CriteriaServiceLKE CriteriaServiceType = "lke" ) @@ -66,6 +67,8 @@ func ParseCriteriaServiceType(s string) (CriteriaServiceType, error) { return CriteriaServiceAKS, nil case "oke": return CriteriaServiceOKE, nil + case "ocp": + return CriteriaServiceOCP, nil case string(CriteriaServiceKind): return CriteriaServiceKind, nil case "lke": @@ -77,7 +80,7 @@ func ParseCriteriaServiceType(s string) (CriteriaServiceType, error) { // GetCriteriaServiceTypes returns all supported service types sorted alphabetically. func GetCriteriaServiceTypes() []string { - return []string{"aks", "eks", "gke", "kind", "lke", "oke"} + return []string{"aks", "eks", "gke", "kind", "lke", "ocp", "oke"} } // CriteriaAcceleratorType represents the GPU/accelerator type. diff --git a/pkg/recipe/criteria_test.go b/pkg/recipe/criteria_test.go index 6a1b529c8..b48d4d9d5 100644 --- a/pkg/recipe/criteria_test.go +++ b/pkg/recipe/criteria_test.go @@ -699,7 +699,7 @@ func TestGetCriteriaServiceTypes(t *testing.T) { types := GetCriteriaServiceTypes() // Should return sorted list - expected := []string{"aks", "eks", "gke", "kind", "lke", "oke"} + expected := []string{"aks", "eks", "gke", "kind", "lke", "ocp", "oke"} if len(types) != len(expected) { t.Errorf("GetCriteriaServiceTypes() returned %d types, want %d", len(types), len(expected)) } diff --git a/pkg/recipe/metadata.go b/pkg/recipe/metadata.go index 904c61839..00b302b0b 100644 --- a/pkg/recipe/metadata.go +++ b/pkg/recipe/metadata.go @@ -42,6 +42,7 @@ type ComponentType string const ( ComponentTypeHelm ComponentType = "Helm" ComponentTypeKustomize ComponentType = "Kustomize" + ComponentTypeDirect ComponentType = "Direct" ) // Constraint represents a deployment constraint/assumption. @@ -88,6 +89,10 @@ type ComponentRef struct { // ValuesFile is the path to the values file (relative to data directory). ValuesFile string `json:"valuesFile,omitempty" yaml:"valuesFile,omitempty"` + // SourceFile is the path to the source YAML file for Direct components (relative to data directory). + // Used by Direct deployment type to specify static Kubernetes manifests. + SourceFile string `json:"sourceFile,omitempty" yaml:"sourceFile,omitempty"` + // Overrides contains inline values that override those from ValuesFile. // Merge order: base values → ValuesFile → Overrides (highest precedence). Overrides map[string]any `json:"overrides,omitempty" yaml:"overrides,omitempty"` @@ -238,6 +243,9 @@ func (ref *ComponentRef) ApplyRegistryDefaults(config *ComponentConfig) { if ref.Path == "" && config.Kustomize.DefaultPath != "" { ref.Path = config.Kustomize.DefaultPath } + case ComponentTypeDirect: + // Direct components use static YAML manifests embedded in AICR. + // No defaults to apply - sourceFile comes from registry config. } // NOTE: healthCheck.assertFile content is intentionally NOT loaded here. @@ -567,6 +575,11 @@ func mergeComponentRef(base, overlay ComponentRef) ComponentRef { result.ValuesFile = overlay.ValuesFile } + // SourceFile: overlay takes precedence if set + if overlay.SourceFile != "" { + result.SourceFile = overlay.SourceFile + } + // Overrides: deep-merge maps, overlay takes precedence if len(overlay.Overrides) > 0 { if result.Overrides == nil { diff --git a/pkg/recipe/yaml_test.go b/pkg/recipe/yaml_test.go index fa68322a4..27db2c1c1 100644 --- a/pkg/recipe/yaml_test.go +++ b/pkg/recipe/yaml_test.go @@ -847,6 +847,7 @@ func TestAllComponentTypesValid(t *testing.T) { validTypes := map[ComponentType]bool{ ComponentTypeHelm: true, ComponentTypeKustomize: true, + ComponentTypeDirect: true, } for _, path := range files { @@ -867,7 +868,7 @@ func TestAllComponentTypesValid(t *testing.T) { continue } if !validTypes[comp.Type] { - t.Errorf("componentRef %q has invalid type %q; valid types: Helm, Kustomize", + t.Errorf("componentRef %q has invalid type %q; valid types: Helm, Kustomize, Direct", comp.Name, comp.Type) } } diff --git a/recipes/components/gpu-operator-olm/direct/olm.yaml b/recipes/components/gpu-operator-olm/direct/olm.yaml new file mode 100644 index 000000000..ca7f0f206 --- /dev/null +++ b/recipes/components/gpu-operator-olm/direct/olm.yaml @@ -0,0 +1,57 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# GPU Operator OLM Installation Resources for OpenShift +# These manifests install the NVIDIA GPU Operator via OpenShift's Operator Lifecycle Manager (OLM). +# Reference: https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/install-gpu-ocp.html +--- +apiVersion: v1 +kind: Namespace +metadata: + name: nvidia-gpu-operator +--- +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + name: nvidia-gpu-operator-group + namespace: nvidia-gpu-operator +spec: + targetNamespaces: + - nvidia-gpu-operator +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: CatalogSource +metadata: + name: certified-operators + namespace: openshift-marketplace +spec: + sourceType: grpc + image: registry.redhat.io/redhat/certified-operator-index:v4.14 + displayName: Certified Operators + publisher: Red Hat + updateStrategy: + registryPoll: + interval: 10m +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: gpu-operator-certified + namespace: nvidia-gpu-operator +spec: + channel: stable + name: gpu-operator-certified + source: certified-operators + sourceNamespace: openshift-marketplace + installPlanApproval: Automatic diff --git a/recipes/components/gpu-operator/direct/clusterpolicy.yaml b/recipes/components/gpu-operator/direct/clusterpolicy.yaml new file mode 100644 index 000000000..0936e506c --- /dev/null +++ b/recipes/components/gpu-operator/direct/clusterpolicy.yaml @@ -0,0 +1,51 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ClusterPolicy CR for NVIDIA GPU Operator on OpenShift +# This configures the GPU operator that was installed via OLM. +# Reference: https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/install-gpu-ocp.html +--- +apiVersion: nvidia.com/v1 +kind: ClusterPolicy +metadata: + name: gpu-cluster-policy +spec: + operator: + # OpenShift uses CRI-O as the default container runtime + defaultRuntime: crio + + driver: + enabled: true + # Driver version for OpenShift / RHEL CoreOS + version: "550.54.15" + + toolkit: + enabled: true + + devicePlugin: + enabled: true + + # DCGM exporter for Prometheus metrics + dcgmExporter: + enabled: true + config: + name: "" + + # GPU Feature Discovery + gfd: + enabled: true + + # Node Feature Discovery - disabled because it's deployed as a standalone component + nfd: + enabled: false diff --git a/recipes/components/nfd-olm/direct/olm.yaml b/recipes/components/nfd-olm/direct/olm.yaml new file mode 100644 index 000000000..916fbbddb --- /dev/null +++ b/recipes/components/nfd-olm/direct/olm.yaml @@ -0,0 +1,43 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Node Feature Discovery OLM Installation Resources for OpenShift +# These manifests install NFD via OpenShift's Operator Lifecycle Manager (OLM). +# Reference: https://docs.openshift.com/container-platform/latest/hardware_enablement/psap-node-feature-discovery-operator.html +--- +apiVersion: v1 +kind: Namespace +metadata: + name: openshift-nfd +--- +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + name: nfd-operator-group + namespace: openshift-nfd +spec: + targetNamespaces: + - openshift-nfd +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: nfd + namespace: openshift-nfd +spec: + channel: stable + name: nfd + source: redhat-operators + sourceNamespace: openshift-marketplace + installPlanApproval: Automatic diff --git a/recipes/components/nfd/direct/nodefeaturediscovery.yaml b/recipes/components/nfd/direct/nodefeaturediscovery.yaml new file mode 100644 index 000000000..f5ca307d4 --- /dev/null +++ b/recipes/components/nfd/direct/nodefeaturediscovery.yaml @@ -0,0 +1,39 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NodeFeatureDiscovery CR for NFD configuration on OpenShift +# This configures the NFD operator that was installed via OLM. +# Reference: https://docs.openshift.com/container-platform/latest/hardware_enablement/psap-node-feature-discovery-operator.html +--- +apiVersion: nfd.openshift.io/v1 +kind: NodeFeatureDiscovery +metadata: + name: nfd-instance + namespace: openshift-nfd +spec: + operand: + image: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.14 + imagePullPolicy: Always + workerConfig: + configData: | + core: + sleepInterval: 60s + sources: + pci: + deviceClassWhitelist: + - "0200" + - "03" + - "12" + deviceLabelFields: + - "vendor" diff --git a/recipes/overlays/ocp.yaml b/recipes/overlays/ocp.yaml new file mode 100644 index 000000000..62130e6f6 --- /dev/null +++ b/recipes/overlays/ocp.yaml @@ -0,0 +1,113 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: ocp + +spec: + # Inherits from base (implicit when spec.base is empty) + # This recipe contains OpenShift-specific settings shared by all OCP deployments + + criteria: + service: ocp + + # OpenShift-specific constraints + # OpenShift 4.14+ includes Kubernetes 1.27+ + constraints: + - name: K8s.server.version + value: ">= 1.27" + + # OpenShift-specific components + componentRefs: + # Node Feature Discovery installation via OLM + # NFD must be installed before GPU Operator + - name: nfd-olm + type: Direct + + # Node Feature Discovery CR (custom resource configuration) + # Overrides the Helm-based nfd from base with a Direct CR + - name: nfd + type: Direct + sourceFile: recipes/components/nfd/direct/nodefeaturediscovery.yaml + dependencyRefs: + - nfd-olm + + # GPU Operator installation via OLM (OpenShift's native operator installation method) + # This installs the operator; GPU configuration is handled separately + - name: gpu-operator-olm + type: Direct + dependencyRefs: + - nfd-olm + + # GPU Operator ClusterPolicy CR (custom resource configuration) + # Overrides the Helm-based gpu-operator from base with a Direct CR + # The operator itself is installed via OLM (gpu-operator-olm above) + - name: gpu-operator + type: Direct + sourceFile: recipes/components/gpu-operator/direct/clusterpolicy.yaml + dependencyRefs: + - gpu-operator-olm + - nfd + + # Disable components from base that are not needed for OpenShift minimal setup + - name: cert-manager + type: Helm + overrides: + enabled: false + + - name: kube-prometheus-stack + type: Helm + overrides: + enabled: false + + - name: prometheus-adapter + type: Helm + overrides: + enabled: false + + - name: k8s-ephemeral-storage-metrics + type: Helm + overrides: + enabled: false + + - name: nodewright-operator + type: Helm + overrides: + enabled: false + + - name: nvidia-dra-driver-gpu + type: Helm + overrides: + enabled: false + + - name: nvsentinel + type: Helm + overrides: + enabled: false + + - name: kai-scheduler + type: Helm + overrides: + enabled: false + + validation: + conformance: + checks: + - platform-health + - gpu-operator-health + - dra-support + - accelerator-metrics + - ai-service-metrics diff --git a/recipes/registry.yaml b/recipes/registry.yaml index c86f5826c..ca31b06e2 100644 --- a/recipes/registry.yaml +++ b/recipes/registry.yaml @@ -520,3 +520,17 @@ components: - acceleratedNodeSelector tolerationPaths: - acceleratedTolerations + + - name: gpu-operator-olm + displayName: GPU Operator (OLM) + valueOverrideKeys: + - gpuoperatorolm + direct: + sourceFile: recipes/components/gpu-operator-olm/direct/olm.yaml + + - name: nfd-olm + displayName: Node Feature Discovery (OLM) + valueOverrideKeys: + - nfdolm + direct: + sourceFile: recipes/components/nfd-olm/direct/olm.yaml diff --git a/validators/performance/nccl_all_reduce_bw_constraint.go b/validators/performance/nccl_all_reduce_bw_constraint.go index 3644ceeca..6adf4b91a 100644 --- a/validators/performance/nccl_all_reduce_bw_constraint.go +++ b/validators/performance/nccl_all_reduce_bw_constraint.go @@ -840,7 +840,7 @@ func platformWorkerScheduling(service recipe.CriteriaServiceType, instanceType s {Operator: v1.TolerationOpExists}, {Key: "nvidia.com/gpu", Operator: v1.TolerationOpEqual, Value: "present", Effect: v1.TaintEffectNoSchedule}, } - case recipe.CriteriaServiceAny, recipe.CriteriaServiceAKS, recipe.CriteriaServiceOKE, recipe.CriteriaServiceKind, recipe.CriteriaServiceLKE: + case recipe.CriteriaServiceAny, recipe.CriteriaServiceAKS, recipe.CriteriaServiceOKE, recipe.CriteriaServiceOCP, recipe.CriteriaServiceKind, recipe.CriteriaServiceLKE: return nil, nil default: return nil, nil From 1bf668339b3e96f2c970754ddd1912eb681016f1 Mon Sep 17 00:00:00 2001 From: Shai Kapon Date: Mon, 11 May 2026 14:52:36 +0300 Subject: [PATCH 3/8] feat(bundler): implement Direct deployment type (#566) Add Direct component support for kubectl-based deployments (OLM Subscriptions, CRs). --- pkg/bundler/deployer/argocd/argocd.go | 4 + pkg/bundler/deployer/helm/helm.go | 3 + pkg/bundler/deployer/localformat/direct.go | 99 +++++++++++++++++++ pkg/bundler/deployer/localformat/folder.go | 5 + .../templates/install-direct.sh.tmpl | 21 ++++ pkg/bundler/deployer/localformat/writer.go | 16 ++- recipes/data.go | 2 +- 7 files changed, 148 insertions(+), 2 deletions(-) create mode 100644 pkg/bundler/deployer/localformat/direct.go create mode 100644 pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl diff --git a/pkg/bundler/deployer/argocd/argocd.go b/pkg/bundler/deployer/argocd/argocd.go index ae5b4742f..84ffa49a6 100644 --- a/pkg/bundler/deployer/argocd/argocd.go +++ b/pkg/bundler/deployer/argocd/argocd.go @@ -538,6 +538,10 @@ func buildApplicationData(comp recipe.ComponentRef, f localformat.Folder, syncWa data.Version = deployer.NormalizeVersion(comp.Version) } data.Chart = chart + case localformat.KindDirect: + // Direct components are not supported in ArgoCD deployer. + // Direct components use kubectl apply instead of Helm, which doesn't fit + // the Argo CD Application resource model. Use the helm deployer instead. } return data } diff --git a/pkg/bundler/deployer/helm/helm.go b/pkg/bundler/deployer/helm/helm.go index 5a9d09308..77b032151 100644 --- a/pkg/bundler/deployer/helm/helm.go +++ b/pkg/bundler/deployer/helm/helm.go @@ -56,6 +56,7 @@ type ComponentData struct { IsOCI bool Tag string // Git ref for Kustomize-typed components (tag/branch/commit) Path string // Path within the repository to the kustomization + SourceFile string // Path to static YAML file for Direct components } // compile-time interface check @@ -263,6 +264,7 @@ func (g *Generator) buildComponentDataList() ([]ComponentData, error) { IsOCI: strings.HasPrefix(ref.Source, "oci://"), Tag: ref.Tag, Path: ref.Path, + SourceFile: ref.SourceFile, }) } @@ -289,6 +291,7 @@ func toLocalformatComponents( IsOCI: c.IsOCI, Tag: c.Tag, Path: c.Path, + SourceFile: c.SourceFile, Values: values[c.Name], DynamicPaths: dynamic[c.Name], }) diff --git a/pkg/bundler/deployer/localformat/direct.go b/pkg/bundler/deployer/localformat/direct.go new file mode 100644 index 000000000..52a729a43 --- /dev/null +++ b/pkg/bundler/deployer/localformat/direct.go @@ -0,0 +1,99 @@ +// Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package localformat + +import ( + _ "embed" + "fmt" + "os" + "path/filepath" + "strings" + "text/template" + + "github.com/NVIDIA/aicr/pkg/bundler/deployer" + "github.com/NVIDIA/aicr/pkg/errors" + "github.com/NVIDIA/aicr/pkg/recipe" +) + +//go:embed templates/install-direct.sh.tmpl +var installDirectTemplate string + +// writeDirectFolder emits a Direct deployment folder containing: +// - Static YAML manifest file (copied from SourceFile) +// - install.sh script that runs `kubectl apply -f -n ` +// +// Direct components use static YAML manifests embedded in AICR (no Helm, no templating). +// The install.sh script applies the YAML using kubectl instead of helm install. +func writeDirectFolder(outputDir, dir string, idx int, c Component) (Folder, error) { + folderDir, err := deployer.SafeJoin(outputDir, dir) + if err != nil { + return Folder{}, errors.Wrap(errors.ErrCodeInvalidRequest, "unsafe folder path", err) + } + if mkdirErr := os.MkdirAll(folderDir, 0o755); mkdirErr != nil { + return Folder{}, errors.Wrap(errors.ErrCodeInternal, "create direct folder", mkdirErr) + } + + files := make([]string, 0) + + // Copy the static YAML file from the source location + // Strip "recipes/" prefix if present (embedded FS doesn't include it) + sourcePath := strings.TrimPrefix(c.SourceFile, "recipes/") + manifestContent, err := recipe.GetDataProvider().ReadFile(sourcePath) + if err != nil { + return Folder{}, errors.Wrap(errors.ErrCodeInternal, + fmt.Sprintf("failed to read source file %s for component %s", c.SourceFile, c.Name), err) + } + + // Extract the filename from the source path + manifestFilename := filepath.Base(c.SourceFile) + manifestPath, joinErr := deployer.SafeJoin(folderDir, manifestFilename) + if joinErr != nil { + return Folder{}, errors.Wrap(errors.ErrCodeInvalidRequest, "unsafe manifest path", joinErr) + } + if err := writeFile(manifestPath, manifestContent, 0o600); err != nil { + return Folder{}, err + } + files = append(files, filepath.Join(dir, manifestFilename)) + + // Generate install.sh + tmpl, parseErr := template.New("install-direct.sh").Parse(installDirectTemplate) + if parseErr != nil { + return Folder{}, errors.Wrap(errors.ErrCodeInternal, "parse install-direct.sh template", parseErr) + } + + data := struct { + ComponentName string + Namespace string + ManifestFilename string + }{ + ComponentName: c.Name, + Namespace: c.Namespace, + ManifestFilename: manifestFilename, + } + + if err := renderTemplateToFile(tmpl, data, folderDir, "install.sh", 0o755); err != nil { + return Folder{}, err + } + files = append(files, filepath.Join(dir, "install.sh")) + + return Folder{ + Index: idx, + Dir: dir, + Kind: KindDirect, + Name: c.Name, + Parent: c.Name, + Files: files, + }, nil +} diff --git a/pkg/bundler/deployer/localformat/folder.go b/pkg/bundler/deployer/localformat/folder.go index 7234fb098..8264ec4f4 100644 --- a/pkg/bundler/deployer/localformat/folder.go +++ b/pkg/bundler/deployer/localformat/folder.go @@ -24,6 +24,9 @@ const ( // KindLocalHelm: folder contains a generated Chart.yaml + templates/; // install.sh installs ./ as a local chart. KindLocalHelm + // KindDirect: folder contains static YAML manifests; install.sh runs + // kubectl apply instead of helm install. + KindDirect ) // String returns the stable textual name for the kind. Used by logs and @@ -34,6 +37,8 @@ func (k FolderKind) String() string { return "upstream-helm" case KindLocalHelm: return "local-helm" + case KindDirect: + return "direct" default: return "unknown" } diff --git a/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl b/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl new file mode 100644 index 000000000..e2861637a --- /dev/null +++ b/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${SCRIPT_DIR}" + +# Apply the static YAML manifest using kubectl +kubectl apply -f {{ .ManifestFilename }} {{- if .Namespace }} -n {{ .Namespace }}{{- end }} ${DRY_RUN_FLAG:-} ${KUBECONFIG_FLAG:-} diff --git a/pkg/bundler/deployer/localformat/writer.go b/pkg/bundler/deployer/localformat/writer.go index ed306a9af..35852d54a 100644 --- a/pkg/bundler/deployer/localformat/writer.go +++ b/pkg/bundler/deployer/localformat/writer.go @@ -41,6 +41,8 @@ type Component struct { // Kustomize (empty for helm components) Tag string Path string + // Direct (static YAML manifests) + SourceFile string // path to static YAML file for Direct components // Values hydrated by the component bundler Values map[string]any DynamicPaths []string // paths moved from values.yaml into cluster-values.yaml @@ -118,7 +120,7 @@ func renderInputFor(c Component) manifest.RenderInput { // one VendorRecord per pulled upstream chart for inclusion in the bundle's // provenance.yaml. The records slice is empty when VendorCharts is false. // -//nolint:funlen // single-pass component loop; further extraction reduces locality of the index/branch logic. +//nolint:funlen // Central orchestrator handling Helm, Kustomize, and Direct components; single-pass component loop. func Write(ctx context.Context, opts Options) (WriteResult, error) { // Honor cancellation before any filesystem mutation. if err := ctx.Err(); err != nil { @@ -294,6 +296,14 @@ func Write(ctx context.Context, opts Options) (WriteResult, error) { folders = append(folders, f) slog.Info("wrote local chart folder", "index", idx, "dir", dir, "kind", kind.String(), "parent", c.Name) idx++ + case KindDirect: + f, err := writeDirectFolder(opts.OutputDir, dir, idx, c) + if err != nil { + return WriteResult{}, err + } + folders = append(folders, f) + slog.Info("wrote direct folder", "index", idx, "dir", dir, "kind", kind.String(), "parent", c.Name) + idx++ } } return WriteResult{Folders: folders, VendoredCharts: vendorRecords}, nil @@ -331,6 +341,10 @@ func splitDynamicPaths(values map[string]any, dynamicPaths []string) valueSplit // classify determines the primary folder kind for a component. func classify(c Component, manifests map[string][]byte) FolderKind { + // Direct components have SourceFile set + if c.SourceFile != "" { + return KindDirect + } if c.Tag != "" || c.Path != "" { // Kustomize-typed — Task 9 adds actual kustomize build support. return KindLocalHelm diff --git a/recipes/data.go b/recipes/data.go index b85ee049b..80c74264e 100644 --- a/recipes/data.go +++ b/recipes/data.go @@ -18,5 +18,5 @@ import ( "embed" ) -//go:embed overlays/*.yaml mixins/*.yaml registry.yaml validators/catalog.yaml components/*/*.yaml components/*/manifests/*.yaml checks/*/*.yaml +//go:embed overlays/*.yaml mixins/*.yaml registry.yaml validators/catalog.yaml components/*/*.yaml components/*/manifests/*.yaml components/*/direct/*.yaml checks/*/*.yaml var FS embed.FS From e27ae58efc3bca0f5592cce8da5e7e646760351f Mon Sep 17 00:00:00 2001 From: Shai Kapon Date: Mon, 11 May 2026 14:49:27 +0300 Subject: [PATCH 4/8] feat(bundler): add Direct deployment type with kubectl-based install/uninstall (#566) - Add Direct component type for static YAML manifests (OLM Subscriptions, CRs) - Generate install.sh with kubectl apply and namespace auto-creation - Generate uninstall.sh with kubectl delete for Direct components - Update undeploy.sh to detect and handle Direct vs Helm components - Configure OCP overlay with Direct components (nfd-olm, gpu-operator-olm, nfd, gpu-operator) - Skip value extraction optimization for Direct components in bundler --- pkg/bundler/bundler.go | 7 +++ .../deployer/helm/templates/undeploy.sh.tmpl | 45 +++++++++++-------- .../kai_scheduler_present/undeploy.sh | 45 +++++++++++-------- .../helm/testdata/manifest_only/undeploy.sh | 45 +++++++++++-------- .../testdata/mixed_gpu_operator/undeploy.sh | 45 +++++++++++-------- .../testdata/nodewright_present/undeploy.sh | 45 +++++++++++-------- .../testdata/upstream_helm_only/undeploy.sh | 45 +++++++++++-------- pkg/bundler/deployer/localformat/direct.go | 14 ++++++ .../templates/install-direct.sh.tmpl | 5 +++ .../templates/uninstall-direct.sh.tmpl | 21 +++++++++ recipes/overlays/ocp.yaml | 6 +++ 11 files changed, 215 insertions(+), 108 deletions(-) create mode 100644 pkg/bundler/deployer/localformat/templates/uninstall-direct.sh.tmpl diff --git a/pkg/bundler/bundler.go b/pkg/bundler/bundler.go index d23353f3f..fbef9490f 100644 --- a/pkg/bundler/bundler.go +++ b/pkg/bundler/bundler.go @@ -486,6 +486,13 @@ func (b *DefaultBundler) extractComponentValues(ctx context.Context, recipeResul return nil, errors.Wrap(errors.ErrCodeTimeout, "context cancelled during component value extraction", err) } + // Direct components use static YAML manifests and don't need Helm values. + // Skip value extraction and processing for them. + if ref.Type == recipe.ComponentTypeDirect { + componentValues[ref.Name] = make(map[string]any) + continue + } + // Get base values from recipe values, err := recipeResult.GetValuesForComponent(ref.Name) if err != nil { diff --git a/pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl index ab6952ffe..bbb24159e 100644 --- a/pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl @@ -450,9 +450,9 @@ fi # ============================================================================== # Uninstall components in reverse install order # ============================================================================== -# Generic reverse loop: every folder is a Helm release (local-helm or upstream-helm). -# `helm uninstall ` works uniformly for both kinds — that's one of the -# benefits of the uniform local-chart bundle format. +# Generic reverse loop handles both Direct and Helm components: +# - Direct components (with uninstall.sh) use kubectl delete +# - Helm components (local-helm or upstream-helm) use helm uninstall for dir in $(ls -1d "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/ 2>/dev/null | sort -r); do [[ -d "${dir}" ]] || continue base="${dir%/}" @@ -473,22 +473,31 @@ for dir in $(ls -1d "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/ 2>/dev/null | sort -r); d continue fi echo "Uninstalling ${name} (${ns})..." - # For local-helm folders (Chart.yaml + templates/), kubectl-delete the - # rendered templates BEFORE helm uninstall. The templates may carry - # helm.sh/hook annotations (post-install/post-upgrade) which helm does - # not track or clean up on uninstall — so without this pre-delete, the - # operator gets removed but its hook-created CRs (and their finalizers) - # linger. Doing the delete while the controller is still running lets - # finalizers clear naturally. - if [[ -d "${dir}/templates" ]]; then - for tpl in "${dir}/templates/"*.yaml; do - [[ -f "${tpl}" ]] || continue - kubectl delete -n "${ns}" -f "${tpl}" --ignore-not-found --timeout="${HELM_TIMEOUT}s" || true - done + + # Check if this is a Direct component (has uninstall.sh) + if [[ -f "${dir}/uninstall.sh" ]]; then + # Direct component - use kubectl delete via uninstall.sh + ( cd "${dir}" && bash uninstall.sh ) || echo "Warning: uninstall.sh failed for ${name}" >&2 + delete_orphaned_webhooks_for_ns "${ns}" + else + # Helm component - use helm uninstall + # For local-helm folders (Chart.yaml + templates/), kubectl-delete the + # rendered templates BEFORE helm uninstall. The templates may carry + # helm.sh/hook annotations (post-install/post-upgrade) which helm does + # not track or clean up on uninstall — so without this pre-delete, the + # operator gets removed but its hook-created CRs (and their finalizers) + # linger. Doing the delete while the controller is still running lets + # finalizers clear naturally. + if [[ -d "${dir}/templates" ]]; then + for tpl in "${dir}/templates/"*.yaml; do + [[ -f "${tpl}" ]] || continue + kubectl delete -n "${ns}" -f "${tpl}" --ignore-not-found --timeout="${HELM_TIMEOUT}s" || true + done + fi + helm_force_uninstall "${name}" "${ns}" + delete_release_cluster_resources "${name}" "${ns}" + delete_orphaned_webhooks_for_ns "${ns}" fi - helm_force_uninstall "${name}" "${ns}" - delete_release_cluster_resources "${name}" "${ns}" - delete_orphaned_webhooks_for_ns "${ns}" done # Remove nodewright node taints that persist after operator removal. diff --git a/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/undeploy.sh b/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/undeploy.sh index 3f129a39a..4c6539da3 100644 --- a/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/undeploy.sh @@ -449,9 +449,9 @@ fi # ============================================================================== # Uninstall components in reverse install order # ============================================================================== -# Generic reverse loop: every folder is a Helm release (local-helm or upstream-helm). -# `helm uninstall ` works uniformly for both kinds — that's one of the -# benefits of the uniform local-chart bundle format. +# Generic reverse loop handles both Direct and Helm components: +# - Direct components (with uninstall.sh) use kubectl delete +# - Helm components (local-helm or upstream-helm) use helm uninstall for dir in $(ls -1d "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/ 2>/dev/null | sort -r); do [[ -d "${dir}" ]] || continue base="${dir%/}" @@ -468,22 +468,31 @@ for dir in $(ls -1d "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/ 2>/dev/null | sort -r); d continue fi echo "Uninstalling ${name} (${ns})..." - # For local-helm folders (Chart.yaml + templates/), kubectl-delete the - # rendered templates BEFORE helm uninstall. The templates may carry - # helm.sh/hook annotations (post-install/post-upgrade) which helm does - # not track or clean up on uninstall — so without this pre-delete, the - # operator gets removed but its hook-created CRs (and their finalizers) - # linger. Doing the delete while the controller is still running lets - # finalizers clear naturally. - if [[ -d "${dir}/templates" ]]; then - for tpl in "${dir}/templates/"*.yaml; do - [[ -f "${tpl}" ]] || continue - kubectl delete -n "${ns}" -f "${tpl}" --ignore-not-found --timeout="${HELM_TIMEOUT}s" || true - done + + # Check if this is a Direct component (has uninstall.sh) + if [[ -f "${dir}/uninstall.sh" ]]; then + # Direct component - use kubectl delete via uninstall.sh + ( cd "${dir}" && bash uninstall.sh ) || echo "Warning: uninstall.sh failed for ${name}" >&2 + delete_orphaned_webhooks_for_ns "${ns}" + else + # Helm component - use helm uninstall + # For local-helm folders (Chart.yaml + templates/), kubectl-delete the + # rendered templates BEFORE helm uninstall. The templates may carry + # helm.sh/hook annotations (post-install/post-upgrade) which helm does + # not track or clean up on uninstall — so without this pre-delete, the + # operator gets removed but its hook-created CRs (and their finalizers) + # linger. Doing the delete while the controller is still running lets + # finalizers clear naturally. + if [[ -d "${dir}/templates" ]]; then + for tpl in "${dir}/templates/"*.yaml; do + [[ -f "${tpl}" ]] || continue + kubectl delete -n "${ns}" -f "${tpl}" --ignore-not-found --timeout="${HELM_TIMEOUT}s" || true + done + fi + helm_force_uninstall "${name}" "${ns}" + delete_release_cluster_resources "${name}" "${ns}" + delete_orphaned_webhooks_for_ns "${ns}" fi - helm_force_uninstall "${name}" "${ns}" - delete_release_cluster_resources "${name}" "${ns}" - delete_orphaned_webhooks_for_ns "${ns}" done # Remove nodewright node taints that persist after operator removal. diff --git a/pkg/bundler/deployer/helm/testdata/manifest_only/undeploy.sh b/pkg/bundler/deployer/helm/testdata/manifest_only/undeploy.sh index a6179f05f..9c1bd9d0e 100644 --- a/pkg/bundler/deployer/helm/testdata/manifest_only/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/manifest_only/undeploy.sh @@ -449,9 +449,9 @@ fi # ============================================================================== # Uninstall components in reverse install order # ============================================================================== -# Generic reverse loop: every folder is a Helm release (local-helm or upstream-helm). -# `helm uninstall ` works uniformly for both kinds — that's one of the -# benefits of the uniform local-chart bundle format. +# Generic reverse loop handles both Direct and Helm components: +# - Direct components (with uninstall.sh) use kubectl delete +# - Helm components (local-helm or upstream-helm) use helm uninstall for dir in $(ls -1d "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/ 2>/dev/null | sort -r); do [[ -d "${dir}" ]] || continue base="${dir%/}" @@ -468,22 +468,31 @@ for dir in $(ls -1d "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/ 2>/dev/null | sort -r); d continue fi echo "Uninstalling ${name} (${ns})..." - # For local-helm folders (Chart.yaml + templates/), kubectl-delete the - # rendered templates BEFORE helm uninstall. The templates may carry - # helm.sh/hook annotations (post-install/post-upgrade) which helm does - # not track or clean up on uninstall — so without this pre-delete, the - # operator gets removed but its hook-created CRs (and their finalizers) - # linger. Doing the delete while the controller is still running lets - # finalizers clear naturally. - if [[ -d "${dir}/templates" ]]; then - for tpl in "${dir}/templates/"*.yaml; do - [[ -f "${tpl}" ]] || continue - kubectl delete -n "${ns}" -f "${tpl}" --ignore-not-found --timeout="${HELM_TIMEOUT}s" || true - done + + # Check if this is a Direct component (has uninstall.sh) + if [[ -f "${dir}/uninstall.sh" ]]; then + # Direct component - use kubectl delete via uninstall.sh + ( cd "${dir}" && bash uninstall.sh ) || echo "Warning: uninstall.sh failed for ${name}" >&2 + delete_orphaned_webhooks_for_ns "${ns}" + else + # Helm component - use helm uninstall + # For local-helm folders (Chart.yaml + templates/), kubectl-delete the + # rendered templates BEFORE helm uninstall. The templates may carry + # helm.sh/hook annotations (post-install/post-upgrade) which helm does + # not track or clean up on uninstall — so without this pre-delete, the + # operator gets removed but its hook-created CRs (and their finalizers) + # linger. Doing the delete while the controller is still running lets + # finalizers clear naturally. + if [[ -d "${dir}/templates" ]]; then + for tpl in "${dir}/templates/"*.yaml; do + [[ -f "${tpl}" ]] || continue + kubectl delete -n "${ns}" -f "${tpl}" --ignore-not-found --timeout="${HELM_TIMEOUT}s" || true + done + fi + helm_force_uninstall "${name}" "${ns}" + delete_release_cluster_resources "${name}" "${ns}" + delete_orphaned_webhooks_for_ns "${ns}" fi - helm_force_uninstall "${name}" "${ns}" - delete_release_cluster_resources "${name}" "${ns}" - delete_orphaned_webhooks_for_ns "${ns}" done # Remove nodewright node taints that persist after operator removal. diff --git a/pkg/bundler/deployer/helm/testdata/mixed_gpu_operator/undeploy.sh b/pkg/bundler/deployer/helm/testdata/mixed_gpu_operator/undeploy.sh index 00ddbf2ea..896254d6a 100644 --- a/pkg/bundler/deployer/helm/testdata/mixed_gpu_operator/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/mixed_gpu_operator/undeploy.sh @@ -449,9 +449,9 @@ fi # ============================================================================== # Uninstall components in reverse install order # ============================================================================== -# Generic reverse loop: every folder is a Helm release (local-helm or upstream-helm). -# `helm uninstall ` works uniformly for both kinds — that's one of the -# benefits of the uniform local-chart bundle format. +# Generic reverse loop handles both Direct and Helm components: +# - Direct components (with uninstall.sh) use kubectl delete +# - Helm components (local-helm or upstream-helm) use helm uninstall for dir in $(ls -1d "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/ 2>/dev/null | sort -r); do [[ -d "${dir}" ]] || continue base="${dir%/}" @@ -468,22 +468,31 @@ for dir in $(ls -1d "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/ 2>/dev/null | sort -r); d continue fi echo "Uninstalling ${name} (${ns})..." - # For local-helm folders (Chart.yaml + templates/), kubectl-delete the - # rendered templates BEFORE helm uninstall. The templates may carry - # helm.sh/hook annotations (post-install/post-upgrade) which helm does - # not track or clean up on uninstall — so without this pre-delete, the - # operator gets removed but its hook-created CRs (and their finalizers) - # linger. Doing the delete while the controller is still running lets - # finalizers clear naturally. - if [[ -d "${dir}/templates" ]]; then - for tpl in "${dir}/templates/"*.yaml; do - [[ -f "${tpl}" ]] || continue - kubectl delete -n "${ns}" -f "${tpl}" --ignore-not-found --timeout="${HELM_TIMEOUT}s" || true - done + + # Check if this is a Direct component (has uninstall.sh) + if [[ -f "${dir}/uninstall.sh" ]]; then + # Direct component - use kubectl delete via uninstall.sh + ( cd "${dir}" && bash uninstall.sh ) || echo "Warning: uninstall.sh failed for ${name}" >&2 + delete_orphaned_webhooks_for_ns "${ns}" + else + # Helm component - use helm uninstall + # For local-helm folders (Chart.yaml + templates/), kubectl-delete the + # rendered templates BEFORE helm uninstall. The templates may carry + # helm.sh/hook annotations (post-install/post-upgrade) which helm does + # not track or clean up on uninstall — so without this pre-delete, the + # operator gets removed but its hook-created CRs (and their finalizers) + # linger. Doing the delete while the controller is still running lets + # finalizers clear naturally. + if [[ -d "${dir}/templates" ]]; then + for tpl in "${dir}/templates/"*.yaml; do + [[ -f "${tpl}" ]] || continue + kubectl delete -n "${ns}" -f "${tpl}" --ignore-not-found --timeout="${HELM_TIMEOUT}s" || true + done + fi + helm_force_uninstall "${name}" "${ns}" + delete_release_cluster_resources "${name}" "${ns}" + delete_orphaned_webhooks_for_ns "${ns}" fi - helm_force_uninstall "${name}" "${ns}" - delete_release_cluster_resources "${name}" "${ns}" - delete_orphaned_webhooks_for_ns "${ns}" done # Remove nodewright node taints that persist after operator removal. diff --git a/pkg/bundler/deployer/helm/testdata/nodewright_present/undeploy.sh b/pkg/bundler/deployer/helm/testdata/nodewright_present/undeploy.sh index 5bfcb1c24..1653572a8 100644 --- a/pkg/bundler/deployer/helm/testdata/nodewright_present/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/nodewright_present/undeploy.sh @@ -449,9 +449,9 @@ fi # ============================================================================== # Uninstall components in reverse install order # ============================================================================== -# Generic reverse loop: every folder is a Helm release (local-helm or upstream-helm). -# `helm uninstall ` works uniformly for both kinds — that's one of the -# benefits of the uniform local-chart bundle format. +# Generic reverse loop handles both Direct and Helm components: +# - Direct components (with uninstall.sh) use kubectl delete +# - Helm components (local-helm or upstream-helm) use helm uninstall for dir in $(ls -1d "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/ 2>/dev/null | sort -r); do [[ -d "${dir}" ]] || continue base="${dir%/}" @@ -468,22 +468,31 @@ for dir in $(ls -1d "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/ 2>/dev/null | sort -r); d continue fi echo "Uninstalling ${name} (${ns})..." - # For local-helm folders (Chart.yaml + templates/), kubectl-delete the - # rendered templates BEFORE helm uninstall. The templates may carry - # helm.sh/hook annotations (post-install/post-upgrade) which helm does - # not track or clean up on uninstall — so without this pre-delete, the - # operator gets removed but its hook-created CRs (and their finalizers) - # linger. Doing the delete while the controller is still running lets - # finalizers clear naturally. - if [[ -d "${dir}/templates" ]]; then - for tpl in "${dir}/templates/"*.yaml; do - [[ -f "${tpl}" ]] || continue - kubectl delete -n "${ns}" -f "${tpl}" --ignore-not-found --timeout="${HELM_TIMEOUT}s" || true - done + + # Check if this is a Direct component (has uninstall.sh) + if [[ -f "${dir}/uninstall.sh" ]]; then + # Direct component - use kubectl delete via uninstall.sh + ( cd "${dir}" && bash uninstall.sh ) || echo "Warning: uninstall.sh failed for ${name}" >&2 + delete_orphaned_webhooks_for_ns "${ns}" + else + # Helm component - use helm uninstall + # For local-helm folders (Chart.yaml + templates/), kubectl-delete the + # rendered templates BEFORE helm uninstall. The templates may carry + # helm.sh/hook annotations (post-install/post-upgrade) which helm does + # not track or clean up on uninstall — so without this pre-delete, the + # operator gets removed but its hook-created CRs (and their finalizers) + # linger. Doing the delete while the controller is still running lets + # finalizers clear naturally. + if [[ -d "${dir}/templates" ]]; then + for tpl in "${dir}/templates/"*.yaml; do + [[ -f "${tpl}" ]] || continue + kubectl delete -n "${ns}" -f "${tpl}" --ignore-not-found --timeout="${HELM_TIMEOUT}s" || true + done + fi + helm_force_uninstall "${name}" "${ns}" + delete_release_cluster_resources "${name}" "${ns}" + delete_orphaned_webhooks_for_ns "${ns}" fi - helm_force_uninstall "${name}" "${ns}" - delete_release_cluster_resources "${name}" "${ns}" - delete_orphaned_webhooks_for_ns "${ns}" done # Remove nodewright node taints that persist after operator removal. diff --git a/pkg/bundler/deployer/helm/testdata/upstream_helm_only/undeploy.sh b/pkg/bundler/deployer/helm/testdata/upstream_helm_only/undeploy.sh index 1b0f536c0..fe91e0594 100644 --- a/pkg/bundler/deployer/helm/testdata/upstream_helm_only/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/upstream_helm_only/undeploy.sh @@ -449,9 +449,9 @@ fi # ============================================================================== # Uninstall components in reverse install order # ============================================================================== -# Generic reverse loop: every folder is a Helm release (local-helm or upstream-helm). -# `helm uninstall ` works uniformly for both kinds — that's one of the -# benefits of the uniform local-chart bundle format. +# Generic reverse loop handles both Direct and Helm components: +# - Direct components (with uninstall.sh) use kubectl delete +# - Helm components (local-helm or upstream-helm) use helm uninstall for dir in $(ls -1d "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/ 2>/dev/null | sort -r); do [[ -d "${dir}" ]] || continue base="${dir%/}" @@ -468,22 +468,31 @@ for dir in $(ls -1d "${SCRIPT_DIR}"/[0-9][0-9][0-9]-*/ 2>/dev/null | sort -r); d continue fi echo "Uninstalling ${name} (${ns})..." - # For local-helm folders (Chart.yaml + templates/), kubectl-delete the - # rendered templates BEFORE helm uninstall. The templates may carry - # helm.sh/hook annotations (post-install/post-upgrade) which helm does - # not track or clean up on uninstall — so without this pre-delete, the - # operator gets removed but its hook-created CRs (and their finalizers) - # linger. Doing the delete while the controller is still running lets - # finalizers clear naturally. - if [[ -d "${dir}/templates" ]]; then - for tpl in "${dir}/templates/"*.yaml; do - [[ -f "${tpl}" ]] || continue - kubectl delete -n "${ns}" -f "${tpl}" --ignore-not-found --timeout="${HELM_TIMEOUT}s" || true - done + + # Check if this is a Direct component (has uninstall.sh) + if [[ -f "${dir}/uninstall.sh" ]]; then + # Direct component - use kubectl delete via uninstall.sh + ( cd "${dir}" && bash uninstall.sh ) || echo "Warning: uninstall.sh failed for ${name}" >&2 + delete_orphaned_webhooks_for_ns "${ns}" + else + # Helm component - use helm uninstall + # For local-helm folders (Chart.yaml + templates/), kubectl-delete the + # rendered templates BEFORE helm uninstall. The templates may carry + # helm.sh/hook annotations (post-install/post-upgrade) which helm does + # not track or clean up on uninstall — so without this pre-delete, the + # operator gets removed but its hook-created CRs (and their finalizers) + # linger. Doing the delete while the controller is still running lets + # finalizers clear naturally. + if [[ -d "${dir}/templates" ]]; then + for tpl in "${dir}/templates/"*.yaml; do + [[ -f "${tpl}" ]] || continue + kubectl delete -n "${ns}" -f "${tpl}" --ignore-not-found --timeout="${HELM_TIMEOUT}s" || true + done + fi + helm_force_uninstall "${name}" "${ns}" + delete_release_cluster_resources "${name}" "${ns}" + delete_orphaned_webhooks_for_ns "${ns}" fi - helm_force_uninstall "${name}" "${ns}" - delete_release_cluster_resources "${name}" "${ns}" - delete_orphaned_webhooks_for_ns "${ns}" done # Remove nodewright node taints that persist after operator removal. diff --git a/pkg/bundler/deployer/localformat/direct.go b/pkg/bundler/deployer/localformat/direct.go index 52a729a43..1daf9d51f 100644 --- a/pkg/bundler/deployer/localformat/direct.go +++ b/pkg/bundler/deployer/localformat/direct.go @@ -30,6 +30,9 @@ import ( //go:embed templates/install-direct.sh.tmpl var installDirectTemplate string +//go:embed templates/uninstall-direct.sh.tmpl +var uninstallDirectTemplate string + // writeDirectFolder emits a Direct deployment folder containing: // - Static YAML manifest file (copied from SourceFile) // - install.sh script that runs `kubectl apply -f -n ` @@ -88,6 +91,17 @@ func writeDirectFolder(outputDir, dir string, idx int, c Component) (Folder, err } files = append(files, filepath.Join(dir, "install.sh")) + // Generate uninstall.sh + uninstallTmpl, parseErr := template.New("uninstall-direct.sh").Parse(uninstallDirectTemplate) + if parseErr != nil { + return Folder{}, errors.Wrap(errors.ErrCodeInternal, "parse uninstall-direct.sh template", parseErr) + } + + if err := renderTemplateToFile(uninstallTmpl, data, folderDir, "uninstall.sh", 0o755); err != nil { + return Folder{}, err + } + files = append(files, filepath.Join(dir, "uninstall.sh")) + return Folder{ Index: idx, Dir: dir, diff --git a/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl b/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl index e2861637a..ffc17867b 100644 --- a/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl +++ b/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl @@ -17,5 +17,10 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "${SCRIPT_DIR}" +{{- if .Namespace }} +# Create namespace if it doesn't exist +kubectl create namespace {{ .Namespace }} ${DRY_RUN_FLAG:-} ${KUBECONFIG_FLAG:-} 2>/dev/null || true + +{{- end }} # Apply the static YAML manifest using kubectl kubectl apply -f {{ .ManifestFilename }} {{- if .Namespace }} -n {{ .Namespace }}{{- end }} ${DRY_RUN_FLAG:-} ${KUBECONFIG_FLAG:-} diff --git a/pkg/bundler/deployer/localformat/templates/uninstall-direct.sh.tmpl b/pkg/bundler/deployer/localformat/templates/uninstall-direct.sh.tmpl new file mode 100644 index 000000000..879b1cab2 --- /dev/null +++ b/pkg/bundler/deployer/localformat/templates/uninstall-direct.sh.tmpl @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${SCRIPT_DIR}" + +# Delete the static YAML manifest using kubectl +kubectl delete -f {{ .ManifestFilename }} {{- if .Namespace }} -n {{ .Namespace }}{{- end }} ${DRY_RUN_FLAG:-} ${KUBECONFIG_FLAG:-} --ignore-not-found --timeout="${HELM_TIMEOUT:-120}s" || true diff --git a/recipes/overlays/ocp.yaml b/recipes/overlays/ocp.yaml index 62130e6f6..2d0df56d6 100644 --- a/recipes/overlays/ocp.yaml +++ b/recipes/overlays/ocp.yaml @@ -36,11 +36,14 @@ spec: # NFD must be installed before GPU Operator - name: nfd-olm type: Direct + namespace: openshift-nfd + sourceFile: recipes/components/nfd-olm/direct/olm.yaml # Node Feature Discovery CR (custom resource configuration) # Overrides the Helm-based nfd from base with a Direct CR - name: nfd type: Direct + namespace: openshift-nfd sourceFile: recipes/components/nfd/direct/nodefeaturediscovery.yaml dependencyRefs: - nfd-olm @@ -49,6 +52,8 @@ spec: # This installs the operator; GPU configuration is handled separately - name: gpu-operator-olm type: Direct + namespace: nvidia-gpu-operator + sourceFile: recipes/components/gpu-operator-olm/direct/olm.yaml dependencyRefs: - nfd-olm @@ -57,6 +62,7 @@ spec: # The operator itself is installed via OLM (gpu-operator-olm above) - name: gpu-operator type: Direct + namespace: nvidia-gpu-operator sourceFile: recipes/components/gpu-operator/direct/clusterpolicy.yaml dependencyRefs: - gpu-operator-olm From e2d511b835d8ed1d24bcfee0d1e66fe40d13246b Mon Sep 17 00:00:00 2001 From: Shai Kapon Date: Tue, 12 May 2026 13:56:00 +0300 Subject: [PATCH 5/8] feat(bundler): add Direct deployment type with OpenShift OLM integration (#566) Implement Direct deployment type for static YAML manifests with embedded OLM operator lifecycle management. This enables native OpenShift operator installation through Operator Lifecycle Manager with automatic CSV readiness verification. - Add Direct deployment type alongside Helm and Kustomize with embedded CSV wait logic for OLM components - Implement component registry validation enforcing exactly one deployment type (helm, kustomize, or direct) per component - Add gpu-operator-olm and nfd-olm components with OpenShift OLM Subscription/OperatorGroup manifests - Generate install/uninstall scripts with conditional CSV deletion logic for OLM-managed operators - Update ComponentRef merge logic to clear type-incompatible fields when deployment type changes - Add comprehensive OpenShift deployment documentation with OLM principles and workflow examples Signed-off-by: Shai Kapon --- docs/design/007-openshift-integration.md | 239 ++++----- docs/integrator/openshift.md | 492 ++++++++++++++++++ pkg/bundler/deployer/helm/helm.go | 3 + pkg/bundler/deployer/localformat/direct.go | 2 + .../templates/install-direct.sh.tmpl | 24 + .../templates/uninstall-direct.sh.tmpl | 10 + pkg/bundler/deployer/localformat/writer.go | 1 + pkg/recipe/components.go | 58 ++- pkg/recipe/components_test.go | 91 +++- pkg/recipe/metadata.go | 75 ++- pkg/recipe/metadata_test.go | 3 +- .../gpu-operator-olm/direct/olm.yaml | 21 - .../gpu-operator/direct/clusterpolicy.yaml | 45 +- recipes/components/nfd-olm/direct/olm.yaml | 5 - .../nfd/direct/nodefeaturediscovery.yaml | 15 +- recipes/registry.yaml | 4 + site/.vitepress/config.ts | 1 + 17 files changed, 864 insertions(+), 225 deletions(-) create mode 100644 docs/integrator/openshift.md diff --git a/docs/design/007-openshift-integration.md b/docs/design/007-openshift-integration.md index 17616abf1..b248ff344 100644 --- a/docs/design/007-openshift-integration.md +++ b/docs/design/007-openshift-integration.md @@ -16,13 +16,6 @@ Helm installations, the **preferred and recommended installation mechanism** for operators is the **Operator Lifecycle Manager (OLM)**. OLM is the OpenShift-native way to install, manage, and upgrade operators. -The current AICR architecture produces Helm artifacts but does not produce -OLM-compatible operator bundles. - -This ADR explores how AICR can support OpenShift users by generating -OLM-compatible operator bundles alongside (or instead of) Helm charts for -OpenShift-targeted recipes. - ## OLM Lifecycle The Operator Lifecycle Manager (OLM) is a declarative framework for managing @@ -30,44 +23,30 @@ operator installation, upgrades, and dependency resolution in OpenShift. ### Key OLM Resources -**ClusterServiceVersion (CSV)** — Defines a single version of an operator -with metadata, RBAC, CRDs, and install strategy (Deployment spec). - -**Bundle** — Immutable container image containing CSVs, CRDs, and metadata -annotations. The deployment unit; never executed, only unpacked by OLM. - -**CatalogSource** — Points to an index image organizing bundles into -channels (`stable`, `fast`, `candidate`). - **Subscription** — User-facing resource declaring intent to install an operator from a catalog, specifying package name and channel. -**InstallPlan** — Generated by OLM with resolved dependencies; may require -manual approval. +**OperatorGroup** — Defines the set of namespaces that an operator can watch. +Required for operator installation and scoping. + +**ClusterServiceVersion (CSV)** — Defines a single version of an operator +with metadata, RBAC, CRDs, and install strategy (Deployment spec). Automatically +generated by OLM when a Subscription is created. ### Lifecycle Flow ``` -Subscription created → CatalogSource queried → InstallPlan generated -→ CSV resources applied → Operator deployed → Auto-upgrade on new bundle +Subscription created → CSV resources applied → Operator deployed ``` -### Key Differences from Helm - -- **Automatic dependency resolution** vs manual installation -- **Channel-based auto-upgrades** vs explicit `helm upgrade` -- **Immutable bundle images** vs mutable chart versions -- **RBAC pre-validation** vs runtime failures -- **OpenShift Console UI integration** vs no visibility - ## Proposed Architecture ### Dual-Component Pattern -Each OpenShift operator requires **two AICR components**: +Each OpenShift component requires **two AICR components**: 1. **Operator component** (`-olm`) — Installs the operator via OLM - - Contains Subscription, CatalogSource, Namespace, OperatorGroup + - Contains Subscription, CatalogSource, OperatorGroup - Uses `direct` deployment type - Example: `gpu-operator-olm` @@ -87,6 +66,7 @@ external repos), `direct` points to **static YAML manifests embedded in AICR**. ```yaml - name: gpu-operator-olm displayName: GPU Operator (OLM) + olm: true direct: sourceFile: recipes/components/gpu-operator-olm/direct/olm.yaml ``` @@ -129,7 +109,7 @@ For `direct` components, the bundler: 1. Reads the file at `sourceFile` 2. Copies it to the bundle output directory unchanged 3. Generates `install.sh` wrapper script that applies the YAML -4. If `waitScript` is specified, templates and includes wait.sh execution +4. If `olm: true`, embeds CSV wait logic directly in `install.sh` 5. No value merging, no templating on YAML — direct passthrough **Bundle output:** @@ -143,70 +123,49 @@ bundles/ └── install.sh # Generated wrapper script ``` -### Post-Install Wait Scripts +### Post-Install OLM Wait Logic -Components can optionally include a wait script that runs after resource -installation to verify deployment readiness. This is especially important for -OLM components where the Subscription apply returns immediately but the operator -may take time to install. +OLM components require post-installation waiting because the Subscription resource +applies immediately but the operator installation happens asynchronously. The +bundler embeds CSV readiness polling directly into the install script for OLM +components. **Registry configuration:** ```yaml - name: gpu-operator-olm displayName: GPU Operator (OLM) + olm: true # Enables built-in CSV wait logic direct: sourceFile: recipes/components/gpu-operator-olm/direct/olm.yaml - waitScript: recipes/components/gpu-operator-olm/direct/wait.sh.tmpl ``` -**Source structure:** -``` -recipes/components/gpu-operator-olm/direct/ -├── olm.yaml # OLM resources -└── wait.sh.tmpl # Wait script template -``` - -**Template variables:** -- `{{.Namespace}}` — Target namespace for the component -- `{{.ComponentName}}` — Component name from registry - **Bundler behavior:** -1. Generates `install.sh` wrapper script that runs `kubectl apply -f olm.yaml` -2. When `direct.waitScript` is specified: - - Reads the template file at `waitScript` path - - Templates it with namespace and component name - - Outputs `wait.sh` in bundle - - Updates `install.sh` to run `wait.sh` after applying resources -3. Makes all scripts executable - -**Wait script contract:** -- Runs after `kubectl apply -f olm.yaml` -- Polls for readiness condition in a loop -- Has internal timeout configuration -- Exits 0 on success, non-zero on timeout or failure -- For OLM components: waits for CSV `status.phase: Succeeded` -- For CR components: waits for custom resource status conditions - -**Example bundler output:** -``` -bundles/ -├── gpu-operator-olm/ -│ ├── olm.yaml -│ ├── install.sh # Generated wrapper script -│ └── wait.sh # Generated from wait.sh.tmpl -└── gpu-operator/ - ├── clusterpolicy.yaml - ├── install.sh # Generated wrapper script - └── wait.sh # Generated from wait.sh.tmpl -``` -**Deployment flow with wait scripts:** +For components with `olm: true`: +1. Generates `install.sh` that applies the YAML manifest +2. Embeds CSV readiness polling directly in `install.sh` after `kubectl apply` +3. Polls for ClusterServiceVersion with `status.phase: Succeeded` +4. Uses a fixed timeout (5 minutes) and polling interval (5 seconds) +5. Exits 0 on success, non-zero on timeout or failure + +For components without `olm: true`: +- No wait logic — install.sh returns immediately after `kubectl apply` + +**Wait logic contract:** +- Searches for CSV in the component's namespace +- Polls `kubectl get csv -n -o json` for phase: Succeeded +- Timeout: 300 seconds (5 minutes) +- Poll interval: 5 seconds +- Exits 0 when CSV reaches Succeeded phase +- Exits 1 on timeout or error + +**Deployment flow:** ```bash -# Install OLM resources -./bundles/gpu-operator-olm/install.sh # Applies olm.yaml + runs wait.sh +# Install OLM resources - waits for CSV ready +./bundles/gpu-operator-olm/install.sh # Applies olm.yaml + waits for CSV # Install CR (depends on operator being ready) -./bundles/gpu-operator/install.sh # Applies clusterpolicy.yaml + runs wait.sh +./bundles/gpu-operator/install.sh # Applies clusterpolicy.yaml, returns immediately ``` ### Example: GPU Operator on OpenShift @@ -216,16 +175,15 @@ bundles/ # Operator installation via OLM - name: gpu-operator-olm displayName: GPU Operator (OLM) + olm: true # Enables built-in CSV wait logic direct: sourceFile: recipes/components/gpu-operator-olm/direct/olm.yaml - waitScript: recipes/components/gpu-operator-olm/direct/wait.sh.tmpl # ClusterPolicy CR - name: gpu-operator displayName: GPU Operator direct: sourceFile: recipes/components/gpu-operator/direct/clusterpolicy.yaml - waitScript: recipes/components/gpu-operator/direct/wait.sh.tmpl dependsOn: - gpu-operator-olm ``` @@ -235,11 +193,6 @@ bundles/ `recipes/components/gpu-operator-olm/direct/olm.yaml`: ```yaml --- -apiVersion: v1 -kind: Namespace -metadata: - name: nvidia-gpu-operator ---- apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: @@ -247,15 +200,6 @@ metadata: namespace: nvidia-gpu-operator --- apiVersion: operators.coreos.com/v1alpha1 -kind: CatalogSource -metadata: - name: certified-operators - namespace: openshift-marketplace -spec: - sourceType: grpc - image: registry.redhat.io/redhat/certified-operator-index:v4.14 ---- -apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: name: gpu-operator-certified @@ -292,13 +236,13 @@ Recipe resolver selects overlay with gpu-operator-olm + gpu-operator ↓ Bundler processes direct components: - Copies recipes/components/gpu-operator-olm/direct/olm.yaml → bundles/ - - Templates recipes/components/gpu-operator-olm/direct/wait.sh.tmpl → bundles/gpu-operator-olm/wait.sh + - Generates bundles/gpu-operator-olm/install.sh with embedded CSV wait logic (olm: true) - Copies recipes/components/gpu-operator/direct/clusterpolicy.yaml → bundles/ - - Templates recipes/components/gpu-operator/direct/wait.sh.tmpl → bundles/gpu-operator/wait.sh + - Generates bundles/gpu-operator/install.sh without wait logic (olm: false) ↓ User deployment: 1. ./bundles/gpu-operator-olm/install.sh # Applies resources + waits for CSV ready - 2. ./bundles/gpu-operator/install.sh # Applies resources + waits for ClusterPolicy ready + 2. ./bundles/gpu-operator/install.sh # Applies resources, returns immediately ``` ## Consequences @@ -308,50 +252,39 @@ User deployment: - **OpenShift-native installation** — Uses OLM (the recommended OpenShift mechanism) instead of forcing Helm on OCP users. Operators appear in OpenShift Console OperatorHub with proper lifecycle management. -- **No bundle image management** — Unlike full OLM integration, AICR does not - need to build, push, or maintain operator bundle container images. All - manifests are static YAML embedded in AICR. -- **Reuses existing patterns** — The `direct` deployment type fits naturally - alongside `helm` and `kustomize`. Component registry, dependencies, and - bundler architecture remain unchanged. -- **Simple deployment model** — Users run `install.sh` scripts; no need to - understand OLM mechanics or manually sequence kubectl commands. -- **Deployment verification** — Wait scripts provide immediate feedback on - installation success/failure instead of leaving users to manually check - operator readiness. -- **Clean separation of concerns** — Dual-component pattern (operator vs CR) - separates operator installation from configuration, making it easy to swap - CR configurations without touching operator installation. -- **Zero external dependencies** — All manifests embedded at build time; - no runtime dependencies on external chart repos, git repos, or catalog sources - (beyond the OpenShift-provided certified operator index). +- **Simple deployment with verification** — Users run `install.sh` scripts with + embedded CSV wait logic that provides immediate feedback on installation + success/failure. No need to understand OLM mechanics, manually sequence kubectl + commands, or check operator readiness separately. +- **Clean implementation** — The `direct` deployment type fits naturally alongside + `helm` and `kustomize`, reusing existing patterns and architecture. All manifests + are static YAML embedded in AICR with no runtime dependencies on external chart + repos or operator bundle container images to build/push/maintain. ### Negative - **Dual-component overhead** — Each operator requires two registry entries, two component directories, and explicit dependency wiring. This doubles the per-operator maintenance surface. -- **Manual wait script implementation** — Each component needs a custom - `wait.sh.tmpl`. No shared templates or centralized wait logic; every - component reimplements polling. -- **Timeout configuration scattered** — Timeout values live inside individual - wait scripts, not in a central configuration. Changing timeouts requires - editing templates across multiple components. +- **Fixed wait configuration** — CSV wait timeout (5 minutes) and polling + interval (5 seconds) are hardcoded in the bundler. No per-component + customization available without code changes. +- **OLM-only wait logic** — Only OLM components (`olm: true`) get automated + wait logic. CR components have no built-in wait mechanism; users must verify + readiness manually or rely on operator reconciliation. ### Neutral -- **New deployment type** — `direct` is a third deployment mechanism alongside - `helm` and `kustomize`. Bundler complexity increases, but the pattern is - consistent with existing types. -- **Component count growth** — Logical operators (gpu-operator, network-operator) - become two components each. Registry size increases, but dependency graph - remains manageable. +- **System growth** — Introduces a third deployment type (`direct`) alongside + `helm` and `kustomize`, and each operator becomes two components (operator + + CR). Bundler complexity and registry size increase, but patterns remain + consistent with existing types and dependency graph stays manageable. - **Static manifests trade-off** — Embedding YAML in AICR guarantees reproducibility and eliminates runtime dependencies, but means updates require AICR releases instead of just pointing to new chart versions. -- **Wait scripts are optional** — Components can omit `waitScript` if they - don't need deployment verification. The pattern supports both simple (apply - only) and verified (apply + wait) deployments. +- **Declarative OLM integration** — Components declare `olm: true` to enable CSV + wait logic or omit it for apply-only behavior. Simple boolean flag supports + both deployment patterns without custom scripts. ## Alternatives Considered @@ -372,23 +305,37 @@ charts from the static YAML files and reuse the existing Helm bundler flow. 3. Bundle output references this temporary chart 4. Deployment uses `helm install` instead of `kubectl apply` +The `direct` deployment type is simpler: copy YAML, generate install.sh with +optional CSV wait logic, done. No need to force-fit static manifests into the +Helm model. + +### Explicit Wait Script Templates + +Instead of embedding CSV wait logic in the bundler, support flexible per-component +wait script templates that the bundler would template and include in the bundle. + +**Approach:** +1. Component specifies `waitScript: recipes/components//direct/wait.sh.tmpl` +2. Bundler reads the template, renders it with `{{.Namespace}}` and `{{.ComponentName}}` +3. Outputs `wait.sh` alongside `install.sh` in the bundle +4. `install.sh` calls `wait.sh` after `kubectl apply` + **Why rejected:** -- **Unnecessary indirection** — Adds Helm packaging overhead (Chart.yaml, - templates directory) for manifests that need no templating or value merging. - The Helm chart becomes an empty wrapper. -- **Confusing abstraction** — Users expect Helm charts to have values files, - templating, and chart dependencies. A chart with none of these is misleading. -- **Bundler complexity** — Requires bundler to generate Chart.yaml metadata, - manage temporary directories, and package charts for manifests that are - already complete. -- **No Helm benefits** — The OLM manifests don't use Helm features (templating, - hooks, dependencies). Using Helm provides no value over `kubectl apply`. -- **Wait script integration** — Helm has its own hook system; integrating - custom wait scripts with Helm hooks is more complex than standalone scripts. - -The `direct` deployment type is simpler: copy YAML, generate wrapper script, -done. No need to force-fit static manifests into the Helm model. +- **Limited to install.sh deployment** — Wait scripts only apply to the `install.sh` + reference implementation, which serves as an example deployment method. The primary + deployment target is ArgoCD, which handles synchronization and health checks through + its own mechanisms (resource hooks, sync waves, health assessments). Per-component + wait scripts would be unused in the ArgoCD flow. +- **Template maintenance overhead** — Each component needs its own wait.sh.tmpl + file. For OLM components, all templates would implement identical CSV polling + logic, leading to duplication across components. +- **Over-engineered for limited use case** — Adding a full templating system for + wait scripts provides flexibility that only benefits manual `install.sh` deployments. + A simple `olm: true` flag is sufficient for embedding CSV wait logic where needed. + +The embedded `olm: true` approach targets the actual need (CSV readiness in install.sh) +without over-investing in infrastructure for a secondary deployment method. ## References diff --git a/docs/integrator/openshift.md b/docs/integrator/openshift.md new file mode 100644 index 000000000..bd7a17c12 --- /dev/null +++ b/docs/integrator/openshift.md @@ -0,0 +1,492 @@ +# OpenShift Deployment + +Deploy NVIDIA GPU and networking components on Red Hat OpenShift Container Platform (OCP) using the Operator Lifecycle Manager (OLM). + +## Overview + +AICR supports OpenShift through a **Direct deployment type** that leverages OpenShift's native Operator Lifecycle Manager (OLM) instead of Helm charts. This approach aligns with OpenShift best practices and provides proper operator lifecycle management through the OpenShift Console. + +### Why OLM Instead of Helm? + +While OpenShift supports Helm installations, **OLM is the recommended and preferred installation mechanism** for operators on OpenShift because: + +- **Native Integration** — Operators appear in the OpenShift Console with full lifecycle visibility +- **Automatic Updates** — OLM handles operator updates through subscription channels +- **Dependency Management** — OLM resolves operator dependencies and installation ordering +- **Security & RBAC** — Proper integration with OpenShift's security model and ClusterServiceVersions (CSVs) + +### Key Concepts + +**Direct Deployment Type** + +The `Direct` deployment type is specifically designed for OpenShift OLM integration: +- Static YAML manifests embedded in AICR (no external chart repos or git dependencies) +- Simple `kubectl apply` deployment model with generated install scripts +- Automatic CSV wait logic for OLM components +- Works alongside `Helm` and `Kustomize` deployment types for other components + +**Dual-Component Pattern** + +Each OpenShift operator requires **two separate AICR components**: + +1. **Operator Component** (`-olm`) — Installs the operator itself via OLM + - Contains: `Subscription`, `OperatorGroup`, optionally `CatalogSource` + - Uses `Direct` deployment type with `olm: true` flag + - Example: `gpu-operator-olm`, `nfd-olm` + +2. **CR Component** (``) — Configures the operator's behavior + - Contains: Custom Resources (e.g., `ClusterPolicy` for GPU Operator) + - Uses `Direct` deployment type + - Depends on the corresponding `-olm` component + - Example: `gpu-operator`, `nfd` + +This separation ensures the operator is fully installed and ready before applying its configuration. + +### OLM Resource Flow + +``` +User creates Subscription + ↓ +OLM creates ClusterServiceVersion (CSV) + ↓ +CSV deploys operator Deployment/DaemonSet + ↓ +Operator becomes ready (CSV phase: Succeeded) + ↓ +User applies Custom Resources + ↓ +Operator reconciles workload +``` + +## Complete Deployment Workflow + +This section demonstrates the end-to-end deployment process on OpenShift. + +### Prerequisites + +- OpenShift Container Platform 4.14+ (Kubernetes 1.27+) +- `kubectl` or `oc` CLI configured with cluster access +- Cluster-admin privileges for operator installation +- GPU nodes properly labeled (if deploying GPU operators) + +### 1. Generate Recipe + +Generate a recipe for OpenShift by specifying `--service ocp`: + +```bash +aicr recipe \ + --service ocp \ + --accelerator h100 \ + --os rhel \ + --intent training \ + --output recipe.yaml +``` + +**Expected Output:** + +```text +[cli] building recipe from criteria: criteria=criteria(service=ocp, accelerator=h100, intent=training, os=rhel) +[cli] recipe generation completed: output=recipe.yaml components=4 overlays=3 +``` + +**Verify Recipe Contents:** + +```bash +cat recipe.yaml +``` + +The recipe will reference OpenShift-specific components using the `Direct` deployment type: + +```yaml +apiVersion: aicr.nvidia.com/v1alpha1 +kind: RecipeMetadata +metadata: + name: ocp-h100-rhel-training-recipe + +spec: + components: + # NFD Operator installation via OLM + - name: nfd-olm + type: Direct + namespace: openshift-nfd + sourceFile: recipes/components/nfd-olm/direct/olm.yaml + + # NFD Custom Resource configuration + - name: nfd + type: Direct + namespace: openshift-nfd + sourceFile: recipes/components/nfd/direct/nodefeaturediscovery.yaml + dependencyRefs: + - nfd-olm + + # GPU Operator installation via OLM + - name: gpu-operator-olm + type: Direct + namespace: nvidia-gpu-operator + sourceFile: recipes/components/gpu-operator-olm/direct/olm.yaml + dependencyRefs: + - nfd-olm + + # GPU Operator Custom Resource configuration + - name: gpu-operator + type: Direct + namespace: nvidia-gpu-operator + sourceFile: recipes/components/gpu-operator/direct/clusterpolicy.yaml + dependencyRefs: + - gpu-operator-olm + - nfd +``` + +### 2. Generate Bundle + +Create deployment bundle from the recipe: + +```bash +aicr bundle \ + --recipe recipe.yaml \ + --output ./ocp-bundle +``` + +**Expected Output:** + +```text +[cli] generating bundle: deployer=helm type=Helm per-component bundle recipe=recipe.yaml output=./ocp-bundle +[cli] bundle generated: type=Helm per-component bundle files=12 size_bytes=45120 output_dir=./ocp-bundle +``` + +**Bundle Directory Structure:** + +```bash +tree ocp-bundle +``` + +```text +ocp-bundle/ +├── deploy.sh # Orchestration script - deploys all components in order +├── undeploy.sh # Orchestration script - removes all components +├── 001-nfd-olm/ +│ ├── olm.yaml # Subscription + OperatorGroup manifests +│ ├── install.sh # Generated install script with CSV wait +│ └── uninstall.sh # Generated cleanup script +├── 002-nfd/ +│ ├── nodefeaturediscovery.yaml # NodeFeatureDiscovery CR +│ ├── install.sh # Generated install script +│ └── uninstall.sh # Generated cleanup script +├── 003-gpu-operator-olm/ +│ ├── olm.yaml # Subscription + OperatorGroup manifests +│ ├── install.sh # Generated install script with CSV wait +│ └── uninstall.sh # Generated cleanup script +├── 004-gpu-operator/ +│ ├── clusterpolicy.yaml # ClusterPolicy CR +│ ├── install.sh # Generated install script +│ └── uninstall.sh # Generated cleanup script +├── recipe.yaml # Recipe used to generate bundle +├── checksums.txt # SHA256 checksums for verification +└── README.md # Bundle documentation +``` + +### 3. Deploy an OLM Component + +Deploy operators through OLM subscriptions. The install scripts automatically create namespaces and wait for operator readiness. + +**Deploy NFD Operator:** + +```bash +cd ocp-bundle +./001-nfd-olm/install.sh +``` + +**Expected Output:** + +```text +namespace/openshift-nfd created +operatorgroup.operators.coreos.com/openshift-nfd-operatorgroup created +subscription.operators.coreos.com/nfd created +Waiting for OLM operator CSV to reach Succeeded phase... +CSV phase: Installing, waiting... (5s/300s) +CSV phase: Installing, waiting... (10s/300s) +CSV phase: Succeeded +CSV reached Succeeded phase +``` + +The script automatically polls for CSV readiness with a 5-minute timeout. + + +### 4. Verify Operator Installation + +Check that ClusterServiceVersions are in `Succeeded` phase: + +```bash +kubectl get csv -n openshift-nfd +``` + +**Expected Output:** + +```text +NAME DISPLAY VERSION REPLACES PHASE +nfd.v4.16.0-202601091828 Node Feature Discovery Operator 4.16.0 Succeeded +``` + +**Check Operator Pods:** + +```bash +kubectl get pods -n openshift-nfd +``` + +Operator pods should be in `Running` status. + +### 5. Deploy Custom Resources + +Once operators are ready, deploy the Custom Resources that configure their behavior: + +**Deploy NFD Custom Resource:** + +```bash +./002-nfd/install.sh +``` + +**Expected Output:** + +```text +nodefeaturediscovery.nfd.openshift.io/nfd-instance created +``` + +**Deploy GPU Operator ClusterPolicy:** + +```bash +./004-gpu-operator/install.sh +``` + +**Expected Output:** + +```text +clusterpolicy.nvidia.com/cluster-policy created +``` + +### 6. Monitor Component Rollout + +Watch the operators and DaemonSets as they deploy across nodes: + +**Check NFD Pods:** + +```bash +kubectl get pods -n openshift-nfd +``` + +```text +NAME READY STATUS RESTARTS AGE +nfd-controller-manager-7846858557-x6qn4 1/1 Running 0 5m1s +nfd-gc-5f975dd6d4-lrnxc 1/1 Running 0 4m21s +nfd-master-d5f49b9b7-hl9t8 1/1 Running 0 4m21s +nfd-worker-68rnq 1/1 Running 0 4m21s +nfd-worker-jh6k8 1/1 Running 0 4m21s +nfd-worker-qj8zn 1/1 Running 0 4m21s +``` + + + +### 7. Capture Snapshot + +After deployment, capture a snapshot of the cluster state for validation: + +```bash +aicr snapshot --output snapshot.yaml +``` + +**Expected Output:** + +```text +[cli] deploying agent: namespace=default +[cli] agent deployed successfully +[cli] waiting for Job completion: job=aicr-snapshot timeout=5m0s +[cli] job completed successfully +[cli] snapshot saved to file: path=snapshot.yaml +``` + +### 8. Validate Deployment + +Validate the deployed components against the recipe and snapshot: + +```bash +aicr validate \ + --recipe recipe.yaml \ + --snapshot snapshot.yaml +``` + +**Expected Output:** + +```text +[cli] loading recipe: uri=recipe.yaml +[cli] loading snapshot: uri=snapshot.yaml +[cli] running validation: phases=[deployment] +[cli] readiness pre-flight: constraints=3 +[cli] readiness constraint passed: name=K8s.crd.clusterpolicies.nvidia.com.established expected=true actual=true +[cli] readiness constraint passed: name=K8s.crd.nodefeaturediscoveries.nfd.openshift.io.established expected=true actual=true +[cli] readiness constraint passed: name=K8s.server.version expected=>= 1.27 actual=v1.27.4 +[cli] phase completed: phase=deployment status=passed validators=2 passed=2 failed=0 +[cli] all phases completed: phases=1 + +=== Validation Results === +[cli] phase result: phase=deployment status=passed duration=8.2s +``` + +## Automated Deployment with deploy.sh + +As an alternative to running individual component install scripts, the bundle includes top-level orchestration scripts that deploy all components in the correct dependency order. + +**Deploy all components:** + +```bash +cd ocp-bundle +./deploy.sh +``` + +The `deploy.sh` script: +- Runs pre-flight checks (validates cluster state, checks for stale resources) +- Loops through all `NNN-*/` directories in order +- Executes each component's `install.sh` script +- Waits for OLM operators to reach CSV `Succeeded` phase (for components with `olm: true`) +- Provides retry logic with exponential backoff on failures +- Continues asynchronously for operator-based components + +**Supported flags:** + +| Flag | Description | +|------|-------------| +| `--no-wait` | Skip Helm/kubectl wait (applies manifests but doesn't block on readiness) | +| `--best-effort` | Continue past individual component failures instead of exiting | +| `--retries N` | Retry failed operations N times with backoff (default: 5) | + +**Example with options:** + +```bash +./deploy.sh --best-effort --retries 3 +``` + +**Undeploy all components:** + +```bash +./undeploy.sh +``` + +The `undeploy.sh` script: +- Removes components in reverse dependency order (OLM CRs before operators) +- Deletes ClusterServiceVersions before Subscriptions +- Optionally preserves namespaces and PVCs + +**Undeploy flags:** + +| Flag | Description | +|------|-------------| +| `--keep-namespaces` | Preserve namespaces after removing components | +| `--delete-pvcs` | Delete PersistentVolumeClaims (default: preserve) | +| `--timeout SECONDS` | Timeout for kubectl/helm operations (default: 120s) | +| `--skip-preflight` | Skip pre-flight finalizer checks | + +**Example:** + +```bash +./undeploy.sh --keep-namespaces --timeout 300 +``` + +These orchestration scripts simplify deployment but the individual component `install.sh`/`uninstall.sh` scripts provide more granular control when needed. + +## OpenShift Console Integration + +After deploying via OLM, operators are visible in the OpenShift Console: + +**View Installed Operators:** +1. Navigate to **Operators → Installed Operators** +2. Select namespace: `nvidia-gpu-operator` or `openshift-nfd` +3. View operator details, status, and managed resources + +**View Custom Resources:** +1. Click on the installed operator +2. Navigate to the **ClusterPolicy** or **NodeFeatureDiscovery** tab +3. View CR details and status + +This provides a GUI alternative to `kubectl` for monitoring operator health. + +## Cleanup + +**Option 1: Automated cleanup (recommended)** + +Use the top-level `undeploy.sh` script to remove all components in reverse dependency order: + +```bash +cd ocp-bundle +./undeploy.sh +``` + +This automatically removes CRs before operators and CSVs before Subscriptions. + +**Option 2: Manual cleanup** + +Remove individual components using their uninstall scripts in reverse dependency order: + +```bash +# Remove Custom Resources first +./004-gpu-operator/uninstall.sh +./002-nfd/uninstall.sh + +# Remove Operators +./003-gpu-operator-olm/uninstall.sh +./001-nfd-olm/uninstall.sh +``` + +The uninstall scripts automatically remove ClusterServiceVersions before deleting Subscriptions to ensure clean operator removal. + +## Understanding the Install Scripts + +### OLM Component Install Script + +For components with `olm: true`, the generated `install.sh` includes: + +1. **Namespace Creation** — Creates the operator namespace if it doesn't exist +2. **Manifest Application** — Applies the OLM resources (Subscription, OperatorGroup) +3. **CSV Wait Logic** — Polls for ClusterServiceVersion to reach `Succeeded` phase + - Timeout: 300 seconds (5 minutes) + - Poll interval: 5 seconds + - Exits with code 0 on success, 1 on timeout/failure + +**Example script excerpt:** + +```bash +# Create namespace +kubectl create namespace nvidia-gpu-operator 2>/dev/null || true + +# Apply OLM resources +kubectl apply -f olm.yaml -n nvidia-gpu-operator + +# Wait for CSV +echo "Waiting for OLM operator CSV to reach Succeeded phase..." +TIMEOUT=300 +while [ $ELAPSED -lt $TIMEOUT ]; do + CSV_PHASE=$(kubectl get csv -n nvidia-gpu-operator -o jsonpath='{.items[0].status.phase}' 2>/dev/null || echo "") + if [ "$CSV_PHASE" = "Succeeded" ]; then + echo "CSV reached Succeeded phase" + exit 0 + fi + sleep 5 + ELAPSED=$((ELAPSED + 5)) +done +echo "ERROR: Timeout waiting for CSV" +exit 1 +``` + +### CR Component Install Script + +For Custom Resource components (without `olm: true`): + +1. **Namespace Creation** — Creates the namespace if specified +2. **Manifest Application** — Applies the Custom Resource +3. **No Wait Logic** — Returns immediately after apply + +The operator installed via OLM handles reconciliation asynchronously. + + +## References + +- [NVIDIA GPU Operator on OpenShift](https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/install-gpu-ocp.html) +- [Understanding OLM](https://docs.redhat.com/en/documentation/openshift_container_platform/4.2/html/operators/understanding-the-operator-lifecycle-manager-olm) +- [ADR-007: OpenShift Integration](../design/007-openshift-integration.md) diff --git a/pkg/bundler/deployer/helm/helm.go b/pkg/bundler/deployer/helm/helm.go index 77b032151..5fe071a93 100644 --- a/pkg/bundler/deployer/helm/helm.go +++ b/pkg/bundler/deployer/helm/helm.go @@ -57,6 +57,7 @@ type ComponentData struct { Tag string // Git ref for Kustomize-typed components (tag/branch/commit) Path string // Path within the repository to the kustomization SourceFile string // Path to static YAML file for Direct components + Olm bool // True for OLM operator components (enables CSV wait logic in install.sh) } // compile-time interface check @@ -265,6 +266,7 @@ func (g *Generator) buildComponentDataList() ([]ComponentData, error) { Tag: ref.Tag, Path: ref.Path, SourceFile: ref.SourceFile, + Olm: ref.Olm, }) } @@ -292,6 +294,7 @@ func toLocalformatComponents( Tag: c.Tag, Path: c.Path, SourceFile: c.SourceFile, + Olm: c.Olm, Values: values[c.Name], DynamicPaths: dynamic[c.Name], }) diff --git a/pkg/bundler/deployer/localformat/direct.go b/pkg/bundler/deployer/localformat/direct.go index 1daf9d51f..c2a1e2054 100644 --- a/pkg/bundler/deployer/localformat/direct.go +++ b/pkg/bundler/deployer/localformat/direct.go @@ -80,10 +80,12 @@ func writeDirectFolder(outputDir, dir string, idx int, c Component) (Folder, err ComponentName string Namespace string ManifestFilename string + Olm bool }{ ComponentName: c.Name, Namespace: c.Namespace, ManifestFilename: manifestFilename, + Olm: c.Olm, } if err := renderTemplateToFile(tmpl, data, folderDir, "install.sh", 0o755); err != nil { diff --git a/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl b/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl index ffc17867b..a3ec401b1 100644 --- a/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl +++ b/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl @@ -24,3 +24,27 @@ kubectl create namespace {{ .Namespace }} ${DRY_RUN_FLAG:-} ${KUBECONFIG_FLAG:-} {{- end }} # Apply the static YAML manifest using kubectl kubectl apply -f {{ .ManifestFilename }} {{- if .Namespace }} -n {{ .Namespace }}{{- end }} ${DRY_RUN_FLAG:-} ${KUBECONFIG_FLAG:-} +{{- if .Olm }} + +# Wait for ClusterServiceVersion to reach Succeeded phase +echo "Waiting for OLM operator CSV to reach Succeeded phase..." +TIMEOUT=300 +INTERVAL=5 +ELAPSED=0 + +while [ $ELAPSED -lt $TIMEOUT ]; do + CSV_PHASE=$(kubectl get csv -n {{ .Namespace }} -o jsonpath='{.items[0].status.phase}' ${KUBECONFIG_FLAG:-} 2>/dev/null || echo "") + + if [ "$CSV_PHASE" = "Succeeded" ]; then + echo "CSV reached Succeeded phase" + exit 0 + fi + + echo "CSV phase: ${CSV_PHASE:-}, waiting... (${ELAPSED}s/${TIMEOUT}s)" + sleep $INTERVAL + ELAPSED=$((ELAPSED + INTERVAL)) +done + +echo "ERROR: Timeout waiting for CSV to reach Succeeded phase after ${TIMEOUT}s" +exit 1 +{{- end }} diff --git a/pkg/bundler/deployer/localformat/templates/uninstall-direct.sh.tmpl b/pkg/bundler/deployer/localformat/templates/uninstall-direct.sh.tmpl index 879b1cab2..e2dc9936e 100644 --- a/pkg/bundler/deployer/localformat/templates/uninstall-direct.sh.tmpl +++ b/pkg/bundler/deployer/localformat/templates/uninstall-direct.sh.tmpl @@ -17,5 +17,15 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "${SCRIPT_DIR}" +{{- if .Olm }} +# For OLM-managed operators, delete the ClusterServiceVersion (CSV) first +# This removes the actual operator deployment before removing the Subscription +csv_name=$(kubectl get csv -n {{ .Namespace }} -o name 2>/dev/null | head -1) +if [[ -n "${csv_name}" ]]; then + echo "Deleting ClusterServiceVersion: ${csv_name}" + kubectl delete "${csv_name}" -n {{ .Namespace }} --ignore-not-found --timeout="${HELM_TIMEOUT:-120}s" || true +fi + +{{- end }} # Delete the static YAML manifest using kubectl kubectl delete -f {{ .ManifestFilename }} {{- if .Namespace }} -n {{ .Namespace }}{{- end }} ${DRY_RUN_FLAG:-} ${KUBECONFIG_FLAG:-} --ignore-not-found --timeout="${HELM_TIMEOUT:-120}s" || true diff --git a/pkg/bundler/deployer/localformat/writer.go b/pkg/bundler/deployer/localformat/writer.go index 35852d54a..547dfd9a9 100644 --- a/pkg/bundler/deployer/localformat/writer.go +++ b/pkg/bundler/deployer/localformat/writer.go @@ -43,6 +43,7 @@ type Component struct { Path string // Direct (static YAML manifests) SourceFile string // path to static YAML file for Direct components + Olm bool // true for OLM operator components (enables CSV wait logic in install.sh) // Values hydrated by the component bundler Values map[string]any DynamicPaths []string // paths moved from values.yaml into cluster-values.yaml diff --git a/pkg/recipe/components.go b/pkg/recipe/components.go index b6a42b767..108210497 100644 --- a/pkg/recipe/components.go +++ b/pkg/recipe/components.go @@ -47,12 +47,19 @@ type ComponentConfig struct { // Example: ["gpuoperator"] allows --set gpuoperator:key=value ValueOverrideKeys []string `yaml:"valueOverrideKeys,omitempty"` + // Olm indicates this component installs an operator via OLM. + // When true, the bundler embeds CSV wait logic in install.sh. + Olm bool `yaml:"olm,omitempty"` + // Helm contains default Helm chart settings. Helm HelmConfig `yaml:"helm,omitempty"` // Kustomize contains default Kustomize settings. Kustomize KustomizeConfig `yaml:"kustomize,omitempty"` + // Direct contains settings for Direct deployment type. + Direct DirectConfig `yaml:"direct,omitempty"` + // NodeScheduling defines paths for injecting node selectors and tolerations. NodeScheduling NodeSchedulingConfig `yaml:"nodeScheduling,omitempty"` @@ -104,6 +111,12 @@ type KustomizeConfig struct { DefaultTag string `yaml:"defaultTag,omitempty"` } +// DirectConfig contains settings for Direct deployment type. +type DirectConfig struct { + // SourceFile is the path to the static YAML file (relative to data directory). + SourceFile string `yaml:"sourceFile,omitempty"` +} + // NodeSchedulingConfig defines paths for node scheduling injection. type NodeSchedulingConfig struct { // System defines paths for system component scheduling. @@ -302,19 +315,46 @@ func (r *ComponentRegistry) Validate() []error { } } - // Check for mutually exclusive helm/kustomize configuration + // Check for mutually exclusive helm/kustomize/direct configuration + // Exactly one deployment type must be configured per component for i, comp := range r.Components { - hasHelm := comp.Helm.DefaultRepository != "" || comp.Helm.DefaultChart != "" - hasKustomize := comp.Kustomize.DefaultSource != "" - - if hasHelm && hasKustomize { - errs = append(errs, errors.New(errors.ErrCodeInvalidRequest, fmt.Sprintf("component[%d] (%s): cannot have both helm and kustomize configuration", i, comp.Name))) + if err := validateDeploymentTypeExclusion(comp, i); err != nil { + errs = append(errs, err) } } return errs } +// validateDeploymentTypeExclusion ensures exactly one deployment type is configured. +// Returns an error if zero or multiple deployment types are set. +func validateDeploymentTypeExclusion(comp ComponentConfig, index int) error { + hasHelm := comp.Helm.DefaultRepository != "" || comp.Helm.DefaultChart != "" || comp.Helm.DefaultNamespace != "" + hasKustomize := comp.Kustomize.DefaultSource != "" + hasDirect := comp.Direct.SourceFile != "" + + configCount := 0 + if hasHelm { + configCount++ + } + if hasKustomize { + configCount++ + } + if hasDirect { + configCount++ + } + + if configCount != 1 { + if configCount == 0 { + return errors.New(errors.ErrCodeInvalidRequest, + fmt.Sprintf("component[%d] (%s): must have exactly one deployment type configuration (helm, kustomize, or direct)", index, comp.Name)) + } + return errors.New(errors.ErrCodeInvalidRequest, + fmt.Sprintf("component[%d] (%s): cannot have multiple deployment type configurations (helm, kustomize, direct) - found %d", index, comp.Name, configCount)) + } + return nil +} + // GetSystemNodeSelectorPaths returns all system node selector paths for a component. func (c *ComponentConfig) GetSystemNodeSelectorPaths() []string { if c == nil { @@ -388,12 +428,16 @@ func (c *ComponentConfig) GetValidations() []ComponentValidationConfig { } // GetType returns the component deployment type based on which config is present. -// Returns ComponentTypeKustomize if Kustomize.DefaultSource is set, +// Returns ComponentTypeDirect if Direct.SourceFile is set, +// ComponentTypeKustomize if Kustomize.DefaultSource is set, // otherwise returns ComponentTypeHelm (the default). func (c *ComponentConfig) GetType() ComponentType { if c == nil { return ComponentTypeHelm } + if c.Direct.SourceFile != "" { + return ComponentTypeDirect + } if c.Kustomize.DefaultSource != "" { return ComponentTypeKustomize } diff --git a/pkg/recipe/components_test.go b/pkg/recipe/components_test.go index b3135e39d..9ce195deb 100644 --- a/pkg/recipe/components_test.go +++ b/pkg/recipe/components_test.go @@ -566,8 +566,18 @@ func TestComponentRegistry_Validate_EdgeCases(t *testing.T) { t.Run("valid registry passes", func(t *testing.T) { registry := &ComponentRegistry{ Components: []ComponentConfig{ - {Name: "comp1", DisplayName: "Comp 1", ValueOverrideKeys: []string{"c1"}}, - {Name: "comp2", DisplayName: "Comp 2", ValueOverrideKeys: []string{"c2"}}, + { + Name: "comp1", + DisplayName: "Comp 1", + ValueOverrideKeys: []string{"c1"}, + Helm: HelmConfig{DefaultChart: "comp1"}, + }, + { + Name: "comp2", + DisplayName: "Comp 2", + ValueOverrideKeys: []string{"c2"}, + Helm: HelmConfig{DefaultChart: "comp2"}, + }, }, } errs := registry.Validate() @@ -688,7 +698,7 @@ func TestComponentRegistry_Validate_MutuallyExclusiveHelmKustomize(t *testing.T) } errs := registry.Validate() for _, e := range errs { - if strings.Contains(e.Error(), "both helm and kustomize") { + if strings.Contains(e.Error(), "multiple deployment type configurations") { t.Errorf("unexpected error for kustomize-only component: %v", e) } } @@ -713,13 +723,84 @@ func TestComponentRegistry_Validate_MutuallyExclusiveHelmKustomize(t *testing.T) errs := registry.Validate() found := false for _, e := range errs { - if strings.Contains(e.Error(), "both helm and kustomize") { + if strings.Contains(e.Error(), "multiple deployment type configurations") { + found = true + break + } + } + if !found { + t.Error("expected error about multiple deployment type configurations") + } + }) + + t.Run("no deployment type is invalid", func(t *testing.T) { + registry := &ComponentRegistry{ + Components: []ComponentConfig{ + { + Name: "test-none", + DisplayName: "Test None", + // No helm, kustomize, or direct configuration + }, + }, + } + errs := registry.Validate() + found := false + for _, e := range errs { + if strings.Contains(e.Error(), "must have exactly one deployment type configuration") { + found = true + break + } + } + if !found { + t.Error("expected error about missing deployment type configuration") + } + }) + + t.Run("direct only is valid", func(t *testing.T) { + registry := &ComponentRegistry{ + Components: []ComponentConfig{ + { + Name: "test-direct", + DisplayName: "Test Direct", + Direct: DirectConfig{ + SourceFile: "components/test/direct/olm.yaml", + }, + }, + }, + } + errs := registry.Validate() + for _, e := range errs { + if strings.Contains(e.Error(), "multiple deployment type configurations") { + t.Errorf("unexpected error for direct-only component: %v", e) + } + } + }) + + t.Run("direct and helm is invalid", func(t *testing.T) { + registry := &ComponentRegistry{ + Components: []ComponentConfig{ + { + Name: "test-direct-helm", + DisplayName: "Test Direct and Helm", + Helm: HelmConfig{ + DefaultChart: "example/chart", + }, + Direct: DirectConfig{ + SourceFile: "components/test/direct/olm.yaml", + }, + }, + }, + } + errs := registry.Validate() + found := false + for _, e := range errs { + if strings.Contains(e.Error(), "multiple deployment type configurations") { found = true break } } if !found { - t.Error("expected error about both helm and kustomize configuration") + t.Error("expected error about multiple deployment type configurations") } }) } diff --git a/pkg/recipe/metadata.go b/pkg/recipe/metadata.go index 00b302b0b..2b33b05a9 100644 --- a/pkg/recipe/metadata.go +++ b/pkg/recipe/metadata.go @@ -93,6 +93,10 @@ type ComponentRef struct { // Used by Direct deployment type to specify static Kubernetes manifests. SourceFile string `json:"sourceFile,omitempty" yaml:"sourceFile,omitempty"` + // Olm indicates this component installs an operator via OLM. + // When true, the bundler embeds CSV wait logic in install.sh for Direct deployments. + Olm bool `json:"olm,omitempty" yaml:"olm,omitempty"` + // Overrides contains inline values that override those from ValuesFile. // Merge order: base values → ValuesFile → Overrides (highest precedence). Overrides map[string]any `json:"overrides,omitempty" yaml:"overrides,omitempty"` @@ -244,8 +248,15 @@ func (ref *ComponentRef) ApplyRegistryDefaults(config *ComponentConfig) { ref.Path = config.Kustomize.DefaultPath } case ComponentTypeDirect: - // Direct components use static YAML manifests embedded in AICR. - // No defaults to apply - sourceFile comes from registry config. + // Apply Direct defaults + if ref.SourceFile == "" && config.Direct.SourceFile != "" { + ref.SourceFile = config.Direct.SourceFile + } + } + + // Apply Olm flag for all types (but only used for Direct deployments) + if !ref.Olm && config.Olm { + ref.Olm = config.Olm } // NOTE: healthCheck.assertFile content is intentionally NOT loaded here. @@ -537,6 +548,7 @@ func (s *RecipeMetadataSpec) Merge(other *RecipeMetadataSpec) { // mergeComponentRef merges overlay into base, with overlay taking precedence // for non-empty fields. Empty/zero fields in overlay inherit from base. +// When the overlay changes the Type, type-incompatible fields are cleared. func mergeComponentRef(base, overlay ComponentRef) ComponentRef { result := base // Start with base values @@ -551,32 +563,61 @@ func mergeComponentRef(base, overlay ComponentRef) ComponentRef { } // Type: overlay takes precedence if set - if overlay.Type != "" { + // If type changes, clear incompatible fields + if overlay.Type != "" && overlay.Type != base.Type { + result.Type = overlay.Type + + // Clear type-incompatible fields when type changes + switch overlay.Type { + case ComponentTypeDirect: + // Clear Helm/Kustomize-specific fields + result.Source = "" + result.Version = "" + result.Chart = "" + result.ValuesFile = "" + result.Tag = "" + result.Path = "" + result.Patches = nil + case ComponentTypeHelm: + // Clear Direct/Kustomize-specific fields + result.SourceFile = "" + result.Tag = "" + result.Path = "" + result.Patches = nil + case ComponentTypeKustomize: + // Clear Direct/Helm-specific fields + result.SourceFile = "" + result.Version = "" + result.Chart = "" + result.ValuesFile = "" + } + } else if overlay.Type != "" { result.Type = overlay.Type } - // Source: overlay takes precedence if set - if overlay.Source != "" { + // Only merge type-appropriate fields (unless type just changed and fields were already cleared) + // Source: overlay takes precedence if set (Helm/Kustomize) + if overlay.Source != "" && result.Type != ComponentTypeDirect { result.Source = overlay.Source } - // Version: overlay takes precedence if set - if overlay.Version != "" { + // Version: overlay takes precedence if set (Helm) + if overlay.Version != "" && result.Type == ComponentTypeHelm { result.Version = overlay.Version } - // Tag: overlay takes precedence if set - if overlay.Tag != "" { + // Tag: overlay takes precedence if set (Kustomize) + if overlay.Tag != "" && result.Type == ComponentTypeKustomize { result.Tag = overlay.Tag } - // ValuesFile: overlay takes precedence if set - if overlay.ValuesFile != "" { + // ValuesFile: overlay takes precedence if set (Helm) + if overlay.ValuesFile != "" && result.Type == ComponentTypeHelm { result.ValuesFile = overlay.ValuesFile } - // SourceFile: overlay takes precedence if set - if overlay.SourceFile != "" { + // SourceFile: overlay takes precedence if set (Direct) + if overlay.SourceFile != "" && result.Type == ComponentTypeDirect { result.SourceFile = overlay.SourceFile } @@ -588,8 +629,8 @@ func mergeComponentRef(base, overlay ComponentRef) ComponentRef { deepMergeMap(result.Overrides, overlay.Overrides) } - // Patches: overlay replaces if set - if len(overlay.Patches) > 0 { + // Patches: overlay replaces if set (Kustomize only) + if len(overlay.Patches) > 0 && result.Type == ComponentTypeKustomize { result.Patches = overlay.Patches } @@ -621,8 +662,8 @@ func mergeComponentRef(base, overlay ComponentRef) ComponentRef { } } - // Path: overlay takes precedence if set (for Kustomize) - if overlay.Path != "" { + // Path: overlay takes precedence if set (Kustomize only) + if overlay.Path != "" && result.Type == ComponentTypeKustomize { result.Path = overlay.Path } diff --git a/pkg/recipe/metadata_test.go b/pkg/recipe/metadata_test.go index 24e3caf33..332bd3b6e 100644 --- a/pkg/recipe/metadata_test.go +++ b/pkg/recipe/metadata_test.go @@ -422,6 +422,7 @@ func TestMergeComponentRef_AdvancedFields(t *testing.T) { t.Run("patches replaced by overlay", func(t *testing.T) { base := ComponentRef{ Name: "test", + Type: ComponentTypeKustomize, Patches: []string{"base-patch.yaml"}, } overlay := ComponentRef{ @@ -514,7 +515,7 @@ func TestMergeComponentRef_AdvancedFields(t *testing.T) { }) t.Run("tag from overlay", func(t *testing.T) { - base := ComponentRef{Name: "test", Tag: "v1.0"} + base := ComponentRef{Name: "test", Type: ComponentTypeKustomize, Tag: "v1.0"} overlay := ComponentRef{Name: "test", Tag: "v2.0"} result := mergeComponentRef(base, overlay) if result.Tag != "v2.0" { diff --git a/recipes/components/gpu-operator-olm/direct/olm.yaml b/recipes/components/gpu-operator-olm/direct/olm.yaml index ca7f0f206..34fb940c8 100644 --- a/recipes/components/gpu-operator-olm/direct/olm.yaml +++ b/recipes/components/gpu-operator-olm/direct/olm.yaml @@ -16,39 +16,18 @@ # These manifests install the NVIDIA GPU Operator via OpenShift's Operator Lifecycle Manager (OLM). # Reference: https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/install-gpu-ocp.html --- -apiVersion: v1 -kind: Namespace -metadata: - name: nvidia-gpu-operator ---- apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: name: nvidia-gpu-operator-group - namespace: nvidia-gpu-operator spec: targetNamespaces: - nvidia-gpu-operator --- apiVersion: operators.coreos.com/v1alpha1 -kind: CatalogSource -metadata: - name: certified-operators - namespace: openshift-marketplace -spec: - sourceType: grpc - image: registry.redhat.io/redhat/certified-operator-index:v4.14 - displayName: Certified Operators - publisher: Red Hat - updateStrategy: - registryPoll: - interval: 10m ---- -apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: name: gpu-operator-certified - namespace: nvidia-gpu-operator spec: channel: stable name: gpu-operator-certified diff --git a/recipes/components/gpu-operator/direct/clusterpolicy.yaml b/recipes/components/gpu-operator/direct/clusterpolicy.yaml index 0936e506c..f70daa461 100644 --- a/recipes/components/gpu-operator/direct/clusterpolicy.yaml +++ b/recipes/components/gpu-operator/direct/clusterpolicy.yaml @@ -19,33 +19,40 @@ apiVersion: nvidia.com/v1 kind: ClusterPolicy metadata: - name: gpu-cluster-policy + name: cluster-policy spec: operator: - # OpenShift uses CRI-O as the default container runtime defaultRuntime: crio - driver: enabled: true - # Driver version for OpenShift / RHEL CoreOS - version: "550.54.15" - - toolkit: + rdma: + enabled: true + useHostMofed: false + cdi: enabled: true - - devicePlugin: + default: true + dcgm: + enabled: true + gfd: enabled: true - - # DCGM exporter for Prometheus metrics dcgmExporter: enabled: true - config: - name: "" - - # GPU Feature Discovery - gfd: + serviceMonitor: + enabled: true + devicePlugin: enabled: true - - # Node Feature Discovery - disabled because it's deployed as a standalone component - nfd: + toolkit: + enabled: true + installDir: /usr/local/nvidia + gdrcopy: enabled: false + sandboxWorkloads: + enabled: false + nodeStatusExporter: + enabled: true + daemonsets: + tolerations: + - operator: Exists + rollingUpdate: + maxUnavailable: '1' + updateStrategy: RollingUpdate \ No newline at end of file diff --git a/recipes/components/nfd-olm/direct/olm.yaml b/recipes/components/nfd-olm/direct/olm.yaml index 916fbbddb..b801ea398 100644 --- a/recipes/components/nfd-olm/direct/olm.yaml +++ b/recipes/components/nfd-olm/direct/olm.yaml @@ -16,11 +16,6 @@ # These manifests install NFD via OpenShift's Operator Lifecycle Manager (OLM). # Reference: https://docs.openshift.com/container-platform/latest/hardware_enablement/psap-node-feature-discovery-operator.html --- -apiVersion: v1 -kind: Namespace -metadata: - name: openshift-nfd ---- apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: diff --git a/recipes/components/nfd/direct/nodefeaturediscovery.yaml b/recipes/components/nfd/direct/nodefeaturediscovery.yaml index f5ca307d4..3e16edbe1 100644 --- a/recipes/components/nfd/direct/nodefeaturediscovery.yaml +++ b/recipes/components/nfd/direct/nodefeaturediscovery.yaml @@ -20,11 +20,16 @@ apiVersion: nfd.openshift.io/v1 kind: NodeFeatureDiscovery metadata: name: nfd-instance - namespace: openshift-nfd spec: + instance: '' operand: - image: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.14 - imagePullPolicy: Always + # image: to get the image run and copy here (if nul leave empty): + # oc get csv -n openshift-nfd -o json | \ + # jq -r '.items[0].metadata.annotations["alm-examples"]' | tr -d '\000-\037' | \ + # jq -r '.[] | select(.kind == "NodeFeatureDiscovery") | .spec.operand.image' + servicePort: 12000 + prunerOnDelete: false + topologyUpdater: false workerConfig: configData: | core: @@ -32,8 +37,10 @@ spec: sources: pci: deviceClassWhitelist: - - "0200" + - "02" - "03" + - "0200" + - "0207" - "12" deviceLabelFields: - "vendor" diff --git a/recipes/registry.yaml b/recipes/registry.yaml index ca31b06e2..e9ef7b346 100644 --- a/recipes/registry.yaml +++ b/recipes/registry.yaml @@ -130,6 +130,7 @@ components: helm: # Manifest-only component - no external Helm chart, uses manifestFiles defaultRepository: "" + defaultChart: "" defaultNamespace: kube-system - name: aws-efa @@ -209,6 +210,7 @@ components: helm: # Manifest-only component - no external Helm chart, uses manifestFiles defaultRepository: "" + defaultChart: "" defaultNamespace: skyhook nodeScheduling: accelerated: @@ -525,6 +527,7 @@ components: displayName: GPU Operator (OLM) valueOverrideKeys: - gpuoperatorolm + olm: true direct: sourceFile: recipes/components/gpu-operator-olm/direct/olm.yaml @@ -532,5 +535,6 @@ components: displayName: Node Feature Discovery (OLM) valueOverrideKeys: - nfdolm + olm: true direct: sourceFile: recipes/components/nfd-olm/direct/olm.yaml diff --git a/site/.vitepress/config.ts b/site/.vitepress/config.ts index bad3413a5..3d3f637fe 100644 --- a/site/.vitepress/config.ts +++ b/site/.vitepress/config.ts @@ -86,6 +86,7 @@ export default withMermaid( { text: 'AKS GPU Setup', link: '/docs/integrator/aks-gpu-setup' }, { text: 'EKS Dynamo Networking', link: '/docs/integrator/eks-dynamo-networking' }, { text: 'GKE TCPXO Networking', link: '/docs/integrator/gke-tcpxo-networking' }, + { text: 'OpenShift Deployment', link: '/docs/integrator/openshift' }, { text: 'Recipe Development', link: '/docs/integrator/recipe-development' }, { text: 'Validator Extension', link: '/docs/integrator/validator-extension' }, ], From 45a17eaf09dcddd987aa678631ee679368480e82 Mon Sep 17 00:00:00 2001 From: Shai Kapon Date: Tue, 12 May 2026 19:23:40 +0300 Subject: [PATCH 6/8] feat(bundler): add test coverage and improve Direct deployment type (#566) - Add unit tests for Direct deployment type (coverage improved from 68.2% to 73.3%) - Add OLM timeout constants to pkg/defaults with env var override capability (AICR_OLM_CSV_TIMEOUT, AICR_OLM_CSV_INTERVAL) - Update Direct deployment install scripts to use centralized timeout constants - Fix deploy.sh status message to reflect actual deployment method (OLM vs Helm vs kubectl) - Remove redundant empty source field from Direct component recipe YAML output - Add test coverage for OCP service type parsing Signed-off-by: Shai Kapon --- .../deployer/helm/templates/deploy.sh.tmpl | 20 +- pkg/bundler/deployer/localformat/direct.go | 21 +- .../templates/install-direct.sh.tmpl | 4 +- .../deployer/localformat/writer_test.go | 216 ++++++++++++++++++ pkg/defaults/timeouts.go | 17 ++ pkg/recipe/criteria_test.go | 2 + pkg/recipe/metadata.go | 6 +- 7 files changed, 273 insertions(+), 13 deletions(-) diff --git a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl index 88c5006ea..000ec43d5 100644 --- a/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/deploy.sh.tmpl @@ -377,7 +377,25 @@ else echo "Deployment complete." fi echo -echo "NOTE: The above status reflects Helm install and manifest apply results," +{{- $hasHelm := false }} +{{- $hasDirect := false }} +{{- $hasOLM := false }} +{{- range .Components }} + {{- if .Repository }}{{ $hasHelm = true }}{{ end }} + {{- if .SourceFile }}{{ $hasDirect = true }}{{ end }} + {{- if .Olm }}{{ $hasOLM = true }}{{ end }} +{{- end }} +{{- if and $hasDirect (not $hasHelm) }} + {{- if $hasOLM }} +echo "NOTE: The above status reflects OLM operator installation and kubectl apply results," + {{- else }} +echo "NOTE: The above status reflects kubectl apply results," + {{- end }} +{{- else if and $hasHelm (not $hasDirect) }} +echo "NOTE: The above status reflects Helm install results," +{{- else }} +echo "NOTE: The above status reflects Helm install and kubectl apply results," +{{- end }} echo "not whether the cluster is ready for GPU workloads. On fresh" echo "GPU nodes, cluster convergence may continue asynchronously after this" echo "script exits. Full workload readiness can take additional time for:" diff --git a/pkg/bundler/deployer/localformat/direct.go b/pkg/bundler/deployer/localformat/direct.go index c2a1e2054..e59f33ff8 100644 --- a/pkg/bundler/deployer/localformat/direct.go +++ b/pkg/bundler/deployer/localformat/direct.go @@ -23,6 +23,7 @@ import ( "text/template" "github.com/NVIDIA/aicr/pkg/bundler/deployer" + "github.com/NVIDIA/aicr/pkg/defaults" "github.com/NVIDIA/aicr/pkg/errors" "github.com/NVIDIA/aicr/pkg/recipe" ) @@ -77,15 +78,19 @@ func writeDirectFolder(outputDir, dir string, idx int, c Component) (Folder, err } data := struct { - ComponentName string - Namespace string - ManifestFilename string - Olm bool + ComponentName string + Namespace string + ManifestFilename string + Olm bool + CSVWaitTimeoutSeconds int + CSVPollIntervalSeconds int }{ - ComponentName: c.Name, - Namespace: c.Namespace, - ManifestFilename: manifestFilename, - Olm: c.Olm, + ComponentName: c.Name, + Namespace: c.Namespace, + ManifestFilename: manifestFilename, + Olm: c.Olm, + CSVWaitTimeoutSeconds: int(defaults.OLMCSVWaitTimeout.Seconds()), + CSVPollIntervalSeconds: int(defaults.OLMCSVPollInterval.Seconds()), } if err := renderTemplateToFile(tmpl, data, folderDir, "install.sh", 0o755); err != nil { diff --git a/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl b/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl index a3ec401b1..4de881531 100644 --- a/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl +++ b/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl @@ -28,8 +28,8 @@ kubectl apply -f {{ .ManifestFilename }} {{- if .Namespace }} -n {{ .Namespace } # Wait for ClusterServiceVersion to reach Succeeded phase echo "Waiting for OLM operator CSV to reach Succeeded phase..." -TIMEOUT=300 -INTERVAL=5 +TIMEOUT=${AICR_OLM_CSV_TIMEOUT:-{{ .CSVWaitTimeoutSeconds }}} +INTERVAL=${AICR_OLM_CSV_INTERVAL:-{{ .CSVPollIntervalSeconds }}} ELAPSED=0 while [ $ELAPSED -lt $TIMEOUT ]; do diff --git a/pkg/bundler/deployer/localformat/writer_test.go b/pkg/bundler/deployer/localformat/writer_test.go index 5abfe3299..c286d254b 100644 --- a/pkg/bundler/deployer/localformat/writer_test.go +++ b/pkg/bundler/deployer/localformat/writer_test.go @@ -327,6 +327,222 @@ func TestWrite_Kustomize(t *testing.T) { } } +func TestWrite_DirectWithRealFiles(t *testing.T) { + // This test uses actual embedded files from the recipes directory + // Testing with real gpu-operator-olm OLM manifest + outDir := t.TempDir() + + res, err := localformat.Write(context.Background(), localformat.Options{ + OutputDir: outDir, + Components: []localformat.Component{{ + Name: "gpu-operator-olm", + Namespace: "nvidia-gpu-operator", + SourceFile: "recipes/components/gpu-operator-olm/direct/olm.yaml", + Olm: true, + }}, + }) + folders := res.Folders + if err != nil { + t.Fatalf("Write: %v", err) + } + + if len(folders) != 1 { + t.Fatalf("want 1 folder, got %d", len(folders)) + } + if got, want := folders[0].Dir, "001-gpu-operator-olm"; got != want { + t.Errorf("folders[0].Dir = %q, want %q", got, want) + } + if got, want := folders[0].Kind, localformat.KindDirect; got != want { + t.Errorf("folders[0].Kind = %v, want %v", got, want) + } + + // Verify files written on disk + for _, rel := range []string{"install.sh", "uninstall.sh", "olm.yaml"} { + path := filepath.Join(outDir, "001-gpu-operator-olm", rel) + if _, err := os.Stat(path); err != nil { + t.Errorf("missing file %s: %v", rel, err) + } + } + + // Verify install.sh contains OLM CSV wait logic + installPath := filepath.Join(outDir, "001-gpu-operator-olm", "install.sh") + installContent, err := os.ReadFile(installPath) + if err != nil { + t.Fatalf("read install.sh: %v", err) + } + expectedStrings := []string{ + "kubectl apply", + "olm.yaml", + "ClusterServiceVersion", + "CSV reached Succeeded phase", + "TIMEOUT=", + "-n nvidia-gpu-operator", + } + for _, expected := range expectedStrings { + if !strings.Contains(string(installContent), expected) { + t.Errorf("OLM install.sh missing %q", expected) + } + } + + // Verify uninstall.sh contains CSV deletion logic + uninstallPath := filepath.Join(outDir, "001-gpu-operator-olm", "uninstall.sh") + uninstallContent, err := os.ReadFile(uninstallPath) + if err != nil { + t.Fatalf("read uninstall.sh: %v", err) + } + if !strings.Contains(string(uninstallContent), "kubectl get csv") { + t.Errorf("OLM uninstall.sh should contain CSV deletion logic") + } + if !strings.Contains(string(uninstallContent), "kubectl delete") { + t.Errorf("uninstall.sh should contain kubectl delete") + } + + // Verify install.sh is executable + installInfo, err := os.Stat(installPath) + if err != nil { + t.Fatalf("stat install.sh: %v", err) + } + if installInfo.Mode()&0o111 == 0 { + t.Errorf("install.sh should be executable, got mode %v", installInfo.Mode()) + } + + // Verify manifest content was copied from embedded FS + manifestPath := filepath.Join(outDir, "001-gpu-operator-olm", "olm.yaml") + manifestContent, err := os.ReadFile(manifestPath) + if err != nil { + t.Fatalf("read olm.yaml: %v", err) + } + // Verify it contains expected OLM resources + manifestStr := string(manifestContent) + if !strings.Contains(manifestStr, "kind: OperatorGroup") { + t.Errorf("olm.yaml should contain OperatorGroup") + } + if !strings.Contains(manifestStr, "kind: Subscription") { + t.Errorf("olm.yaml should contain Subscription") + } +} + +func TestWrite_DirectNonOLM(t *testing.T) { + // Test Direct deployment without OLM (no CSV wait logic) + outDir := t.TempDir() + + res, err := localformat.Write(context.Background(), localformat.Options{ + OutputDir: outDir, + Components: []localformat.Component{{ + Name: "gpu-operator", + Namespace: "nvidia-gpu-operator", + SourceFile: "recipes/components/gpu-operator/direct/clusterpolicy.yaml", + Olm: false, // Not an OLM component + }}, + }) + folders := res.Folders + if err != nil { + t.Fatalf("Write: %v", err) + } + + if len(folders) != 1 { + t.Fatalf("want 1 folder, got %d", len(folders)) + } + + // Verify install.sh does NOT contain CSV wait logic (not OLM) + installPath := filepath.Join(outDir, "001-gpu-operator", "install.sh") + installContent, err := os.ReadFile(installPath) + if err != nil { + t.Fatalf("read install.sh: %v", err) + } + + // Should have kubectl apply + if !strings.Contains(string(installContent), "kubectl apply") { + t.Errorf("install.sh should contain kubectl apply") + } + if !strings.Contains(string(installContent), "clusterpolicy.yaml") { + t.Errorf("install.sh should reference clusterpolicy.yaml") + } + + // Should NOT have CSV wait logic (not OLM) + if strings.Contains(string(installContent), "ClusterServiceVersion") { + t.Errorf("non-OLM install.sh should not contain CSV wait logic") + } + if strings.Contains(string(installContent), "CSV reached Succeeded") { + t.Errorf("non-OLM install.sh should not wait for CSV") + } +} + +func TestWrite_DirectMissingSourceFile(t *testing.T) { + outDir := t.TempDir() + + _, err := localformat.Write(context.Background(), localformat.Options{ + OutputDir: outDir, + Components: []localformat.Component{{ + Name: "missing-file", + Namespace: "test-ns", + SourceFile: "recipes/components/nonexistent/direct/manifest.yaml", + Olm: false, + }}, + }) + + if err == nil { + t.Fatal("Write should fail when source file doesn't exist") + } + if !stderrors.Is(err, errors.New(errors.ErrCodeInternal, "")) { + t.Errorf("expected ErrCodeInternal, got %v", err) + } + if !strings.Contains(err.Error(), "failed to read source file") { + t.Errorf("error should mention failed to read source file, got: %v", err) + } +} + +func TestWrite_DirectOLMTimeoutConstants(t *testing.T) { + // Verify that timeout constants from pkg/defaults are properly + // rendered into the install.sh script with env var override capability + outDir := t.TempDir() + + res, err := localformat.Write(context.Background(), localformat.Options{ + OutputDir: outDir, + Components: []localformat.Component{{ + Name: "gpu-operator-olm", + Namespace: "nvidia-gpu-operator", + SourceFile: "recipes/components/gpu-operator-olm/direct/olm.yaml", + Olm: true, + }}, + }) + if err != nil { + t.Fatalf("Write: %v", err) + } + if len(res.Folders) != 1 { + t.Fatalf("want 1 folder, got %d", len(res.Folders)) + } + + // Read the generated install.sh + installPath := filepath.Join(outDir, "001-gpu-operator-olm", "install.sh") + installContent, err := os.ReadFile(installPath) + if err != nil { + t.Fatalf("read install.sh: %v", err) + } + + script := string(installContent) + + // Verify timeout uses env var with default from pkg/defaults (300 seconds) + if !strings.Contains(script, "TIMEOUT=${AICR_OLM_CSV_TIMEOUT:-300}") { + t.Error("install.sh should contain TIMEOUT=${AICR_OLM_CSV_TIMEOUT:-300}") + } + + // Verify interval uses env var with default from pkg/defaults (5 seconds) + if !strings.Contains(script, "INTERVAL=${AICR_OLM_CSV_INTERVAL:-5}") { + t.Error("install.sh should contain INTERVAL=${AICR_OLM_CSV_INTERVAL:-5}") + } + + // Verify the timeout logic still exists + if !strings.Contains(script, "while [ $ELAPSED -lt $TIMEOUT ]") { + t.Error("install.sh should contain timeout loop logic") + } + + // Verify CSV wait logic + if !strings.Contains(script, "CSV reached Succeeded phase") { + t.Error("install.sh should contain CSV success message") + } +} + func TestWrite_Deterministic(t *testing.T) { kustomizePath, err := filepath.Abs(filepath.Join("testdata", "kustomize_input")) if err != nil { diff --git a/pkg/defaults/timeouts.go b/pkg/defaults/timeouts.go index 101d86bbb..c97932f62 100644 --- a/pkg/defaults/timeouts.go +++ b/pkg/defaults/timeouts.go @@ -105,6 +105,23 @@ const ( K8sPodTerminationWaitTimeout = 60 * time.Second ) +// OpenShift OLM deployment timeouts for Direct deployment type install scripts. +// These constants are rendered into bundle install.sh scripts and can be +// overridden at deployment time via AICR_OLM_CSV_TIMEOUT and +// AICR_OLM_CSV_INTERVAL environment variables. +const ( + // OLMCSVWaitTimeout is the maximum time to wait for an OLM + // ClusterServiceVersion to reach Succeeded phase during operator + // installation. Covers catalog resolution, operator image pull, + // and initial startup of the operator pod. + OLMCSVWaitTimeout = 300 * time.Second + + // OLMCSVPollInterval is the polling interval for checking CSV status + // during operator installation. Short enough to provide responsive + // feedback without overwhelming the API server. + OLMCSVPollInterval = 5 * time.Second +) + // HTTP client timeouts for outbound requests. const ( // HTTPClientTimeout is the default total timeout for HTTP requests. diff --git a/pkg/recipe/criteria_test.go b/pkg/recipe/criteria_test.go index b48d4d9d5..f1803aa46 100644 --- a/pkg/recipe/criteria_test.go +++ b/pkg/recipe/criteria_test.go @@ -37,6 +37,8 @@ func TestParseCriteriaServiceType(t *testing.T) { {"gke", "gke", CriteriaServiceGKE, false}, {"aks", "aks", CriteriaServiceAKS, false}, {"oke", "oke", CriteriaServiceOKE, false}, + {"ocp", "ocp", CriteriaServiceOCP, false}, + {"OCP uppercase", "OCP", CriteriaServiceOCP, false}, {"lke", "lke", CriteriaServiceLKE, false}, {"LKE uppercase", "LKE", CriteriaServiceLKE, false}, {"self-managed", "self-managed", CriteriaServiceAny, false}, diff --git a/pkg/recipe/metadata.go b/pkg/recipe/metadata.go index 2b33b05a9..8c096ee6c 100644 --- a/pkg/recipe/metadata.go +++ b/pkg/recipe/metadata.go @@ -74,11 +74,13 @@ type ComponentRef struct { // Chart is the Helm chart name (e.g., "gpu-operator"). Chart string `json:"chart,omitempty" yaml:"chart,omitempty"` - // Type is the deployment type (Helm, Kustomize). + // Type is the deployment type (Helm, Kustomize, Direct). Type ComponentType `json:"type" yaml:"type"` // Source is the repository URL or OCI reference. - Source string `json:"source" yaml:"source"` + // Used by Helm (repository URL) and Kustomize (git repository). + // Not used by Direct (which uses SourceFile instead). + Source string `json:"source,omitempty" yaml:"source,omitempty"` // Version is the chart/component version (for Helm). Version string `json:"version,omitempty" yaml:"version,omitempty"` From 04ed2a999be24e1db701473c3492706512c0d3f8 Mon Sep 17 00:00:00 2001 From: Shai Kapon Date: Tue, 12 May 2026 21:07:13 +0300 Subject: [PATCH 7/8] fix(gpu-operator): disable RDMA in OpenShift ClusterPolicy by default (#566) Signed-off-by: Shai Kapon --- ...07-openshift-integration.md => 009-openshift-integration.md} | 0 recipes/components/gpu-operator/direct/clusterpolicy.yaml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename docs/design/{007-openshift-integration.md => 009-openshift-integration.md} (100%) diff --git a/docs/design/007-openshift-integration.md b/docs/design/009-openshift-integration.md similarity index 100% rename from docs/design/007-openshift-integration.md rename to docs/design/009-openshift-integration.md diff --git a/recipes/components/gpu-operator/direct/clusterpolicy.yaml b/recipes/components/gpu-operator/direct/clusterpolicy.yaml index f70daa461..a9a412250 100644 --- a/recipes/components/gpu-operator/direct/clusterpolicy.yaml +++ b/recipes/components/gpu-operator/direct/clusterpolicy.yaml @@ -26,7 +26,7 @@ spec: driver: enabled: true rdma: - enabled: true + enabled: false useHostMofed: false cdi: enabled: true From 8e06e8862452106b8b8cd38bddaefefd6f4c2ae0 Mon Sep 17 00:00:00 2001 From: Shai Kapon Date: Wed, 13 May 2026 08:54:38 +0300 Subject: [PATCH 8/8] fix(bundler): address CodeRabbit review findings for OCP OLM integration (#566) - Fix OLM CSV wait logic to resolve CSV name from Subscription.status.installedCSV instead of non-deterministic .items[0] query, ensuring correct CSV polling in multi-operator namespaces - Fix OLM CSV deletion to enumerate all Subscriptions and delete only their associated CSVs rather than using unsafe head -1 selection - Add fail-fast validation in ArgoCD deployer to reject Direct components with clear error message instead of silently generating unusable artifacts - Fix Helm deployment type detection to include DefaultVersion field, preventing components with only version configured from being misclassified - Fix ComponentRef merge logic to clear Olm field when transitioning from Direct to Helm/Kustomize types and apply Olm overlay to Direct components - Add explicit metadata.namespace to all OLM and CR manifests (gpu-operator-olm, nfd-olm, nfd CR) for deterministic namespace targeting Signed-off-by: Shai Kapon --- docs/design/009-openshift-integration.md | 33 +++++--- docs/integrator/data-flow.md | 2 +- docs/integrator/openshift.md | 2 +- pkg/bundler/deployer/argocd/argocd.go | 11 ++- .../templates/install-direct.sh.tmpl | 17 +++- .../templates/uninstall-direct.sh.tmpl | 19 +++-- .../deployer/localformat/writer_test.go | 24 +++++- pkg/recipe/components.go | 5 +- pkg/recipe/components_test.go | 46 +++++++++++ pkg/recipe/metadata.go | 7 ++ pkg/recipe/metadata_test.go | 81 +++++++++++++++++++ .../gpu-operator-olm/direct/olm.yaml | 2 + .../nfd/direct/nodefeaturediscovery.yaml | 1 + 13 files changed, 221 insertions(+), 29 deletions(-) diff --git a/docs/design/009-openshift-integration.md b/docs/design/009-openshift-integration.md index b248ff344..3eec4e701 100644 --- a/docs/design/009-openshift-integration.md +++ b/docs/design/009-openshift-integration.md @@ -1,4 +1,4 @@ -# ADR-007: OpenShift Integration +# ADR-009: OpenShift Integration ## Status @@ -35,7 +35,7 @@ generated by OLM when a Subscription is created. ### Lifecycle Flow -``` +```text Subscription created → CSV resources applied → Operator deployed ``` @@ -63,6 +63,7 @@ Unlike Helm (which references external charts) or Kustomize (which references external repos), `direct` points to **static YAML manifests embedded in AICR**. **Registry entry:** + ```yaml - name: gpu-operator-olm displayName: GPU Operator (OLM) @@ -72,7 +73,8 @@ external repos), `direct` points to **static YAML manifests embedded in AICR**. ``` **File structure:** -``` + +```text recipes/components/ ├── gpu-operator-olm/ │ └── direct/ @@ -113,7 +115,8 @@ For `direct` components, the bundler: 5. No value merging, no templating on YAML — direct passthrough **Bundle output:** -``` + +```text bundles/ ├── gpu-operator-olm/ │ ├── olm.yaml # Copied from recipes/components/ @@ -131,6 +134,7 @@ bundler embeds CSV readiness polling directly into the install script for OLM components. **Registry configuration:** + ```yaml - name: gpu-operator-olm displayName: GPU Operator (OLM) @@ -160,6 +164,7 @@ For components without `olm: true`: - Exits 1 on timeout or error **Deployment flow:** + ```bash # Install OLM resources - waits for CSV ready ./bundles/gpu-operator-olm/install.sh # Applies olm.yaml + waits for CSV @@ -171,6 +176,7 @@ For components without `olm: true`: ### Example: GPU Operator on OpenShift **Registry entries:** + ```yaml # Operator installation via OLM - name: gpu-operator-olm @@ -191,6 +197,7 @@ For components without `olm: true`: **Component files:** `recipes/components/gpu-operator-olm/direct/olm.yaml`: + ```yaml --- apiVersion: operators.coreos.com/v1 @@ -212,6 +219,7 @@ spec: ``` `recipes/components/gpu-operator/direct/clusterpolicy.yaml`: + ```yaml --- apiVersion: nvidia.com/v1 @@ -229,7 +237,7 @@ spec: ### Workflow -``` +```text User: aicr bundle --service ocp --accelerator h100 --output ./bundles ↓ Recipe resolver selects overlay with gpu-operator-olm + gpu-operator @@ -294,14 +302,17 @@ Instead of introducing a new `direct` deployment type, generate temporary Helm charts from the static YAML files and reuse the existing Helm bundler flow. **Approach:** + 1. Component has static YAML in `recipes/components//direct/olm.yaml` 2. Bundler generates a minimal Helm chart structure on-the-fly: - ``` + + ```text temp-chart/ ├── Chart.yaml └── templates/ └── olm.yaml ``` + 3. Bundle output references this temporary chart 4. Deployment uses `helm install` instead of `kubectl apply` @@ -323,10 +334,12 @@ wait script templates that the bundler would template and include in the bundle. **Why rejected:** - **Limited to install.sh deployment** — Wait scripts only apply to the `install.sh` - reference implementation, which serves as an example deployment method. The primary - deployment target is ArgoCD, which handles synchronization and health checks through - its own mechanisms (resource hooks, sync waves, health assessments). Per-component - wait scripts would be unused in the ArgoCD flow. + reference implementation, which serves as an example deployment method. ArgoCD and + OCP ArgoCD deployer support is out of scope for this PR. While ArgoCD is a planned + target deployment mechanism (which would handle synchronization and health checks + through resource hooks, sync waves, and health assessments), per-component wait + script templates would add complexity that only benefits manual `install.sh` + deployments in the current implementation. - **Template maintenance overhead** — Each component needs its own wait.sh.tmpl file. For OLM components, all templates would implement identical CSV polling logic, leading to duplication across components. diff --git a/docs/integrator/data-flow.md b/docs/integrator/data-flow.md index c12194158..7655d5f7f 100644 --- a/docs/integrator/data-flow.md +++ b/docs/integrator/data-flow.md @@ -898,7 +898,7 @@ X-RateLimit-Reset: 1735650000 **Query Parameters:** - Type checking (string, int, bool) -- Enum validation (eks, gke, aks, ocp, etc.) +- Enum validation for `service`: `aks`, `eks`, `gke`, `kind`, `lke`, `ocp`, `oke` (rejects unknown values with HTTP 400 / `ErrCodeInvalidRequest`) - Version format validation (regex) - Range validation (if applicable) diff --git a/docs/integrator/openshift.md b/docs/integrator/openshift.md index bd7a17c12..0d0ccc638 100644 --- a/docs/integrator/openshift.md +++ b/docs/integrator/openshift.md @@ -44,7 +44,7 @@ This separation ensures the operator is fully installed and ready before applyin ### OLM Resource Flow -``` +```text User creates Subscription ↓ OLM creates ClusterServiceVersion (CSV) diff --git a/pkg/bundler/deployer/argocd/argocd.go b/pkg/bundler/deployer/argocd/argocd.go index 84ffa49a6..a8ce91a21 100644 --- a/pkg/bundler/deployer/argocd/argocd.go +++ b/pkg/bundler/deployer/argocd/argocd.go @@ -361,6 +361,13 @@ func (g *Generator) Generate(ctx context.Context, outputDir string) (*deployer.O fmt.Sprintf("localformat returned folder for unknown component %q", f.Parent)) } + if f.Kind == localformat.KindDirect { + return nil, errors.New( + errors.ErrCodeInvalidRequest, + fmt.Sprintf("component %q uses direct manifests, which are not supported by the argocd deployer; use --deployer helm", f.Parent), + ) + } + appData := buildApplicationData(*comp, f, i, repoURL, targetRevision) appDataList = append(appDataList, appData) @@ -538,10 +545,6 @@ func buildApplicationData(comp recipe.ComponentRef, f localformat.Folder, syncWa data.Version = deployer.NormalizeVersion(comp.Version) } data.Chart = chart - case localformat.KindDirect: - // Direct components are not supported in ArgoCD deployer. - // Direct components use kubectl apply instead of Helm, which doesn't fit - // the Argo CD Application resource model. Use the helm deployer instead. } return data } diff --git a/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl b/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl index 4de881531..ed2e080b8 100644 --- a/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl +++ b/pkg/bundler/deployer/localformat/templates/install-direct.sh.tmpl @@ -33,14 +33,25 @@ INTERVAL=${AICR_OLM_CSV_INTERVAL:-{{ .CSVPollIntervalSeconds }}} ELAPSED=0 while [ $ELAPSED -lt $TIMEOUT ]; do - CSV_PHASE=$(kubectl get csv -n {{ .Namespace }} -o jsonpath='{.items[0].status.phase}' ${KUBECONFIG_FLAG:-} 2>/dev/null || echo "") + # Resolve the installed CSV name from the Subscription + CSV_NAME=$(kubectl get subscription -n {{ .Namespace }} -o jsonpath='{.items[0].status.installedCSV}' ${KUBECONFIG_FLAG:-} 2>/dev/null || echo "") + + if [ -z "$CSV_NAME" ]; then + echo "Waiting for Subscription to report installedCSV... (${ELAPSED}s/${TIMEOUT}s)" + sleep $INTERVAL + ELAPSED=$((ELAPSED + INTERVAL)) + continue + fi + + # Check the specific CSV's phase + CSV_PHASE=$(kubectl get csv "$CSV_NAME" -n {{ .Namespace }} -o jsonpath='{.status.phase}' ${KUBECONFIG_FLAG:-} 2>/dev/null || echo "") if [ "$CSV_PHASE" = "Succeeded" ]; then - echo "CSV reached Succeeded phase" + echo "CSV ${CSV_NAME} reached Succeeded phase" exit 0 fi - echo "CSV phase: ${CSV_PHASE:-}, waiting... (${ELAPSED}s/${TIMEOUT}s)" + echo "CSV ${CSV_NAME} phase: ${CSV_PHASE:-}, waiting... (${ELAPSED}s/${TIMEOUT}s)" sleep $INTERVAL ELAPSED=$((ELAPSED + INTERVAL)) done diff --git a/pkg/bundler/deployer/localformat/templates/uninstall-direct.sh.tmpl b/pkg/bundler/deployer/localformat/templates/uninstall-direct.sh.tmpl index e2dc9936e..667aae499 100644 --- a/pkg/bundler/deployer/localformat/templates/uninstall-direct.sh.tmpl +++ b/pkg/bundler/deployer/localformat/templates/uninstall-direct.sh.tmpl @@ -18,12 +18,19 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "${SCRIPT_DIR}" {{- if .Olm }} -# For OLM-managed operators, delete the ClusterServiceVersion (CSV) first -# This removes the actual operator deployment before removing the Subscription -csv_name=$(kubectl get csv -n {{ .Namespace }} -o name 2>/dev/null | head -1) -if [[ -n "${csv_name}" ]]; then - echo "Deleting ClusterServiceVersion: ${csv_name}" - kubectl delete "${csv_name}" -n {{ .Namespace }} --ignore-not-found --timeout="${HELM_TIMEOUT:-120}s" || true +# For OLM-managed operators, delete the ClusterServiceVersions (CSVs) first +# This removes the actual operator deployments before removing the Subscriptions +# Enumerate all Subscriptions in the namespace and delete their associated CSVs +subscriptions=$(kubectl get subscription -n {{ .Namespace }} -o jsonpath='{.items[*].metadata.name}' ${KUBECONFIG_FLAG:-} 2>/dev/null || echo "") + +if [[ -n "${subscriptions}" ]]; then + for sub in ${subscriptions}; do + csv_name=$(kubectl get subscription "${sub}" -n {{ .Namespace }} -o jsonpath='{.status.installedCSV}' ${KUBECONFIG_FLAG:-} 2>/dev/null || echo "") + if [[ -n "${csv_name}" ]]; then + echo "Deleting ClusterServiceVersion: ${csv_name} (from Subscription: ${sub})" + kubectl delete csv "${csv_name}" -n {{ .Namespace }} --ignore-not-found --timeout="${HELM_TIMEOUT:-120}s" ${KUBECONFIG_FLAG:-} || true + fi + done fi {{- end }} diff --git a/pkg/bundler/deployer/localformat/writer_test.go b/pkg/bundler/deployer/localformat/writer_test.go index c286d254b..22e821d2c 100644 --- a/pkg/bundler/deployer/localformat/writer_test.go +++ b/pkg/bundler/deployer/localformat/writer_test.go @@ -374,7 +374,9 @@ func TestWrite_DirectWithRealFiles(t *testing.T) { "kubectl apply", "olm.yaml", "ClusterServiceVersion", - "CSV reached Succeeded phase", + "reached Succeeded phase", + "kubectl get subscription", + "installedCSV", "TIMEOUT=", "-n nvidia-gpu-operator", } @@ -390,7 +392,14 @@ func TestWrite_DirectWithRealFiles(t *testing.T) { if err != nil { t.Fatalf("read uninstall.sh: %v", err) } - if !strings.Contains(string(uninstallContent), "kubectl get csv") { + // Verify it queries subscriptions to find CSVs + if !strings.Contains(string(uninstallContent), "kubectl get subscription") { + t.Errorf("OLM uninstall.sh should query Subscriptions to find CSVs") + } + if !strings.Contains(string(uninstallContent), "installedCSV") { + t.Errorf("OLM uninstall.sh should extract installedCSV from Subscription") + } + if !strings.Contains(string(uninstallContent), "kubectl delete csv") { t.Errorf("OLM uninstall.sh should contain CSV deletion logic") } if !strings.Contains(string(uninstallContent), "kubectl delete") { @@ -538,9 +547,18 @@ func TestWrite_DirectOLMTimeoutConstants(t *testing.T) { } // Verify CSV wait logic - if !strings.Contains(script, "CSV reached Succeeded phase") { + if !strings.Contains(script, "reached Succeeded phase") { t.Error("install.sh should contain CSV success message") } + + // Verify we resolve CSV name from Subscription + if !strings.Contains(script, "kubectl get subscription") { + t.Error("install.sh should query Subscription to resolve CSV name") + } + + if !strings.Contains(script, "installedCSV") { + t.Error("install.sh should extract installedCSV from Subscription status") + } } func TestWrite_Deterministic(t *testing.T) { diff --git a/pkg/recipe/components.go b/pkg/recipe/components.go index 108210497..aec67ba39 100644 --- a/pkg/recipe/components.go +++ b/pkg/recipe/components.go @@ -329,7 +329,10 @@ func (r *ComponentRegistry) Validate() []error { // validateDeploymentTypeExclusion ensures exactly one deployment type is configured. // Returns an error if zero or multiple deployment types are set. func validateDeploymentTypeExclusion(comp ComponentConfig, index int) error { - hasHelm := comp.Helm.DefaultRepository != "" || comp.Helm.DefaultChart != "" || comp.Helm.DefaultNamespace != "" + hasHelm := comp.Helm.DefaultRepository != "" || + comp.Helm.DefaultChart != "" || + comp.Helm.DefaultNamespace != "" || + comp.Helm.DefaultVersion != "" hasKustomize := comp.Kustomize.DefaultSource != "" hasDirect := comp.Direct.SourceFile != "" diff --git a/pkg/recipe/components_test.go b/pkg/recipe/components_test.go index 9ce195deb..c3fce47c2 100644 --- a/pkg/recipe/components_test.go +++ b/pkg/recipe/components_test.go @@ -918,3 +918,49 @@ components: t.Errorf("GetType() = %v, want %v", comp.GetType(), ComponentTypeKustomize) } } + +func TestValidateDeploymentTypeExclusion_HelmDefaultVersion(t *testing.T) { + tests := []struct { + name string + comp ComponentConfig + wantErr bool + }{ + { + name: "helm with only DefaultVersion is valid", + comp: ComponentConfig{ + Name: "test-component", + Helm: HelmConfig{ + DefaultVersion: "1.0.0", + }, + }, + wantErr: false, + }, + { + name: "helm with DefaultVersion and DefaultRepository", + comp: ComponentConfig{ + Name: "test-component", + Helm: HelmConfig{ + DefaultVersion: "1.0.0", + DefaultRepository: "https://charts.example.com", + }, + }, + wantErr: false, + }, + { + name: "no deployment type configured should error", + comp: ComponentConfig{ + Name: "test-component", + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validateDeploymentTypeExclusion(tt.comp, 0) + if (err != nil) != tt.wantErr { + t.Errorf("validateDeploymentTypeExclusion() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} diff --git a/pkg/recipe/metadata.go b/pkg/recipe/metadata.go index 8c096ee6c..c049010f2 100644 --- a/pkg/recipe/metadata.go +++ b/pkg/recipe/metadata.go @@ -583,12 +583,14 @@ func mergeComponentRef(base, overlay ComponentRef) ComponentRef { case ComponentTypeHelm: // Clear Direct/Kustomize-specific fields result.SourceFile = "" + result.Olm = false result.Tag = "" result.Path = "" result.Patches = nil case ComponentTypeKustomize: // Clear Direct/Helm-specific fields result.SourceFile = "" + result.Olm = false result.Version = "" result.Chart = "" result.ValuesFile = "" @@ -623,6 +625,11 @@ func mergeComponentRef(base, overlay ComponentRef) ComponentRef { result.SourceFile = overlay.SourceFile } + // Olm: overlay takes precedence if set (Direct only) + if overlay.Olm && result.Type == ComponentTypeDirect { + result.Olm = true + } + // Overrides: deep-merge maps, overlay takes precedence if len(overlay.Overrides) > 0 { if result.Overrides == nil { diff --git a/pkg/recipe/metadata_test.go b/pkg/recipe/metadata_test.go index 332bd3b6e..7c16abb24 100644 --- a/pkg/recipe/metadata_test.go +++ b/pkg/recipe/metadata_test.go @@ -1589,3 +1589,84 @@ func TestNFDTopologyUpdater_OverlayCoverage(t *testing.T) { }) } } + +func TestMergeComponentRef_OlmField(t *testing.T) { + tests := []struct { + name string + base ComponentRef + overlay ComponentRef + expected ComponentRef + }{ + { + name: "olm true in overlay applied to direct component", + base: ComponentRef{ + Name: "gpu-operator-olm", + Type: ComponentTypeDirect, + }, + overlay: ComponentRef{ + Olm: true, + }, + expected: ComponentRef{ + Name: "gpu-operator-olm", + Type: ComponentTypeDirect, + Olm: true, + }, + }, + { + name: "olm cleared when transitioning from direct to helm", + base: ComponentRef{ + Name: "gpu-operator", + Type: ComponentTypeDirect, + SourceFile: "olm.yaml", + Olm: true, + }, + overlay: ComponentRef{ + Type: ComponentTypeHelm, + Source: "https://charts.example.com", + Chart: "gpu-operator", + Version: "1.0.0", + }, + expected: ComponentRef{ + Name: "gpu-operator", + Type: ComponentTypeHelm, + Source: "https://charts.example.com", + Chart: "gpu-operator", + Version: "1.0.0", + Olm: false, + }, + }, + { + name: "olm cleared when transitioning from direct to kustomize", + base: ComponentRef{ + Name: "gpu-operator", + Type: ComponentTypeDirect, + SourceFile: "olm.yaml", + Olm: true, + }, + overlay: ComponentRef{ + Type: ComponentTypeKustomize, + Source: "https://github.com/example/repo", + Tag: "v1.0.0", + }, + expected: ComponentRef{ + Name: "gpu-operator", + Type: ComponentTypeKustomize, + Source: "https://github.com/example/repo", + Tag: "v1.0.0", + Olm: false, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := mergeComponentRef(tt.base, tt.overlay) + if result.Olm != tt.expected.Olm { + t.Errorf("Olm = %v, want %v", result.Olm, tt.expected.Olm) + } + if result.Type != tt.expected.Type { + t.Errorf("Type = %v, want %v", result.Type, tt.expected.Type) + } + }) + } +} diff --git a/recipes/components/gpu-operator-olm/direct/olm.yaml b/recipes/components/gpu-operator-olm/direct/olm.yaml index 34fb940c8..b2950ce19 100644 --- a/recipes/components/gpu-operator-olm/direct/olm.yaml +++ b/recipes/components/gpu-operator-olm/direct/olm.yaml @@ -20,6 +20,7 @@ apiVersion: operators.coreos.com/v1 kind: OperatorGroup metadata: name: nvidia-gpu-operator-group + namespace: nvidia-gpu-operator spec: targetNamespaces: - nvidia-gpu-operator @@ -28,6 +29,7 @@ apiVersion: operators.coreos.com/v1alpha1 kind: Subscription metadata: name: gpu-operator-certified + namespace: nvidia-gpu-operator spec: channel: stable name: gpu-operator-certified diff --git a/recipes/components/nfd/direct/nodefeaturediscovery.yaml b/recipes/components/nfd/direct/nodefeaturediscovery.yaml index 3e16edbe1..746f7ac3f 100644 --- a/recipes/components/nfd/direct/nodefeaturediscovery.yaml +++ b/recipes/components/nfd/direct/nodefeaturediscovery.yaml @@ -20,6 +20,7 @@ apiVersion: nfd.openshift.io/v1 kind: NodeFeatureDiscovery metadata: name: nfd-instance + namespace: openshift-nfd spec: instance: '' operand: