From 97adcd83f9a2f4ea63378ecda7fbdf50a0529649 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Wed, 13 May 2026 11:41:17 -0700 Subject: [PATCH] chore(recipes): migrate kgateway -> agentgateway for v2.2 inference routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kgateway v2.2 removed InferencePool routing from its Envoy data plane (PR kgateway-dev/kgateway#12689, deprecated in v2.1). The Gateway API Inference Extension support that AICR uses for CNCF AI Conformance Advanced Ingress moved entirely to the separate agentgateway project, which ships its own Helm charts, GatewayClass, controller, and AgentgatewayParameters CRD. AICR's only kgateway consumer was the inference Gateway resource — zero HTTPRoutes, no TrafficPolicy/BackendConfigPolicy/etc. — so this PR replaces kgateway entirely rather than running both side-by-side. Changes: - registry + mixin: kgateway/kgateway-crds (v2.0.0, cr.kgateway.dev) -> agentgateway/agentgateway-crds (v2.2.1, cr.agentgateway.dev) - inference-gateway.yaml: GatewayParameters (gateway.kgateway.dev) -> AgentgatewayParameters (agentgateway.dev/v1alpha1, strategic-merge patch on Deployment spec); gatewayClassName: kgateway -> agentgateway; namespace kgateway-system -> agentgateway-system - health check + component dirs renamed under git mv to preserve history - conformance validator (validators/conformance/inference_gateway_check.go) updated to gate on `agentgateway` component and query the new GatewayClass / namespace / Deployment names - evidence collector script + requirement title updated - GPU H100 inference workflow path filter follows the renamed component directories so PRs touching only agentgateway* still trigger CI - Go test fixtures, undeploy.sh.tmpl + golden files, chainsaw assertions, UAT recipe snapshots regenerated against the new component graph - docs/user (component-catalog, container-images, api-reference, cli-reference) refreshed; container-images.md regenerated via `make 
bom-docs` - demos/cuj2-* + demos/query.md + demos/images/meta.md updated so kubectl commands and architecture diagrams match the new namespace and component names Conformance-equivalence: AICR's `ai_inference` requirement is data-plane agnostic — it asserts a GatewayClass is Accepted, a Gateway is Programmed, and the InferencePool CRDs exist. Swapping to agentgateway preserves all five evidence checks; only the names in the captured output change. Historical evidence snapshots under docs/conformance/cncf/v1.35/nim-eks/, demos/examples/CUJ2-Test-Report.md, design doc 005, and CHANGELOG entries are intentionally left referencing kgateway as frozen records. Closes follow-up #1 from issue #698. Validation: - make qualify (Go unit tests, golangci-lint, yamllint, 20/20 chainsaw including cli-bundle-agentgateway-templates, vulnerability scan, license headers): all green - Bundle generation verified end-to-end (recipe -> bundle -> deploy.sh layout under /tmp); rendered post chart confirmed to emit AgentgatewayParameters + Gateway with class agentgateway in namespace agentgateway-system - validators/conformance package tests pass (22.6s) --- .github/scripts/gpu-debug-diagnostics.sh | 10 ++-- .../workflows/gpu-h100-inference-test.yaml | 4 +- demos/cuj2-demo.md | 19 +++--- demos/cuj2-eks.md | 2 +- demos/cuj2-gke.md | 2 +- demos/images/meta.md | 4 +- demos/query.md | 2 +- docs/user/api-reference.md | 4 +- docs/user/cli-reference.md | 2 +- docs/user/component-catalog.md | 6 +- docs/user/container-images.md | 28 ++++----- pkg/bundler/deployer/helm/helm_test.go | 10 ++-- .../deployer/helm/templates/undeploy.sh.tmpl | 2 +- .../kai_scheduler_present/undeploy.sh | 2 +- .../helm/testdata/manifest_only/undeploy.sh | 2 +- .../testdata/mixed_gpu_operator/undeploy.sh | 2 +- .../helm/testdata/mixed_with_pre/undeploy.sh | 2 +- .../testdata/nodewright_present/undeploy.sh | 2 +- .../testdata/upstream_helm_only/undeploy.sh | 2 +- pkg/evidence/cncf/requirements.go | 2 +- 
pkg/evidence/cncf/scripts/collect-evidence.sh | 32 +++++----- pkg/recipe/conformance_test.go | 20 +++---- pkg/recipe/metadata_test.go | 2 +- .../health-check.yaml | 24 ++++---- .../manifests/gateway-api-crds.yaml | 4 +- .../manifests/inference-extension-crds.yaml | 4 +- .../values.yaml | 13 ++-- .../manifests/inference-gateway.yaml | 60 +++++++++++-------- .../{kgateway => agentgateway}/values.yaml | 12 ++-- recipes/mixins/platform-inference.yaml | 24 ++++---- recipes/registry.yaml | 30 +++++----- recipes/validators/README.md | 2 +- recipes/validators/catalog.yaml | 2 +- tests/chainsaw/ai-conformance/README.md | 12 ++-- ...kgateway.yaml => assert-agentgateway.yaml} | 14 ++--- .../ai-conformance/cluster/assert-crds.yaml | 4 +- .../cluster/assert-namespaces.yaml | 2 +- .../ai-conformance/cluster/chainsaw-test.yaml | 10 ++-- ...kgateway.yaml => assert-agentgateway.yaml} | 8 +-- .../kind-inference-dynamo/assert-crds.yaml | 2 +- .../assert-namespaces.yaml | 2 +- .../kind-inference-dynamo/chainsaw-test.yaml | 10 ++-- .../ai-conformance/offline/assert-recipe.yaml | 10 ++-- .../ai-conformance/offline/chainsaw-test.yaml | 2 +- .../chainsaw-test.yaml | 44 +++++++------- .../tests/cuj2-inference/assert-recipe.yaml | 14 ++--- .../tests/cuj2-inference/assert-recipe.yaml | 18 +++--- .../tests/cuj2-inference/assert-recipe.yaml | 14 ++--- .../conformance/inference_gateway_check.go | 42 ++++++------- 49 files changed, 280 insertions(+), 266 deletions(-) rename recipes/checks/{kgateway => agentgateway}/health-check.yaml (77%) rename recipes/components/{kgateway-crds => agentgateway-crds}/manifests/gateway-api-crds.yaml (99%) rename recipes/components/{kgateway-crds => agentgateway-crds}/manifests/inference-extension-crds.yaml (99%) rename recipes/components/{kgateway-crds => agentgateway-crds}/values.yaml (70%) rename recipes/components/{kgateway => agentgateway}/manifests/inference-gateway.yaml (52%) rename recipes/components/{kgateway => agentgateway}/values.yaml (75%) rename 
tests/chainsaw/ai-conformance/cluster/{assert-kgateway.yaml => assert-agentgateway.yaml} (75%) rename tests/chainsaw/ai-conformance/kind-inference-dynamo/{assert-kgateway.yaml => assert-agentgateway.yaml} (83%) rename tests/chainsaw/bundle-templates/{kgateway => agentgateway}/chainsaw-test.yaml (66%) diff --git a/.github/scripts/gpu-debug-diagnostics.sh b/.github/scripts/gpu-debug-diagnostics.sh index 089e32ca2..3576ba386 100644 --- a/.github/scripts/gpu-debug-diagnostics.sh +++ b/.github/scripts/gpu-debug-diagnostics.sh @@ -259,9 +259,9 @@ print_dynamo_diagnostics() { kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true } -print_kgateway_diagnostics() { - echo "=== kgateway pods ===" - kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true +print_agentgateway_diagnostics() { + echo "=== agentgateway pods ===" + kubectl_kind -n agentgateway-system get pods -o wide 2>/dev/null || true echo "=== GatewayClass status ===" kubectl_kind get gatewayclass -o yaml 2>/dev/null || true echo "=== Gateway status ===" @@ -280,9 +280,9 @@ case "${mode}" in print_kubeflow_diagnostics ;; inference) - print_h100_common_diagnostics dynamo-system kgateway-system + print_h100_common_diagnostics dynamo-system agentgateway-system print_dynamo_diagnostics - print_kgateway_diagnostics + print_agentgateway_diagnostics ;; *) echo "::error::unknown GPU_TEST_DIAGNOSTIC_MODE: ${mode}" diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index f02fa85b2..154caca93 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -89,8 +89,8 @@ jobs: - 'recipes/components/nodewright-operator/**' - 'recipes/components/nvidia-dra-driver-gpu/**' - 'recipes/components/nvsentinel/**' - - 'recipes/components/kgateway/**' - - 'recipes/components/kgateway-crds/**' + - 'recipes/components/agentgateway/**' + - 
'recipes/components/agentgateway-crds/**' - 'recipes/components/grove/**' - 'recipes/components/dynamo-platform/**' - 'recipes/components/prometheus-adapter/**' diff --git a/demos/cuj2-demo.md b/demos/cuj2-demo.md index 4029fd622..102465346 100644 --- a/demos/cuj2-demo.md +++ b/demos/cuj2-demo.md @@ -42,8 +42,8 @@ │ ├── gpu-operator/ (GPU driver, device-plugin, DCGM) │ │ ├── nvidia-dra-driver-gpu/ (Dynamic Resource Allocation) │ │ ├── kai-scheduler/ (gang scheduling) │ - │ ├── kgateway-crds/ (Gateway API + inference CRDs) │ - │ ├── kgateway/ (inference gateway controller) │ + │ ├── agentgateway-crds/ (Gateway API + inference CRDs) │ + │ ├── agentgateway/ (inference gateway controller) │ │ ├── nvsentinel/ (security/compliance) │ │ ├── nodewright-operator/ (node configuration) │ │ ├── nodewright-customizations/ (H100 tuning) │ @@ -60,13 +60,13 @@ │ $ cd bundle && ./deploy.sh │ │ │ │ cert-manager ──▶ kube-prometheus-stack ──▶ gpu-operator ──▶ │ - │ kai-scheduler ──▶ kgateway ──▶ nvidia-dra-driver ──▶ │ + │ kai-scheduler ──▶ agentgateway ──▶ nvidia-dra-driver ──▶ │ │ dynamo-platform ──▶ nodewright ──▶ nvsentinel ──▶ ... 
│ │ │ │ Result: Fully configured GPU cluster │ │ • 8x H100 GPUs advertised via DRA │ │ • Gang scheduling (KAI Scheduler) │ - │ • Inference gateway (kgateway) │ + │ • Inference gateway (agentgateway) │ │ • GPU metrics (DCGM → Prometheus → HPA) │ │ • Dynamo inference platform │ └────────────────────────────────────────────────────────────────────────┘ @@ -114,8 +114,8 @@ │ └── aws-efa │ └── aws-efa │ │ │ │ │ │ │ eks-training.yaml │ eks-inference.yaml │ -│ (no new components) │ ├── kgateway-crds ◀── NEW │ -│ │ │ └── kgateway ◀── NEW │ +│ (no new components) │ ├── agentgateway-crds ◀── NEW │ +│ │ │ └── agentgateway ◀── NEW │ │ │ │ │ │ │ h100-eks-training.yaml │ h100-eks-inference.yaml │ │ ├── gpu-operator (CDI, gdrcopy) │ └── nodewright-customizations │ @@ -130,7 +130,8 @@ │ │ └── dynamo-platform ◀─ NEW │ │ │ │ ├─────────────────────────────────────┼─────────────────────────────────────┤ -│ Unique: kubeflow-trainer │ Unique: kgateway-crds, kgateway, │ +│ Unique: kubeflow-trainer │ Unique: agentgateway-crds, │ +│ │ agentgateway, │ │ │ dynamo-crds, dynamo-platform │ ├─────────────────────────────────────┴─────────────────────────────────────┤ │ Shared (base + eks): cert-manager, kube-prometheus-stack, gpu-operator, │ @@ -253,7 +254,7 @@ http://127.0.0.1:9090/chat.html │ │ │ toolkit, DCGM, validator) │ │ │ 4 │ accelerator_metrics │ gpu-operator (DCGM exporter) │ base │ │ 5 │ ai_service_metrics │ kube-prometheus-stack, prometheus-adapter│ base │ -│ 6 │ ai_inference │ kgateway-crds, kgateway │ eks-inf │ +│ 6 │ ai_inference │ agentgateway-crds, agentgateway │ eks-inf │ │ 7 │ robust_controller │ dynamo-crds, dynamo-platform │ dynamo │ │ 8 │ pod_autoscaling │ prometheus-adapter + HPA │ base │ │ 9 │ cluster_autoscaling │ EKS Auto Scaling Group (ASG) │ infra │ @@ -263,7 +264,7 @@ http://127.0.0.1:9090/chat.html │ DRA, gang scheduling, secure access, accelerator metrics, │ │ AI service metrics, pod autoscaling │ │ │ -│ eks-inference layer (+1): inference gateway (kgateway) │ +│ 
eks-inference layer (+1): inference gateway (agentgateway) │ │ dynamo layer (+1): robust controller (Dynamo operator) │ │ infra layer (+1): cluster autoscaling (EKS ASG) │ │ │ diff --git a/demos/cuj2-eks.md b/demos/cuj2-eks.md index e36b4bb3a..c9dad431b 100644 --- a/demos/cuj2-eks.md +++ b/demos/cuj2-eks.md @@ -85,7 +85,7 @@ kubectl get dynamographdeployments -n dynamo-workload kubectl get pods -n dynamo-workload -o wide -w # Verify the inference gateway routes to the workload -kubectl get gateway inference-gateway -n kgateway-system +kubectl get gateway inference-gateway -n agentgateway-system kubectl get inferencepool -n dynamo-workload ``` diff --git a/demos/cuj2-gke.md b/demos/cuj2-gke.md index 949333b79..fceec7cbf 100644 --- a/demos/cuj2-gke.md +++ b/demos/cuj2-gke.md @@ -83,7 +83,7 @@ kubectl get dynamographdeployments -n dynamo-workload kubectl get pods -n dynamo-workload -o wide -w # Verify the inference gateway routes to the workload -kubectl get gateway inference-gateway -n kgateway-system +kubectl get gateway inference-gateway -n agentgateway-system kubectl get inferencepool -n dynamo-workload ``` diff --git a/demos/images/meta.md b/demos/images/meta.md index a39269bdc..1bceb1f72 100644 --- a/demos/images/meta.md +++ b/demos/images/meta.md @@ -82,8 +82,8 @@ Visual: Single input forking into two divergent paths │ Unique: │ │ Unique: │ │ kubeflow-trainer │ │ dynamo-crds │ │ │ │ dynamo-platform │ -│ GPU Operator: │ │ kgateway-crds │ -│ CDI=true │ │ kgateway │ +│ GPU Operator: │ │ agentgateway-crds │ +│ CDI=true │ │ agentgateway │ │ gdrcopy=true │ │ │ │ │ │ DRA driver: │ │ │ │ gpuResources=true │ diff --git a/demos/query.md b/demos/query.md index 42e6cbce6..4da28059a 100644 --- a/demos/query.md +++ b/demos/query.md @@ -129,7 +129,7 @@ aicr query --service eks --accelerator h100 --intent inference --os ubuntu \ diff /tmp/training.txt /tmp/inference.txt ``` -> `> kgateway` and `> kgateway-crds` — the Inference Gateway is added only +> `> agentgateway` and `> 
agentgateway-crds` — the Inference Gateway is added only > when `--intent inference`. CDI defaults also flip: diff --git a/docs/user/api-reference.md b/docs/user/api-reference.md index 7bb93f9f9..16eb016e4 100644 --- a/docs/user/api-reference.md +++ b/docs/user/api-reference.md @@ -359,8 +359,8 @@ Bundler names correspond to component names in [`recipes/registry.yaml`](https:/ | `kai-scheduler` | DRA-aware gang scheduler with topology-aware placement | | `grove` | Dynamo pod lifecycle management | | `dynamo-platform` | NVIDIA Dynamo inference serving platform | -| `kgateway-crds` | Kubernetes Gateway API CRDs | -| `kgateway` | Kubernetes Gateway API implementation | +| `agentgateway-crds` | Kubernetes Gateway API CRDs for AI/ML inference (Gateway API + Inference Extension) | +| `agentgateway` | Kubernetes Gateway API implementation for AI/ML inference (InferencePool routing) | | `k8s-nim-operator` | NVIDIA NIM Operator for inference microservice deployments | | `kueue` | Kubernetes-native job queuing for batch and AI workloads | | `kubeflow-trainer` | Kubeflow Training Operator for distributed training | diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index c51370e85..9fb1a1ce1 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -1072,7 +1072,7 @@ aicr bundle --recipe recipe.yaml \ This results in: - **GPU daemonsets** (driver, device-plugin, toolkit, dcgm): `nodeSelector=nodeGroup=gpu-worker` + tolerations for `dedicated=worker-workload` with both `NoSchedule` and `NoExecute` - **NFD workers**: no nodeSelector (runs on all nodes) + tolerations for `dedicated=worker-workload` with both `NoSchedule` and `NoExecute` -- **System components** (gpu-operator controller, NFD gc/master, dynamo grove, kgateway proxy): `nodeSelector=nodeGroup=system-worker` + tolerations for `dedicated=system-workload` with both `NoSchedule` and `NoExecute` +- **System components** (gpu-operator controller, NFD gc/master, dynamo grove, 
agentgateway proxy): `nodeSelector=nodeGroup=system-worker` + tolerations for `dedicated=system-workload` with both `NoSchedule` and `NoExecute` **Behavior:** - All components from the recipe are bundled automatically diff --git a/docs/user/component-catalog.md b/docs/user/component-catalog.md index b67a46f8d..b6392ee84 100644 --- a/docs/user/component-catalog.md +++ b/docs/user/component-catalog.md @@ -27,8 +27,8 @@ The source of truth is [`recipes/registry.yaml`](https://github.com/NVIDIA/aicr/ | **kai-scheduler** | DRA-aware gang scheduler with hierarchical queues and topology-aware placement. Ensures distributed training jobs land on nodes with optimal interconnect topology. | [KAI Scheduler](https://github.com/kai-scheduler/KAI-Scheduler) | | **grove** | Pod lifecycle management for Dynamo inference platform. Installed as a standalone component. | [Grove](https://github.com/ai-dynamo/grove) | | **dynamo-platform** | NVIDIA Dynamo inference serving platform with bundled CRDs. Distributed inference with prefix-cache-aware routing and disaggregated prefill/decode. | [Dynamo](https://github.com/ai-dynamo/dynamo) | -| **kgateway-crds** | Custom Resource Definitions for kgateway (Kubernetes Gateway API implementation). | [kgateway](https://github.com/kgateway-dev/kgateway) | -| **kgateway** | Kubernetes Gateway API implementation. Provides model-aware ingress routing for inference workloads. | [kgateway](https://github.com/kgateway-dev/kgateway) | +| **agentgateway-crds** | Custom Resource Definitions for agentgateway (Kubernetes Gateway API implementation for AI/ML inference). | [agentgateway](https://github.com/agentgateway/agentgateway) | +| **agentgateway** | Kubernetes Gateway API implementation for AI/ML inference. Implements the Gateway API Inference Extension for model-aware ingress routing to InferencePool backends. 
| [agentgateway](https://github.com/agentgateway/agentgateway) | | **k8s-nim-operator** | NVIDIA NIM Operator for managing NIM (NVIDIA Inference Microservices) deployments on Kubernetes. | [K8s NIM Operator](https://github.com/NVIDIA/k8s-nim-operator) | | **kueue** | Kubernetes-native job queuing system. Manages quotas and admits jobs for batch and AI workloads. | [Kueue](https://github.com/kubernetes-sigs/kueue) | | **kubeflow-trainer** | Kubeflow Training Operator for distributed training jobs (PyTorch, etc.). Manages multi-node training job lifecycle with JobSet integration. | [Kubeflow Trainer](https://github.com/kubeflow/trainer) | @@ -41,7 +41,7 @@ Not every component appears in every recipe. The recipe engine selects component - **Base components** (cert-manager, kube-prometheus-stack) appear in most recipes. - **Cloud-specific components** (aws-efa, aws-ebs-csi-driver) are added when the service matches. -- **Intent-specific components** (kgateway, kgateway-crds) are added based on workload intent (e.g., inference recipes include the inference gateway). +- **Intent-specific components** (agentgateway, agentgateway-crds) are added based on workload intent (e.g., inference recipes include the inference gateway). - **Platform-specific components** (slinky-slurm-operator, kubeflow-trainer, dynamo-platform) are added when the recipe selects a matching `--platform`. - **Accelerator/OS-specific tuning** (nodewright-customizations, nvidia-dra-driver-gpu) varies by hardware and OS combination. 
diff --git a/docs/user/container-images.md b/docs/user/container-images.md index ac5e46c2e..bed4c912c 100644 --- a/docs/user/container-images.md +++ b/docs/user/container-images.md @@ -23,12 +23,14 @@ A machine-readable **CycloneDX 1.6 JSON** companion to this page is produced by - Unique images: **69** - Distinct registries: **11** -Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.kgateway.dev`, `docker.io`, `gcr.io`, `ghcr.io`, `gke.gcr.io`, `nvcr.io`, `public.ecr.aws`, `quay.io`, `registry.k8s.io`, `us-docker.pkg.dev` +Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.agentgateway.dev`, `docker.io`, `gcr.io`, `ghcr.io`, `gke.gcr.io`, `nvcr.io`, `public.ecr.aws`, `quay.io`, `registry.k8s.io`, `us-docker.pkg.dev` ## Components | Component | Type | Chart | Pinned Version | Images | |-----------|------|-------|----------------|--------| +| agentgateway | helm | agentgateway | v2.2.1 | 1 | +| agentgateway-crds | helm | agentgateway-crds | v2.2.1 | 0 | | aws-ebs-csi-driver | helm | aws-ebs-csi-driver/aws-ebs-csi-driver | 2.59.0 | 6 | | aws-efa | helm | aws-efa-k8s-device-plugin | v0.5.3 | 1 | | cert-manager | helm | jetstack/cert-manager | v1.20.2 | 4 | @@ -39,8 +41,6 @@ Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.kgateway.dev`, ` | k8s-ephemeral-storage-metrics | helm | k8s-ephemeral-storage-metrics/k8s-ephemeral-storage-metrics | 1.19.2 | 1 | | k8s-nim-operator | helm | k8s-nim-operator | 3.1.0 | 1 | | kai-scheduler | helm | kai-scheduler | v0.14.1 | 2 | -| kgateway | helm | kgateway | v2.0.0 | 1 | -| kgateway-crds | helm | kgateway-crds | v2.0.0 | 0 | | kube-prometheus-stack | helm | prometheus-community/kube-prometheus-stack | 84.4.0 | 8 | | kubeflow-trainer | helm | kubeflow-trainer | 2.2.0 | 3 | | kueue | helm | kueue | 0.17.1 | 1 | @@ -54,6 +54,14 @@ Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.kgateway.dev`, ` ## Images by component +### agentgateway + +- 
`cr.agentgateway.dev/controller:v2.2.1` + +### agentgateway-crds + +_No images extracted._ + ### aws-ebs-csi-driver - `public.ecr.aws/csi-components/csi-attacher:v4.11.0-eksbuild.4` @@ -119,14 +127,6 @@ Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.kgateway.dev`, ` - `ghcr.io/kai-scheduler/kai-scheduler/crd-upgrader:v0.14.1` - `ghcr.io/kai-scheduler/kai-scheduler/operator:v0.14.1` -### kgateway - -- `cr.kgateway.dev/kgateway-dev/kgateway:v2.0.0` - -### kgateway-crds - -_No images extracted._ - ### kube-prometheus-stack - `docker.io/grafana/grafana:13.0.1` @@ -141,7 +141,7 @@ _No images extracted._ ### kubeflow-trainer - `ghcr.io/kubeflow/trainer/trainer-controller-manager:v2.2.0` -- `pytorch/pytorch:2.9.1-cuda12.8-cudnn9-runtime@sha256:7b324d212a4450795b49edba9949b7cdc72429148a64e974334bfe5774d51385` +- `pytorch/pytorch:2.11.0-cuda12.8-cudnn9-runtime@sha256:eee11b3b3872a8c838e35ef48f08b2d5def2080902c7f666831310ca1a0ef2be` - `registry.k8s.io/jobset/jobset:v0.11.0` ### kueue @@ -150,7 +150,7 @@ _No images extracted._ ### network-operator -- `busybox:1.36@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662` +- `busybox:1.37@sha256:1487d0af5f52b4ba31c7e465126ee2123fe3f2305d638e7827681e7cf6c83d5e` - `nvcr.io/nvidia/cloud-native/network-operator:v26.1.1` - `nvcr.io/nvidia/doca/doca_telemetry:1.22.5-doca3.1.0-host` - `nvcr.io/nvidia/mellanox/doca-driver:doca3.2.0-25.10-1.2.8.0-2` @@ -211,7 +211,7 @@ AICR pulls from a deliberately diverse set of registries: - **`public.ecr.aws`** — AWS public artifacts (aws-ebs-csi-driver). - **Regional ECR** (`<account>.dkr.ecr.<region>.amazonaws.com`) — EKS-internal add-ons. The `aws-efa` entry below shows `us-west-2` because that is the in-tree default; deployments in other regions override `awsefa:image.repository` at bundle or install time. See [Regional registry overrides](../integrator/recipe-development.md#regional-registry-overrides) for the pattern. 
- **`gcr.io`, `gke.gcr.io`, `us-docker.pkg.dev`** — GCP/GKE add-ons (gke-nccl-tcpxo). -- **`cr.kgateway.dev`** — kgateway. +- **`cr.agentgateway.dev`** — agentgateway (AI inference gateway). - **`docker.io`** — assorted upstream images (`busybox`, `pytorch`, etc.). Customers running in air-gapped or private-registry environments need to mirror every registry above. A dedicated mirroring guide is tracked under [#743](https://github.com/NVIDIA/aicr/issues/743). diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index 73ff977b9..8b4ad4e5f 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -1428,14 +1428,14 @@ func TestUndeployScript_PreflightSkipListCoversManifestDeletedReleases(t *testin }{Version: "v0.1.0"}, ComponentRefs: []recipe.ComponentRef{ {Name: "cert-manager", Namespace: "cert-manager", Chart: "cert-manager", Version: "v1.17.2", Source: "https://charts.jetstack.io"}, - {Name: "kgateway", Namespace: "kgateway-system", Chart: "kgateway", Version: "v0.1.0", Source: "https://example.invalid/charts"}, + {Name: "agentgateway", Namespace: "agentgateway-system", Chart: "agentgateway", Version: "v0.1.0", Source: "https://example.invalid/charts"}, {Name: "nodewright-operator", Namespace: "skyhook", Chart: "nodewright-operator", Version: "v0.1.0", Source: "https://example.invalid/charts"}, }, - DeploymentOrder: []string{"cert-manager", "kgateway", "nodewright-operator"}, + DeploymentOrder: []string{"cert-manager", "agentgateway", "nodewright-operator"}, }, ComponentValues: map[string]map[string]any{ "cert-manager": {}, - "kgateway": {}, + "agentgateway": {}, "nodewright-operator": {}, }, Version: "v1.0.0", @@ -1449,7 +1449,7 @@ func TestUndeployScript_PreflightSkipListCoversManifestDeletedReleases(t *testin snippet=$(sed -n '/^skip_preflight_for_release()/,/^}/p' "$UNDEPLOY") eval "$snippet" skip_preflight_for_release "nodewright-operator" && echo "skip:nodewright-operator" - 
skip_preflight_for_release "kgateway" && echo "skip:kgateway" + skip_preflight_for_release "agentgateway" && echo "skip:agentgateway" if skip_preflight_for_release "cert-manager"; then echo "unexpected:cert-manager" exit 1 @@ -1470,7 +1470,7 @@ func TestUndeployScript_PreflightSkipListCoversManifestDeletedReleases(t *testin } out := stdout.String() - for _, want := range []string{"skip:nodewright-operator", "skip:kgateway", "check:cert-manager"} { + for _, want := range []string{"skip:nodewright-operator", "skip:agentgateway", "check:cert-manager"} { if !strings.Contains(out, want) { t.Errorf("expected %q in output; stdout=%q stderr=%q", want, out, stderr.String()) } diff --git a/pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl index 1752609ce..af65311a9 100644 --- a/pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/undeploy.sh b/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/undeploy.sh index 2e1f0633c..16d13e6f6 100644 --- a/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/undeploy.sh @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. 
skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/bundler/deployer/helm/testdata/manifest_only/undeploy.sh b/pkg/bundler/deployer/helm/testdata/manifest_only/undeploy.sh index ca77d9e4a..40f7b539b 100644 --- a/pkg/bundler/deployer/helm/testdata/manifest_only/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/manifest_only/undeploy.sh @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/bundler/deployer/helm/testdata/mixed_gpu_operator/undeploy.sh b/pkg/bundler/deployer/helm/testdata/mixed_gpu_operator/undeploy.sh index 215172f8f..b5b7b6c99 100644 --- a/pkg/bundler/deployer/helm/testdata/mixed_gpu_operator/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/mixed_gpu_operator/undeploy.sh @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/bundler/deployer/helm/testdata/mixed_with_pre/undeploy.sh b/pkg/bundler/deployer/helm/testdata/mixed_with_pre/undeploy.sh index 6df40f551..1ba4d15a0 100644 --- a/pkg/bundler/deployer/helm/testdata/mixed_with_pre/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/mixed_with_pre/undeploy.sh @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. 
skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/bundler/deployer/helm/testdata/nodewright_present/undeploy.sh b/pkg/bundler/deployer/helm/testdata/nodewright_present/undeploy.sh index 870719568..e625c2883 100644 --- a/pkg/bundler/deployer/helm/testdata/nodewright_present/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/nodewright_present/undeploy.sh @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/bundler/deployer/helm/testdata/upstream_helm_only/undeploy.sh b/pkg/bundler/deployer/helm/testdata/upstream_helm_only/undeploy.sh index d8844bfc8..5108629f3 100644 --- a/pkg/bundler/deployer/helm/testdata/upstream_helm_only/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/upstream_helm_only/undeploy.sh @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. 
skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/evidence/cncf/requirements.go b/pkg/evidence/cncf/requirements.go index 05cbd7454..ea7377229 100644 --- a/pkg/evidence/cncf/requirements.go +++ b/pkg/evidence/cncf/requirements.go @@ -59,7 +59,7 @@ var requirements = map[string]requirementMeta{ }, featureInferenceGateway: { RequirementID: "ai_inference", - Title: "Inference API Gateway (kgateway)", + Title: "Inference API Gateway (agentgateway)", Description: "Demonstrates that the cluster supports Kubernetes Gateway API for AI/ML inference routing with an operational GatewayClass and Gateway.", File: "inference-gateway.md", }, diff --git a/pkg/evidence/cncf/scripts/collect-evidence.sh b/pkg/evidence/cncf/scripts/collect-evidence.sh index 1e789f8ae..200ab03cf 100755 --- a/pkg/evidence/cncf/scripts/collect-evidence.sh +++ b/pkg/evidence/cncf/scripts/collect-evidence.sh @@ -1302,13 +1302,13 @@ collect_gateway() { EVIDENCE_FILE="${EVIDENCE_DIR}/inference-gateway.md" log_info "Collecting Inference API Gateway evidence → ${EVIDENCE_FILE}" - # Skip if kgateway is not installed (training clusters don't have inference gateway) - if ! kubectl get deploy -n kgateway-system --no-headers 2>/dev/null | grep -q .; then - log_info "Inference gateway evidence collection skipped — kgateway not installed." + # Skip if agentgateway is not installed (training clusters don't have inference gateway) + if ! kubectl get deploy -n agentgateway-system --no-headers 2>/dev/null | grep -q .; then + log_info "Inference gateway evidence collection skipped — agentgateway not installed." 
return fi - write_section_header "Inference API Gateway (kgateway)" + write_section_header "Inference API Gateway (agentgateway)" cat >> "${EVIDENCE_FILE}" <<'EOF' Demonstrates CNCF AI Conformance requirement for Kubernetes Gateway API support @@ -1316,19 +1316,19 @@ with an implementation for advanced traffic management for inference services. ## Summary -1. **kgateway controller** — Running in `kgateway-system` +1. **agentgateway controller** — Running in `agentgateway-system` 2. **inference-gateway deployment** — Running (the inference extension controller) 3. **Gateway API CRDs** — All present (GatewayClass, Gateway, HTTPRoute, GRPCRoute, ReferenceGrant) -4. **Active Gateway** — `inference-gateway` with class `kgateway`, programmed with an AWS ELB address -5. **Inference Extension CRDs** — InferencePool, InferenceModelRewrite, InferenceObjective installed +4. **Active Gateway** — `inference-gateway` with class `agentgateway`, programmed with a load balancer address +5. **Inference Extension CRDs** — InferencePool, InferenceObjective, InferenceModelRewrite installed 6. 
**Result: PASS** --- -## kgateway Controller +## agentgateway Controller EOF - capture "kgateway deployments" kubectl get deploy -n kgateway-system - capture "kgateway pods" kubectl get pods -n kgateway-system + capture "agentgateway deployments" kubectl get deploy -n agentgateway-system + capture "agentgateway pods" kubectl get pods -n agentgateway-system cat >> "${EVIDENCE_FILE}" <<'EOF' @@ -1352,7 +1352,7 @@ EOF ## Active Gateway EOF capture "Gateways" kubectl get gateways -A - capture "Gateway details" kubectl get gateway inference-gateway -n kgateway-system -o yaml + capture "Gateway details" kubectl get gateway inference-gateway -n agentgateway-system -o yaml cat >> "${EVIDENCE_FILE}" <<'EOF' @@ -1364,14 +1364,14 @@ EOF echo "" >> "${EVIDENCE_FILE}" echo "**GatewayClass conditions**" >> "${EVIDENCE_FILE}" echo '```' >> "${EVIDENCE_FILE}" - kubectl get gatewayclass kgateway -o jsonpath='{range .status.conditions[*]}{.type}: {.status} ({.reason}){"\n"}{end}' >> "${EVIDENCE_FILE}" 2>&1 + kubectl get gatewayclass agentgateway -o jsonpath='{range .status.conditions[*]}{.type}: {.status} ({.reason}){"\n"}{end}' >> "${EVIDENCE_FILE}" 2>&1 echo '```' >> "${EVIDENCE_FILE}" # Check Gateway Programmed condition echo "" >> "${EVIDENCE_FILE}" echo "**Gateway conditions**" >> "${EVIDENCE_FILE}" echo '```' >> "${EVIDENCE_FILE}" - kubectl get gateway inference-gateway -n kgateway-system -o jsonpath='{range .status.conditions[*]}{.type}: {.status} ({.reason}){"\n"}{end}' >> "${EVIDENCE_FILE}" 2>&1 + kubectl get gateway inference-gateway -n agentgateway-system -o jsonpath='{range .status.conditions[*]}{.type}: {.status} ({.reason}){"\n"}{end}' >> "${EVIDENCE_FILE}" 2>&1 echo '```' >> "${EVIDENCE_FILE}" cat >> "${EVIDENCE_FILE}" <<'EOF' @@ -1388,10 +1388,10 @@ EOF # Verdict — check both GatewayClass Accepted and Gateway Programmed echo "" >> "${EVIDENCE_FILE}" local gw_accepted gw_programmed - gw_accepted=$(kubectl get gatewayclass kgateway -o 
jsonpath='{.status.conditions[?(@.type=="Accepted")].status}' 2>/dev/null) - gw_programmed=$(kubectl get gateway inference-gateway -n kgateway-system -o jsonpath='{.status.conditions[?(@.type=="Programmed")].status}' 2>/dev/null) + gw_accepted=$(kubectl get gatewayclass agentgateway -o jsonpath='{.status.conditions[?(@.type=="Accepted")].status}' 2>/dev/null) + gw_programmed=$(kubectl get gateway inference-gateway -n agentgateway-system -o jsonpath='{.status.conditions[?(@.type=="Programmed")].status}' 2>/dev/null) if [ "${gw_accepted}" = "True" ] && [ "${gw_programmed}" = "True" ]; then - echo "**Result: PASS** — kgateway controller running, GatewayClass Accepted, Gateway Programmed, inference CRDs installed." >> "${EVIDENCE_FILE}" + echo "**Result: PASS** — agentgateway controller running, GatewayClass Accepted, Gateway Programmed, inference CRDs installed." >> "${EVIDENCE_FILE}" else echo "**Result: FAIL** — No active Gateway found." >> "${EVIDENCE_FILE}" fi diff --git a/pkg/recipe/conformance_test.go b/pkg/recipe/conformance_test.go index 601e61150..fdcdab520 100644 --- a/pkg/recipe/conformance_test.go +++ b/pkg/recipe/conformance_test.go @@ -54,8 +54,8 @@ func TestConformanceRecipeInvariants(t *testing.T) { "prometheus-adapter", "nvidia-dra-driver-gpu", "kai-scheduler", - "kgateway-crds", - "kgateway", + "agentgateway-crds", + "agentgateway", }, requiredChecks: []string{ "platform-health", @@ -88,8 +88,8 @@ func TestConformanceRecipeInvariants(t *testing.T) { "prometheus-adapter", "nvidia-dra-driver-gpu", "kai-scheduler", - "kgateway-crds", - "kgateway", + "agentgateway-crds", + "agentgateway", "grove", "dynamo-platform", }, @@ -189,8 +189,8 @@ func TestConformanceRecipeInvariants(t *testing.T) { "prometheus-adapter", "nvidia-dra-driver-gpu", "kai-scheduler", - "kgateway-crds", - "kgateway", + "agentgateway-crds", + "agentgateway", "grove", "dynamo-platform", }, @@ -285,8 +285,8 @@ func TestConformanceRecipeInvariants(t *testing.T) { "prometheus-adapter", 
"nvidia-dra-driver-gpu", "kai-scheduler", - "kgateway-crds", - "kgateway", + "agentgateway-crds", + "agentgateway", }, requiredChecks: []string{ "platform-health", @@ -318,8 +318,8 @@ func TestConformanceRecipeInvariants(t *testing.T) { "prometheus-adapter", "nvidia-dra-driver-gpu", "kai-scheduler", - "kgateway-crds", - "kgateway", + "agentgateway-crds", + "agentgateway", "grove", "dynamo-platform", }, diff --git a/pkg/recipe/metadata_test.go b/pkg/recipe/metadata_test.go index 02f99afeb..e04026e25 100644 --- a/pkg/recipe/metadata_test.go +++ b/pkg/recipe/metadata_test.go @@ -845,7 +845,7 @@ func TestOverlayMergeDoesNotLoseBaseComponents(t *testing.T) { builder := NewBuilder() // Build H100 EKS inference recipe with dynamo platform - // Matches overlay chain that adds kgateway, dynamo-platform, kai-scheduler, etc. + // Matches overlay chain that adds agentgateway, dynamo-platform, kai-scheduler, etc. criteria := NewCriteria() criteria.Service = CriteriaServiceEKS criteria.Accelerator = CriteriaAcceleratorH100 diff --git a/recipes/checks/kgateway/health-check.yaml b/recipes/checks/agentgateway/health-check.yaml similarity index 77% rename from recipes/checks/kgateway/health-check.yaml rename to recipes/checks/agentgateway/health-check.yaml index 0065cda76..ad79282b8 100644 --- a/recipes/checks/kgateway/health-check.yaml +++ b/recipes/checks/agentgateway/health-check.yaml @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -# KGateway Health Check +# Agentgateway Health Check # -# Validates that KGateway is running and healthy in the kgateway-system -# namespace. Checks that the kgateway deployment has at least one available -# replica and that no pods in the namespace are stuck in Pending, Failed, -# or Unknown phases. +# Validates that agentgateway is running and healthy in the agentgateway-system +# namespace. 
Checks that the agentgateway deployment has at least one available +# replica and that no pods in the namespace are stuck in Pending, Failed, or +# Unknown phases. apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test metadata: - name: kgateway-health-check + name: agentgateway-health-check spec: timeouts: assert: 5m @@ -29,14 +29,14 @@ spec: - name: validate-deployment-exists try: # Guard against vacuous pass on empty namespace: verify the - # kgateway deployment exists and has at least one ready replica. + # agentgateway deployment exists and has at least one available replica. - assert: resource: apiVersion: apps/v1 kind: Deployment metadata: - name: kgateway - namespace: kgateway-system + name: agentgateway + namespace: agentgateway-system status: (availableReplicas > `0`): true - name: validate-all-pods-healthy @@ -49,7 +49,7 @@ spec: apiVersion: v1 kind: Pod metadata: - namespace: kgateway-system + namespace: agentgateway-system status: phase: Pending - error: @@ -57,7 +57,7 @@ spec: apiVersion: v1 kind: Pod metadata: - namespace: kgateway-system + namespace: agentgateway-system status: phase: Failed - error: @@ -65,6 +65,6 @@ spec: apiVersion: v1 kind: Pod metadata: - namespace: kgateway-system + namespace: agentgateway-system status: phase: Unknown diff --git a/recipes/components/kgateway-crds/manifests/gateway-api-crds.yaml b/recipes/components/agentgateway-crds/manifests/gateway-api-crds.yaml similarity index 99% rename from recipes/components/kgateway-crds/manifests/gateway-api-crds.yaml rename to recipes/components/agentgateway-crds/manifests/gateway-api-crds.yaml index 600edfcf3..71a2566d0 100644 --- a/recipes/components/kgateway-crds/manifests/gateway-api-crds.yaml +++ b/recipes/components/agentgateway-crds/manifests/gateway-api-crds.yaml @@ -1,8 +1,8 @@ # Standard Kubernetes Gateway API CRDs (v1.2.1) # Source: https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.1/standard-install.yaml -# Required by kgateway for Gateway, HTTPRoute, and 
GRPCRoute resources. +# Required by agentgateway for Gateway, HTTPRoute, and GRPCRoute resources. # -# These CRDs are not included in the kgateway-crds Helm chart and must +# These CRDs are not included in the agentgateway-crds Helm chart and must # be installed separately. Vendored here for fully automated deployment. # # aicr/skip-hook-validation: "true" diff --git a/recipes/components/kgateway-crds/manifests/inference-extension-crds.yaml b/recipes/components/agentgateway-crds/manifests/inference-extension-crds.yaml similarity index 99% rename from recipes/components/kgateway-crds/manifests/inference-extension-crds.yaml rename to recipes/components/agentgateway-crds/manifests/inference-extension-crds.yaml index d87d5fc3b..b03890aa4 100644 --- a/recipes/components/kgateway-crds/manifests/inference-extension-crds.yaml +++ b/recipes/components/agentgateway-crds/manifests/inference-extension-crds.yaml @@ -14,9 +14,9 @@ # Gateway API Inference Extension CRDs (v1.3.0) # Source: https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.3.0/manifests.yaml -# Required by kgateway for InferencePool and InferenceModel resources. +# Required by agentgateway for InferencePool and InferenceObjective resources. # -# These CRDs are not included in the kgateway Helm chart and must +# These CRDs are not included in the agentgateway-crds Helm chart and must # be installed separately. Vendored here for fully automated deployment. 
# # aicr/skip-hook-validation: "true" diff --git a/recipes/components/kgateway-crds/values.yaml b/recipes/components/agentgateway-crds/values.yaml similarity index 70% rename from recipes/components/kgateway-crds/values.yaml rename to recipes/components/agentgateway-crds/values.yaml index c6ad4ba61..e67501e93 100644 --- a/recipes/components/kgateway-crds/values.yaml +++ b/recipes/components/agentgateway-crds/values.yaml @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -# kgateway CRDs Helm values -# Installs kgateway-specific CRDs (Backends, DirectResponses, GatewayExtensions, -# GatewayParameters, HTTPListenerPolicies, TrafficPolicies). +# agentgateway CRDs Helm values +# Installs agentgateway-specific CRDs (AgentgatewayBackend, AgentgatewayPolicy, +# AgentgatewayParameters) in the agentgateway.dev/v1alpha1 API group. # -# Note: Standard Gateway API CRDs (GatewayClass, Gateway, HTTPRoute) and -# Inference Extension CRDs (InferencePool, InferenceModel) must be installed -# separately before deploying kgateway. See the kgateway getting started guide: +# Note: Standard Gateway API CRDs (GatewayClass, Gateway, HTTPRoute, GRPCRoute, +# ReferenceGrant) and Gateway API Inference Extension CRDs (InferencePool, +# InferenceObjective) must be installed separately before deploying agentgateway. +# See the upstream guide: # https://gateway-api-inference-extension.sigs.k8s.io/guides/ # This chart has no configurable values — it only installs CRDs. 
diff --git a/recipes/components/kgateway/manifests/inference-gateway.yaml b/recipes/components/agentgateway/manifests/inference-gateway.yaml similarity index 52% rename from recipes/components/kgateway/manifests/inference-gateway.yaml rename to recipes/components/agentgateway/manifests/inference-gateway.yaml index 310e2958e..9b292b347 100644 --- a/recipes/components/kgateway/manifests/inference-gateway.yaml +++ b/recipes/components/agentgateway/manifests/inference-gateway.yaml @@ -14,52 +14,62 @@ # Inference Gateway — provides external access to inference services # via the Kubernetes Gateway API. Any inference workload (dynamo, vLLM, -# TGI, etc.) can route through this gateway using HTTPRoute or -# InferenceModel resources. -{{- $kgw := index .Values "kgateway" }} -# GatewayParameters configures the proxy pod scheduling to match the -# system node tolerations/nodeSelector injected by the bundler into -# the kgateway controller. Without this, proxy pods land on any -# untainted node instead of system infrastructure nodes. +# TGI, etc.) can route through this gateway using HTTPRoute backendRefs +# to InferencePool resources or plain Service backends. +{{- $agw := index .Values "agentgateway" }} +# AgentgatewayParameters configures the proxy pod scheduling to match the +# system node tolerations/nodeSelector injected by the bundler into the +# agentgateway controller. Without this, proxy pods land on any untainted +# node instead of system infrastructure nodes. The new API uses a +# strategic-merge-patch into the generated Deployment spec. 
--- -apiVersion: gateway.kgateway.dev/v1alpha1 -kind: GatewayParameters +apiVersion: agentgateway.dev/v1alpha1 +kind: AgentgatewayParameters metadata: name: system-proxy - namespace: kgateway-system + namespace: agentgateway-system annotations: "helm.sh/hook": post-install,post-upgrade "helm.sh/hook-weight": "5" "helm.sh/hook-delete-policy": before-hook-creation spec: - kube: - podTemplate: - extraLabels: + deployment: + metadata: + labels: app.kubernetes.io/managed-by: aicr - {{- if $kgw.nodeSelector }} - nodeSelector: - {{- toYaml $kgw.nodeSelector | nindent 8 }} - {{- end }} - {{- if $kgw.tolerations }} - tolerations: - {{- toYaml $kgw.tolerations | nindent 8 }} - {{- end }} + spec: + template: + metadata: + labels: + # Mirrors the old kgateway GatewayParameters podTemplate.extraLabels + # behavior so the proxy pods themselves carry the AICR ownership + # label, not just the Deployment. + app.kubernetes.io/managed-by: aicr + spec: + {{- if $agw.nodeSelector }} + nodeSelector: + {{- toYaml $agw.nodeSelector | nindent 12 }} + {{- end }} + {{- if $agw.tolerations }} + tolerations: + {{- toYaml $agw.tolerations | nindent 12 }} + {{- end }} --- apiVersion: gateway.networking.k8s.io/v1 kind: Gateway metadata: name: inference-gateway - namespace: kgateway-system + namespace: agentgateway-system annotations: "helm.sh/hook": post-install,post-upgrade "helm.sh/hook-weight": "10" "helm.sh/hook-delete-policy": before-hook-creation spec: - gatewayClassName: kgateway + gatewayClassName: agentgateway infrastructure: parametersRef: - group: gateway.kgateway.dev - kind: GatewayParameters + group: agentgateway.dev + kind: AgentgatewayParameters name: system-proxy listeners: - name: http diff --git a/recipes/components/kgateway/values.yaml b/recipes/components/agentgateway/values.yaml similarity index 75% rename from recipes/components/kgateway/values.yaml rename to recipes/components/agentgateway/values.yaml index 82b3232ce..f6a20e981 100644 --- 
a/recipes/components/kgateway/values.yaml +++ b/recipes/components/agentgateway/values.yaml @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -# kgateway Helm values -# Gateway API and Inference Gateway v1.0.0 conformant implementation. +# agentgateway Helm values +# Kubernetes Gateway API + Gateway API Inference Extension implementation. # Satisfies CNCF AI Conformance Advanced Ingress for AI/ML Inference requirement. -# Provides model-aware routing, weighted traffic splitting, and header-based routing. +# Provides model-aware routing, weighted traffic splitting, and header-based routing +# for InferencePool backends. # Override release name prefix to avoid aicr-stack- prefix -fullnameOverride: kgateway +fullnameOverride: agentgateway resources: requests: @@ -28,6 +29,7 @@ resources: cpu: 500m memory: 256Mi -# Enable Gateway API Inference Extension for model-aware routing +# Enable Gateway API Inference Extension support in the agentgateway controller. +# Required for routing to InferencePool backends. 
inferenceExtension: enabled: true diff --git a/recipes/mixins/platform-inference.yaml b/recipes/mixins/platform-inference.yaml index 71b02c7b1..1135cfb7c 100644 --- a/recipes/mixins/platform-inference.yaml +++ b/recipes/mixins/platform-inference.yaml @@ -18,22 +18,22 @@ metadata: name: platform-inference spec: componentRefs: - - name: kgateway-crds + - name: agentgateway-crds type: Helm - source: oci://cr.kgateway.dev/kgateway-dev/charts - version: v2.0.0 - valuesFile: components/kgateway-crds/values.yaml + source: oci://cr.agentgateway.dev/charts + version: v2.2.1 + valuesFile: components/agentgateway-crds/values.yaml manifestFiles: - - components/kgateway-crds/manifests/gateway-api-crds.yaml - - components/kgateway-crds/manifests/inference-extension-crds.yaml + - components/agentgateway-crds/manifests/gateway-api-crds.yaml + - components/agentgateway-crds/manifests/inference-extension-crds.yaml - - name: kgateway + - name: agentgateway type: Helm - source: oci://cr.kgateway.dev/kgateway-dev/charts - version: v2.0.0 - valuesFile: components/kgateway/values.yaml + source: oci://cr.agentgateway.dev/charts + version: v2.2.1 + valuesFile: components/agentgateway/values.yaml manifestFiles: - - components/kgateway/manifests/inference-gateway.yaml + - components/agentgateway/manifests/inference-gateway.yaml dependencyRefs: - - kgateway-crds + - agentgateway-crds - cert-manager diff --git a/recipes/registry.yaml b/recipes/registry.yaml index 43538bfdd..228fe5d8d 100644 --- a/recipes/registry.yaml +++ b/recipes/registry.yaml @@ -425,27 +425,27 @@ components: tolerationPaths: - dynamo-operator.controllerManager.tolerations - - name: kgateway-crds - displayName: kgateway-crds + - name: agentgateway-crds + displayName: agentgateway-crds valueOverrideKeys: - - kgatewaycrds + - agentgatewaycrds helm: - defaultRepository: oci://cr.kgateway.dev/kgateway-dev/charts - defaultChart: kgateway-crds - defaultVersion: v2.0.0 - defaultNamespace: kgateway-system + defaultRepository: 
oci://cr.agentgateway.dev/charts + defaultChart: agentgateway-crds + defaultVersion: v2.2.1 + defaultNamespace: agentgateway-system - - name: kgateway - displayName: kgateway + - name: agentgateway + displayName: agentgateway valueOverrideKeys: - - kgateway + - agentgateway healthCheck: - assertFile: checks/kgateway/health-check.yaml + assertFile: checks/agentgateway/health-check.yaml helm: - defaultRepository: oci://cr.kgateway.dev/kgateway-dev/charts - defaultChart: kgateway - defaultVersion: v2.0.0 - defaultNamespace: kgateway-system + defaultRepository: oci://cr.agentgateway.dev/charts + defaultChart: agentgateway + defaultVersion: v2.2.1 + defaultNamespace: agentgateway-system nodeScheduling: system: nodeSelectorPaths: diff --git a/recipes/validators/README.md b/recipes/validators/README.md index cd43075a9..073efa212 100644 --- a/recipes/validators/README.md +++ b/recipes/validators/README.md @@ -58,7 +58,7 @@ Applied by `catalog.Load` (`pkg/validator/catalog/catalog.go`) in order: | `gang-scheduling` | Verify gang scheduling with KAI scheduler using CPU-only workers | 10m | | `accelerator-metrics` | Verify accelerator metrics from DCGM exporter | 5m | | `ai-service-metrics` | Verify AI service metrics via Prometheus | 5m | -| `inference-gateway` | Verify inference gateway (kgateway) is operational | 5m | +| `inference-gateway` | Verify inference gateway (agentgateway) is operational | 5m | | `pod-autoscaling` | Verify HPA-driven pod autoscaling with GPU metrics | 10m | | `cluster-autoscaling` | Verify cluster autoscaling with Karpenter | 10m | | `robust-controller` | Verify Dynamo operator controller and webhooks | 5m | diff --git a/recipes/validators/catalog.yaml b/recipes/validators/catalog.yaml index 37697a1df..2ebc2d49a 100644 --- a/recipes/validators/catalog.yaml +++ b/recipes/validators/catalog.yaml @@ -109,7 +109,7 @@ validators: env: [] - name: inference-gateway phase: conformance - description: "Verify inference gateway (kgateway) is operational" + 
description: "Verify inference gateway (agentgateway) is operational" image: ghcr.io/nvidia/aicr-validators/conformance:latest timeout: 5m args: ["inference-gateway"] diff --git a/tests/chainsaw/ai-conformance/README.md b/tests/chainsaw/ai-conformance/README.md index 90a13d54a..39a84e513 100644 --- a/tests/chainsaw/ai-conformance/README.md +++ b/tests/chainsaw/ai-conformance/README.md @@ -9,7 +9,7 @@ Chainsaw suites validating AI conformance flows across environments: - `common/` — assertions shared by `cluster/` and both Kind GPU suites - `kind-common/` — assertions shared only by Kind GPU suites -The `cluster/` suite validates the NVIDIA AI-conformance inference stack: KAI Scheduler (GPU scheduling), kgateway with Gateway API Inference Extension (inference routing), and the NVIDIA Dynamo serving platform. +The `cluster/` suite validates the NVIDIA AI-conformance inference stack: KAI Scheduler (GPU scheduling), agentgateway with Gateway API Inference Extension (inference routing), and the NVIDIA Dynamo serving platform. 
## Cluster Inference Recipe @@ -56,8 +56,8 @@ The Kind GPU workflows use these leaf recipes instead: | prometheus-adapter | monitoring | Helm | Deployment | | aws-ebs-csi-driver | kube-system | Helm | **Disabled by default** (EKS managed addon) | | aws-efa | kube-system | Helm | Device plugin DaemonSet | -| kgateway-crds | kgateway-system | Helm | CRDs only (Gateway API + Inference Extension) | -| kgateway | kgateway-system | Helm | Controller Deployment | +| agentgateway-crds | agentgateway-system | Helm | CRDs only (Gateway API + Inference Extension) | +| agentgateway | agentgateway-system | Helm | Controller Deployment | | nodewright-customizations | skyhook | Manifest | No workloads (NodeConfiguration CRs) | | nvidia-dra-driver-gpu | nvidia-dra-driver | Helm | Controller Deployment, kubelet-plugin DaemonSet | | kai-scheduler | kai-scheduler | Helm | Scheduler Deployment | @@ -84,7 +84,7 @@ tests/chainsaw/ai-conformance/ │ ├── chainsaw-test.yaml # Inference leaf health check orchestration │ ├── assert-crds.yaml # Inference-specific CRDs installed │ ├── assert-dynamo.yaml # Dynamo platform healthy on kind -│ ├── assert-kgateway.yaml # kgateway healthy on kind +│ ├── assert-agentgateway.yaml # agentgateway healthy on kind │ └── assert-namespaces.yaml # Inference-specific namespaces exist ├── kind-training-kubeflow/ # Kind + H100 + training + kubeflow leaf suite │ ├── chainsaw-test.yaml # Training leaf health check orchestration @@ -100,7 +100,7 @@ tests/chainsaw/ai-conformance/ ├── assert-crds.yaml # Critical CRDs installed ├── assert-gpu-operator.yaml # GPU operator + DaemonSets healthy ├── assert-kube-system.yaml # AWS EFA healthy - ├── assert-kgateway.yaml # kgateway healthy + ├── assert-agentgateway.yaml # agentgateway healthy ├── assert-nvsentinel.yaml # NVSentinel healthy └── assert-dynamo.yaml # Dynamo platform healthy ``` @@ -159,7 +159,7 @@ chainsaw test \ | Component Group | Timeout | Reason | |-----------------|---------|--------| | Namespaces, CRDs | 
2m | Should exist immediately after deployment | -| cert-manager, kgateway, skyhook, monitoring, kai-scheduler | 5m | Standard Deployment rollout | +| cert-manager, agentgateway, skyhook, monitoring, kai-scheduler | 5m | Standard Deployment rollout | | gpu-operator, nvidia-dra-driver-gpu | 10m | GPU driver compilation on nodes is slow | | dynamo-platform | 5m | Operator + etcd + NATS startup | diff --git a/tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml b/tests/chainsaw/ai-conformance/cluster/assert-agentgateway.yaml similarity index 75% rename from tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml rename to tests/chainsaw/ai-conformance/cluster/assert-agentgateway.yaml index aebdf961d..be151ce4a 100644 --- a/tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml +++ b/tests/chainsaw/ai-conformance/cluster/assert-agentgateway.yaml @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Assert kgateway controller is available. -# fullnameOverride: kgateway (from values.yaml) -# Chart: kgateway v2.0.0 (oci://cr.kgateway.dev/kgateway-dev/charts) +# Assert agentgateway controller is available. +# fullnameOverride: agentgateway (from values.yaml) +# Chart: agentgateway v2.2.1 (oci://cr.agentgateway.dev/charts) # Satisfies CNCF AI Conformance Advanced Ingress for AI/ML Inference. -# Implements Gateway API + Inference Extension for model-aware routing. +# Implements Gateway API + Inference Extension for InferencePool routing. 
apiVersion: apps/v1 kind: Deployment metadata: - name: kgateway - namespace: kgateway-system + name: agentgateway + namespace: agentgateway-system status: (conditions[?type == 'Available']): - status: "True" @@ -31,7 +31,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: inference-gateway - namespace: kgateway-system + namespace: agentgateway-system status: (conditions[?type == 'Available']): - status: "True" diff --git a/tests/chainsaw/ai-conformance/cluster/assert-crds.yaml b/tests/chainsaw/ai-conformance/cluster/assert-crds.yaml index e5524c5cf..e2cc501b8 100644 --- a/tests/chainsaw/ai-conformance/cluster/assert-crds.yaml +++ b/tests/chainsaw/ai-conformance/cluster/assert-crds.yaml @@ -13,7 +13,7 @@ # limitations under the License. # Assert that critical CRDs are installed by the AI-conformance inference stack. -# Covers CRD-only components (kgateway-crds) and operator-managed CRDs (dynamo-platform). +# Covers CRD-only components (agentgateway-crds) and operator-managed CRDs (dynamo-platform). 
# ── GPU Operator ─────────────────────────────────────────────────────── # ClusterPolicy CRD — the GPU operator's primary configuration object @@ -38,7 +38,7 @@ kind: CustomResourceDefinition metadata: name: clusterissuers.cert-manager.io --- -# ── kgateway-crds (Gateway API + Inference Extension) ───────────────── +# ── agentgateway-crds (Gateway API + Inference Extension) ──────────── apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: diff --git a/tests/chainsaw/ai-conformance/cluster/assert-namespaces.yaml b/tests/chainsaw/ai-conformance/cluster/assert-namespaces.yaml index 39319adae..78d069bf2 100644 --- a/tests/chainsaw/ai-conformance/cluster/assert-namespaces.yaml +++ b/tests/chainsaw/ai-conformance/cluster/assert-namespaces.yaml @@ -53,7 +53,7 @@ status: apiVersion: v1 kind: Namespace metadata: - name: kgateway-system + name: agentgateway-system status: phase: Active --- diff --git a/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml index 2c78cba60..17049f90f 100644 --- a/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml @@ -40,7 +40,7 @@ spec: # ── CRDs ─────────────────────────────────────────────────────────── - name: assert-crds - description: Verify critical CRDs are installed (kgateway, dynamo, GPU operator, cert-manager). + description: Verify critical CRDs are installed (agentgateway, dynamo, GPU operator, cert-manager). timeouts: assert: 120s try: @@ -77,12 +77,12 @@ spec: - assert: file: assert-kube-system.yaml - # ── kgateway ─────────────────────────────────────────────────────── - - name: assert-kgateway - description: Verify kgateway controller is available. + # ── agentgateway ─────────────────────────────────────────────────── + - name: assert-agentgateway + description: Verify agentgateway controller is available. 
try: - assert: - file: assert-kgateway.yaml + file: assert-agentgateway.yaml # ── Skyhook ──────────────────────────────────────────────────────── - name: assert-skyhook diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-agentgateway.yaml similarity index 83% rename from tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml rename to tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-agentgateway.yaml index ee45021e9..889b8c5ea 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-agentgateway.yaml @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Assert kgateway controller is available on the kind inference + Dynamo stack. +# Assert agentgateway controller is available on the kind inference + Dynamo stack. 
apiVersion: apps/v1 kind: Deployment metadata: - name: kgateway - namespace: kgateway-system + name: agentgateway + namespace: agentgateway-system status: (conditions[?type == 'Available']): - status: "True" @@ -26,7 +26,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: inference-gateway - namespace: kgateway-system + namespace: agentgateway-system status: (conditions[?type == 'Available']): - status: "True" diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml index fe0e152d6..7791b3711 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml @@ -36,7 +36,7 @@ kind: CustomResourceDefinition metadata: name: clusterissuers.cert-manager.io --- -# kgateway-crds (Gateway API + Inference Extension) +# agentgateway-crds (Gateway API + Inference Extension) apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-namespaces.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-namespaces.yaml index cd65aebc6..c8b853f33 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-namespaces.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-namespaces.yaml @@ -54,7 +54,7 @@ status: apiVersion: v1 kind: Namespace metadata: - name: kgateway-system + name: agentgateway-system status: phase: Active --- diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml index a9e2510bd..8711c88a4 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml @@ -40,7 +40,7 @@ spec: # ── CRDs ─────────────────────────────────────────────────────────── - name: 
assert-crds - description: Verify critical CRDs are installed (kgateway, dynamo, GPU operator, cert-manager). + description: Verify critical CRDs are installed (agentgateway, dynamo, GPU operator, cert-manager). timeouts: assert: 120s try: @@ -70,12 +70,12 @@ spec: - assert: file: ../kind-common/assert-monitoring.yaml - # ── kgateway ─────────────────────────────────────────────────────── - - name: assert-kgateway - description: Verify kgateway controller is available. + # ── agentgateway ─────────────────────────────────────────────────── + - name: assert-agentgateway + description: Verify agentgateway controller is available. try: - assert: - file: assert-kgateway.yaml + file: assert-agentgateway.yaml # ── Skyhook ──────────────────────────────────────────────────────── - name: assert-skyhook diff --git a/tests/chainsaw/ai-conformance/offline/assert-recipe.yaml b/tests/chainsaw/ai-conformance/offline/assert-recipe.yaml index 1483d9de9..2a730dc02 100644 --- a/tests/chainsaw/ai-conformance/offline/assert-recipe.yaml +++ b/tests/chainsaw/ai-conformance/offline/assert-recipe.yaml @@ -16,7 +16,7 @@ # # Validates that `aicr recipe --service eks --accelerator h100 --intent inference # --os ubuntu --platform dynamo` produces a valid recipe with correct criteria, -# inference-specific components (kgateway, DRA driver, KAI scheduler, Dynamo), +# inference-specific components (agentgateway, DRA driver, KAI scheduler, Dynamo), # and standard GPU stack components. 
kind: RecipeResult apiVersion: aicr.nvidia.com/v1alpha1 @@ -36,6 +36,8 @@ constraints: - name: OS.sysctl./proc/sys/kernel/osrelease value: '>= 6.8' componentRefs: ## alphabetically sorted + - name: agentgateway + - name: agentgateway-crds - name: aws-ebs-csi-driver - name: aws-efa - name: cert-manager @@ -44,8 +46,6 @@ componentRefs: ## alphabetically sorted - name: grove - name: k8s-ephemeral-storage-metrics - name: kai-scheduler - - name: kgateway - - name: kgateway-crds - name: kube-prometheus-stack - name: nfd - name: nodewright-customizations @@ -54,12 +54,12 @@ componentRefs: ## alphabetically sorted - name: nvsentinel - name: prometheus-adapter deploymentOrder: + - agentgateway-crds - aws-ebs-csi-driver - aws-efa - cert-manager + - agentgateway - grove - - kgateway-crds - - kgateway - kube-prometheus-stack - k8s-ephemeral-storage-metrics - nfd diff --git a/tests/chainsaw/ai-conformance/offline/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/offline/chainsaw-test.yaml index 6bf6d7b56..fdf0d1dff 100644 --- a/tests/chainsaw/ai-conformance/offline/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/offline/chainsaw-test.yaml @@ -90,7 +90,7 @@ spec: aws-efa cert-manager \ dynamo-platform grove gpu-operator \ k8s-ephemeral-storage-metrics kai-scheduler \ - kgateway kgateway-crds kube-prometheus-stack \ + agentgateway agentgateway-crds kube-prometheus-stack \ nvidia-dra-driver-gpu nvsentinel prometheus-adapter \ nodewright-customizations nodewright-operator; do match=$(ls -d "${WORK}"/bundle/[0-9][0-9][0-9]-"${component}" 2>/dev/null | head -1) diff --git a/tests/chainsaw/bundle-templates/kgateway/chainsaw-test.yaml b/tests/chainsaw/bundle-templates/agentgateway/chainsaw-test.yaml similarity index 66% rename from tests/chainsaw/bundle-templates/kgateway/chainsaw-test.yaml rename to tests/chainsaw/bundle-templates/agentgateway/chainsaw-test.yaml index 17730c934..77bf2f919 100644 --- a/tests/chainsaw/bundle-templates/kgateway/chainsaw-test.yaml +++ 
b/tests/chainsaw/bundle-templates/agentgateway/chainsaw-test.yaml @@ -16,23 +16,23 @@ apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test metadata: - name: cli-bundle-kgateway-templates + name: cli-bundle-agentgateway-templates spec: description: | - Validates that kgateway manifest templates render correctly. - Tests inference-gateway GatewayParameters scheduling (nodeSelector/tolerations). - Run with: AICR_BIN=$(pwd)/dist/e2e/aicr chainsaw test --no-cluster --test-dir tests/chainsaw/bundle-templates/kgateway + Validates that agentgateway manifest templates render correctly. + Tests inference-gateway AgentgatewayParameters scheduling (nodeSelector/tolerations). + Run with: AICR_BIN=$(pwd)/dist/e2e/aicr chainsaw test --no-cluster --test-dir tests/chainsaw/bundle-templates/agentgateway timeouts: exec: 30s steps: - name: generate-recipe - description: Generate an EKS H100 inference recipe (kgateway is inference-only). + description: Generate an EKS H100 inference recipe (agentgateway is inference-only). try: - script: content: | AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" - WORK="/tmp/chainsaw-bundle-kgateway-templates" + WORK="/tmp/chainsaw-bundle-agentgateway-templates" rm -rf "${WORK}" && mkdir -p "${WORK}" ${AICR_BIN} recipe \ --service eks \ @@ -49,26 +49,26 @@ spec: - script: content: | AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" - WORK="/tmp/chainsaw-bundle-kgateway-templates" + WORK="/tmp/chainsaw-bundle-agentgateway-templates" rm -rf "${WORK}/bundle-defaults" ${AICR_BIN} bundle \ --recipe "${WORK}/recipe.yaml" \ --output "${WORK}/bundle-defaults" - name: assert-gateway-defaults - description: Verify GatewayParameters and Gateway resources exist. + description: Verify AgentgatewayParameters and Gateway resources exist. 
try: - script: content: | - WORK="/tmp/chainsaw-bundle-kgateway-templates" - ## kgateway is a mixed component (upstream Helm + raw manifests), - ## so its raw manifests render into an injected NNN-kgateway-post/ + WORK="/tmp/chainsaw-bundle-agentgateway-templates" + ## agentgateway is a mixed component (upstream Helm + raw manifests), + ## so its raw manifests render into an injected NNN-agentgateway-post/ ## wrapped chart's templates/ folder under #662's layout. - MANIFEST=$(ls "${WORK}"/bundle-defaults/[0-9][0-9][0-9]-kgateway-post/templates/inference-gateway.yaml 2>/dev/null | head -1) - [ -n "${MANIFEST}" ] || { echo "kgateway inference-gateway.yaml not found" >&2; exit 1; } - ## The manifest contains two resources (GatewayParameters + Gateway). + MANIFEST=$(ls "${WORK}"/bundle-defaults/[0-9][0-9][0-9]-agentgateway-post/templates/inference-gateway.yaml 2>/dev/null | head -1) + [ -n "${MANIFEST}" ] || { echo "agentgateway inference-gateway.yaml not found" >&2; exit 1; } + ## The manifest contains two resources (AgentgatewayParameters + Gateway). ## Verify both are present. - grep -q 'kind: GatewayParameters' "${MANIFEST}" + grep -q 'kind: AgentgatewayParameters' "${MANIFEST}" grep -q 'kind: Gateway' "${MANIFEST}" ## ── With system node scheduling ──────────────────────────────────── @@ -79,7 +79,7 @@ spec: - script: content: | AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" - WORK="/tmp/chainsaw-bundle-kgateway-templates" + WORK="/tmp/chainsaw-bundle-agentgateway-templates" rm -rf "${WORK}/bundle-scheduling" ${AICR_BIN} bundle \ --recipe "${WORK}/recipe.yaml" \ @@ -87,17 +87,17 @@ spec: --system-node-selector nodeGroup=system-pool - name: assert-gateway-scheduling - description: Verify GatewayParameters has nodeSelector for proxy pods. + description: Verify AgentgatewayParameters has nodeSelector for proxy pods. 
try: - script: content: | - WORK="/tmp/chainsaw-bundle-kgateway-templates" - MANIFEST=$(ls "${WORK}"/bundle-scheduling/[0-9][0-9][0-9]-kgateway-post/templates/inference-gateway.yaml 2>/dev/null | head -1) - [ -n "${MANIFEST}" ] || { echo "kgateway inference-gateway.yaml not found in scheduling bundle" >&2; exit 1; } - ## Verify nodeSelector was injected into GatewayParameters + WORK="/tmp/chainsaw-bundle-agentgateway-templates" + MANIFEST=$(ls "${WORK}"/bundle-scheduling/[0-9][0-9][0-9]-agentgateway-post/templates/inference-gateway.yaml 2>/dev/null | head -1) + [ -n "${MANIFEST}" ] || { echo "agentgateway inference-gateway.yaml not found in scheduling bundle" >&2; exit 1; } + ## Verify nodeSelector was injected into AgentgatewayParameters grep -q 'nodeSelector:' "${MANIFEST}" grep -q 'nodeGroup: system-pool' "${MANIFEST}" cleanup: - script: content: | - rm -rf /tmp/chainsaw-bundle-kgateway-templates + rm -rf /tmp/chainsaw-bundle-agentgateway-templates diff --git a/tests/uat/aws/tests/cuj2-inference/assert-recipe.yaml b/tests/uat/aws/tests/cuj2-inference/assert-recipe.yaml index 2633b1283..1affdd007 100644 --- a/tests/uat/aws/tests/cuj2-inference/assert-recipe.yaml +++ b/tests/uat/aws/tests/cuj2-inference/assert-recipe.yaml @@ -16,7 +16,7 @@ # # Validates that `aicr recipe --service eks --accelerator h100 --intent inference # --os ubuntu --platform dynamo` produces a valid recipe with correct criteria, -# inference-specific components (kgateway, DRA driver, KAI scheduler, Dynamo), +# inference-specific components (agentgateway, DRA driver, KAI scheduler, Dynamo), # and standard GPU stack components. 
kind: RecipeResult apiVersion: aicr.nvidia.com/v1alpha1 @@ -36,17 +36,16 @@ constraints: - name: OS.sysctl./proc/sys/kernel/osrelease value: '>= 6.8' componentRefs: ## alphabetically sorted + - name: agentgateway + - name: agentgateway-crds - name: aws-ebs-csi-driver - name: aws-efa - name: cert-manager - name: dynamo-platform - - name: grove - name: gpu-operator - name: grove - name: k8s-ephemeral-storage-metrics - name: kai-scheduler - - name: kgateway - - name: kgateway-crds - name: kube-prometheus-stack - name: nfd - name: nodewright-customizations @@ -55,15 +54,15 @@ componentRefs: ## alphabetically sorted - name: nvsentinel - name: prometheus-adapter deploymentOrder: + - agentgateway-crds - aws-ebs-csi-driver - aws-efa - cert-manager + - agentgateway - grove - - kgateway-crds - - kgateway - kube-prometheus-stack - k8s-ephemeral-storage-metrics - - prometheus-adapter + - nfd - nodewright-operator - nodewright-customizations - gpu-operator @@ -71,3 +70,4 @@ deploymentOrder: - dynamo-platform - nvidia-dra-driver-gpu - nvsentinel + - prometheus-adapter diff --git a/tests/uat/azure/tests/cuj2-inference/assert-recipe.yaml b/tests/uat/azure/tests/cuj2-inference/assert-recipe.yaml index 55851ac73..e151184fb 100644 --- a/tests/uat/azure/tests/cuj2-inference/assert-recipe.yaml +++ b/tests/uat/azure/tests/cuj2-inference/assert-recipe.yaml @@ -16,7 +16,7 @@ # # Validates that `aicr recipe --service aks --accelerator h100 --intent inference # --os ubuntu --platform dynamo` produces a valid recipe with correct criteria, -# inference-specific components (kgateway, DRA driver, KAI scheduler, Dynamo), +# inference-specific components (agentgateway, DRA driver, KAI scheduler, Dynamo), # and standard GPU stack components. 
kind: RecipeResult apiVersion: aicr.nvidia.com/v1alpha1 @@ -36,15 +36,14 @@ constraints: - name: OS.sysctl./proc/sys/kernel/osrelease value: '>= 6.8' componentRefs: ## alphabetically sorted + - name: agentgateway + - name: agentgateway-crds - name: cert-manager - name: dynamo-platform - - name: grove - name: gpu-operator - name: grove - name: k8s-ephemeral-storage-metrics - name: kai-scheduler - - name: kgateway - - name: kgateway-crds - name: kube-prometheus-stack - name: network-operator - name: nfd @@ -53,17 +52,18 @@ componentRefs: ## alphabetically sorted - name: nvsentinel - name: prometheus-adapter deploymentOrder: + - agentgateway-crds - cert-manager + - agentgateway - grove - - kgateway-crds - - kgateway - kube-prometheus-stack - - gpu-operator - k8s-ephemeral-storage-metrics + - nfd + - network-operator + - gpu-operator - kai-scheduler - dynamo-platform - - network-operator + - nodewright-operator - nvidia-dra-driver-gpu - nvsentinel - prometheus-adapter - - nodewright-operator diff --git a/tests/uat/gcp/tests/cuj2-inference/assert-recipe.yaml b/tests/uat/gcp/tests/cuj2-inference/assert-recipe.yaml index b17ba261a..9c1ad23b6 100644 --- a/tests/uat/gcp/tests/cuj2-inference/assert-recipe.yaml +++ b/tests/uat/gcp/tests/cuj2-inference/assert-recipe.yaml @@ -16,7 +16,7 @@ # # Validates that `aicr recipe --service gke --accelerator h100 --intent inference # --os cos --platform dynamo` produces a valid recipe with correct criteria, -# inference-specific components (kgateway, DRA driver, KAI scheduler, Dynamo), +# inference-specific components (agentgateway, DRA driver, KAI scheduler, Dynamo), # and standard GPU stack components. 
kind: RecipeResult apiVersion: aicr.nvidia.com/v1alpha1 @@ -30,15 +30,14 @@ constraints: - name: K8s.server.version value: '>= 1.34' componentRefs: ## alphabetically sorted + - name: agentgateway + - name: agentgateway-crds - name: cert-manager - name: dynamo-platform - - name: grove - name: gpu-operator - name: grove - name: k8s-ephemeral-storage-metrics - name: kai-scheduler - - name: kgateway - - name: kgateway-crds - name: kube-prometheus-stack - name: nfd - name: nodewright-customizations @@ -47,13 +46,13 @@ componentRefs: ## alphabetically sorted - name: nvsentinel - name: prometheus-adapter deploymentOrder: + - agentgateway-crds - cert-manager + - agentgateway - grove - - kgateway-crds - - kgateway - kube-prometheus-stack - k8s-ephemeral-storage-metrics - - prometheus-adapter + - nfd - nodewright-operator - nodewright-customizations - gpu-operator @@ -61,3 +60,4 @@ deploymentOrder: - dynamo-platform - nvidia-dra-driver-gpu - nvsentinel + - prometheus-adapter diff --git a/validators/conformance/inference_gateway_check.go b/validators/conformance/inference_gateway_check.go index 12246dc14..758c8648d 100644 --- a/validators/conformance/inference_gateway_check.go +++ b/validators/conformance/inference_gateway_check.go @@ -39,13 +39,13 @@ type gatewayDataPlaneReport struct { } // CheckInferenceGateway validates CNCF requirement #6: Inference Gateway. -// Verifies GatewayClass "kgateway" is accepted, Gateway "inference-gateway" is programmed, +// Verifies GatewayClass "agentgateway" is accepted, Gateway "inference-gateway" is programmed, // and required Gateway API + InferencePool CRDs exist. func CheckInferenceGateway(ctx *validators.Context) error { - // Skip if the recipe does not include kgateway (inference gateway component). + // Skip if the recipe does not include agentgateway (inference gateway component). // Training clusters typically don't have an inference gateway. 
- if !recipeHasComponent(ctx, "kgateway") { - return validators.Skip("kgateway not in recipe — inference gateway check applies to inference clusters only") + if !recipeHasComponent(ctx, "agentgateway") { + return validators.Skip("agentgateway not in recipe — inference gateway check applies to inference clusters only") } dynClient, err := getDynamicClient(ctx) @@ -55,13 +55,13 @@ func CheckInferenceGateway(ctx *validators.Context) error { collectGatewayControlPlaneArtifacts(ctx) - // 1. GatewayClass "kgateway" accepted + // 1. GatewayClass "agentgateway" accepted gcGVR := schema.GroupVersionResource{ Group: apiGroupGateway, Version: "v1", Resource: "gatewayclasses", } - gc, err := dynClient.Resource(gcGVR).Get(ctx.Ctx, "kgateway", metav1.GetOptions{}) + gc, err := dynClient.Resource(gcGVR).Get(ctx.Ctx, "agentgateway", metav1.GetOptions{}) if err != nil { - return errors.Wrap(errors.ErrCodeNotFound, "GatewayClass 'kgateway' not found", err) + return errors.Wrap(errors.ErrCodeNotFound, "GatewayClass 'agentgateway' not found", err) } gcCond, condErr := getConditionObservation(gc, "Accepted") if condErr != nil { @@ -74,7 +74,7 @@ func CheckInferenceGateway(ctx *validators.Context) error { } controllerName, _, _ := unstructured.NestedString(gc.Object, "spec", "controllerName") recordRawTextArtifact(ctx, "GatewayClass", - "kubectl get gatewayclass kgateway -o yaml", + "kubectl get gatewayclass agentgateway -o yaml", fmt.Sprintf("Name: %s\nControllerName: %s\nAccepted: %s\nReason: %s\nMessage: %s", gc.GetName(), valueOrUnknown(controllerName), gcCond.Status, gcCond.Reason, gcCond.Message)) @@ -82,7 +82,7 @@ func CheckInferenceGateway(ctx *validators.Context) error { gwGVR := schema.GroupVersionResource{ Group: apiGroupGateway, Version: "v1", Resource: "gateways", } - gw, err := dynClient.Resource(gwGVR).Namespace("kgateway-system").Get( + gw, err := dynClient.Resource(gwGVR).Namespace("agentgateway-system").Get( ctx.Ctx, "inference-gateway", metav1.GetOptions{}) if err != 
nil { return errors.Wrap(errors.ErrCodeNotFound, "Gateway 'inference-gateway' not found", err) @@ -106,7 +106,7 @@ func CheckInferenceGateway(ctx *validators.Context) error { fmt.Sprintf("Name: %s/%s\nProgrammed: %s\nReason: %s\nMessage: %s\nAddressCount: %d", gw.GetNamespace(), gw.GetName(), gwCond.Status, gwCond.Reason, gwCond.Message, addressCount)) recordObjectYAMLArtifact(ctx, "Gateway details", - "kubectl get gateway inference-gateway -n kgateway-system -o yaml", gw.Object) + "kubectl get gateway inference-gateway -n agentgateway-system -o yaml", gw.Object) // 3. Required CRDs exist crdGVR := schema.GroupVersionResource{ @@ -133,7 +133,7 @@ func CheckInferenceGateway(ctx *validators.Context) error { return err } recordRawTextArtifact(ctx, "Gateway Data Plane", - "kubectl get endpointslices -n kgateway-system", + "kubectl get endpointslices -n agentgateway-system", fmt.Sprintf("Listeners: %d\nAttached HTTPRoutes: %d\nHTTPRoutes (all): %d\nMatching EndpointSlices: %d\nReady endpoints: %d", report.ListenerCount, report.AttachedHTTPRoutes, report.TotalHTTPRoutes, report.MatchingEndpointSlice, report.ReadyEndpoints)) @@ -159,7 +159,7 @@ func validateGatewayDataPlane(ctx *validators.Context) (*gatewayDataPlaneReport, gwGVR := schema.GroupVersionResource{ Group: apiGroupGateway, Version: "v1", Resource: "gateways", } - gw, gwErr := dynClient.Resource(gwGVR).Namespace("kgateway-system").Get( + gw, gwErr := dynClient.Resource(gwGVR).Namespace("agentgateway-system").Get( ctx.Ctx, "inference-gateway", metav1.GetOptions{}) if gwErr == nil { listeners, found, _ := unstructured.NestedSlice(gw.Object, "status", "listeners") @@ -204,11 +204,11 @@ func validateGatewayDataPlane(ctx *validators.Context) (*gatewayDataPlaneReport, // 3. Endpoint readiness (hard requirement): verify inference-gateway proxy has ready endpoints. // Filter by kubernetes.io/service-name containing "inference-gateway" to avoid matching // unrelated services in the namespace (e.g. 
controller manager, webhooks). - slices, err := ctx.Clientset.DiscoveryV1().EndpointSlices("kgateway-system").List( + slices, err := ctx.Clientset.DiscoveryV1().EndpointSlices("agentgateway-system").List( ctx.Ctx, metav1.ListOptions{}) if err != nil { return nil, errors.Wrap(errors.ErrCodeInternal, - "failed to list EndpointSlices in kgateway-system", err) + "failed to list EndpointSlices in agentgateway-system", err) } for _, slice := range slices.Items { @@ -226,7 +226,7 @@ func validateGatewayDataPlane(ctx *validators.Context) (*gatewayDataPlaneReport, if report.ReadyEndpoints == 0 { return nil, errors.New(errors.ErrCodeInternal, - "no ready endpoints for inference-gateway proxy in kgateway-system") + "no ready endpoints for inference-gateway proxy in agentgateway-system") } return report, nil @@ -237,10 +237,10 @@ func collectGatewayControlPlaneArtifacts(ctx *validators.Context) { return } - deploys, deployErr := ctx.Clientset.AppsV1().Deployments("kgateway-system").List( + deploys, deployErr := ctx.Clientset.AppsV1().Deployments("agentgateway-system").List( ctx.Ctx, metav1.ListOptions{}) if deployErr != nil { - recordRawTextArtifact(ctx, "kgateway deployments", "kubectl get deploy -n kgateway-system", + recordRawTextArtifact(ctx, "agentgateway deployments", "kubectl get deploy -n agentgateway-system", fmt.Sprintf("failed to list deployments: %v", deployErr)) } else { var deploymentSummary strings.Builder @@ -252,12 +252,12 @@ func collectGatewayControlPlaneArtifacts(ctx *validators.Context) { fmt.Fprintf(&deploymentSummary, "%-40s available=%d/%d image=%s\n", d.Name, d.Status.AvailableReplicas, expected, firstContainerImage(d.Spec.Template.Spec.Containers)) } - recordRawTextArtifact(ctx, "kgateway deployments", "kubectl get deploy -n kgateway-system", deploymentSummary.String()) + recordRawTextArtifact(ctx, "agentgateway deployments", "kubectl get deploy -n agentgateway-system", deploymentSummary.String()) } - pods, podErr := 
ctx.Clientset.CoreV1().Pods("kgateway-system").List(ctx.Ctx, metav1.ListOptions{}) + pods, podErr := ctx.Clientset.CoreV1().Pods("agentgateway-system").List(ctx.Ctx, metav1.ListOptions{}) if podErr != nil { - recordRawTextArtifact(ctx, "kgateway pods", "kubectl get pods -n kgateway-system", + recordRawTextArtifact(ctx, "agentgateway pods", "kubectl get pods -n agentgateway-system", fmt.Sprintf("failed to list pods: %v", podErr)) return } @@ -266,5 +266,5 @@ func collectGatewayControlPlaneArtifacts(ctx *validators.Context) { fmt.Fprintf(&podSummary, "%-48s ready=%s phase=%s node=%s\n", pod.Name, podReadyCount(pod), pod.Status.Phase, valueOrUnknown(pod.Spec.NodeName)) } - recordRawTextArtifact(ctx, "kgateway pods", "kubectl get pods -n kgateway-system", podSummary.String()) + recordRawTextArtifact(ctx, "agentgateway pods", "kubectl get pods -n agentgateway-system", podSummary.String()) }