diff --git a/.github/scripts/gpu-debug-diagnostics.sh b/.github/scripts/gpu-debug-diagnostics.sh index 089e32ca2..3576ba386 100644 --- a/.github/scripts/gpu-debug-diagnostics.sh +++ b/.github/scripts/gpu-debug-diagnostics.sh @@ -259,9 +259,9 @@ print_dynamo_diagnostics() { kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true } -print_kgateway_diagnostics() { - echo "=== kgateway pods ===" - kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true +print_agentgateway_diagnostics() { + echo "=== agentgateway pods ===" + kubectl_kind -n agentgateway-system get pods -o wide 2>/dev/null || true echo "=== GatewayClass status ===" kubectl_kind get gatewayclass -o yaml 2>/dev/null || true echo "=== Gateway status ===" @@ -280,9 +280,9 @@ case "${mode}" in print_kubeflow_diagnostics ;; inference) - print_h100_common_diagnostics dynamo-system kgateway-system + print_h100_common_diagnostics dynamo-system agentgateway-system print_dynamo_diagnostics - print_kgateway_diagnostics + print_agentgateway_diagnostics ;; *) echo "::error::unknown GPU_TEST_DIAGNOSTIC_MODE: ${mode}" diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index f02fa85b2..154caca93 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -89,8 +89,8 @@ jobs: - 'recipes/components/nodewright-operator/**' - 'recipes/components/nvidia-dra-driver-gpu/**' - 'recipes/components/nvsentinel/**' - - 'recipes/components/kgateway/**' - - 'recipes/components/kgateway-crds/**' + - 'recipes/components/agentgateway/**' + - 'recipes/components/agentgateway-crds/**' - 'recipes/components/grove/**' - 'recipes/components/dynamo-platform/**' - 'recipes/components/prometheus-adapter/**' diff --git a/demos/cuj2-demo.md b/demos/cuj2-demo.md index 4029fd622..102465346 100644 --- a/demos/cuj2-demo.md +++ b/demos/cuj2-demo.md @@ -42,8 +42,8 @@ │ ├── gpu-operator/ (GPU driver, device-plugin, DCGM) │ │ ├── nvidia-dra-driver-gpu/ (Dynamic Resource Allocation) │ │ ├── kai-scheduler/ (gang scheduling) │ - │ ├── kgateway-crds/ (Gateway API + inference CRDs) │ - │ ├── kgateway/ (inference gateway controller) │ + │ ├── agentgateway-crds/ (Gateway API + inference CRDs) │ + │ ├── agentgateway/ (inference gateway controller) │ │ ├── nvsentinel/ (security/compliance) │ │ ├── nodewright-operator/ (node configuration) │ │ ├── nodewright-customizations/ (H100 tuning) │ @@ -60,13 +60,13 @@ │ $ cd bundle && ./deploy.sh │ │ │ │ cert-manager ──▶ kube-prometheus-stack ──▶ gpu-operator ──▶ │ - │ kai-scheduler ──▶ kgateway ──▶ nvidia-dra-driver ──▶ │ + │ kai-scheduler ──▶ agentgateway ──▶ nvidia-dra-driver ──▶ │ │ dynamo-platform ──▶ nodewright ──▶ nvsentinel ──▶ ... │ │ │ │ Result: Fully configured GPU cluster │ │ • 8x H100 GPUs advertised via DRA │ │ • Gang scheduling (KAI Scheduler) │ - │ • Inference gateway (kgateway) │ + │ • Inference gateway (agentgateway) │ │ • GPU metrics (DCGM → Prometheus → HPA) │ │ • Dynamo inference platform │ └────────────────────────────────────────────────────────────────────────┘ @@ -114,8 +114,8 @@ │ └── aws-efa │ └── aws-efa │ │ │ │ │ │ │ eks-training.yaml │ eks-inference.yaml │ -│ (no new components) │ ├── kgateway-crds ◀── NEW │ -│ │ │ └── kgateway ◀── NEW │ +│ (no new components) │ ├── agentgateway-crds ◀── NEW │ +│ │ │ └── agentgateway ◀── NEW │ │ │ │ │ │ │ h100-eks-training.yaml │ h100-eks-inference.yaml │ │ ├── gpu-operator (CDI, gdrcopy) │ └── nodewright-customizations │ @@ -130,7 +130,8 @@ │ │ └── dynamo-platform ◀─ NEW │ │ │ │ ├─────────────────────────────────────┼─────────────────────────────────────┤ -│ Unique: kubeflow-trainer │ Unique: kgateway-crds, kgateway, │ +│ Unique: kubeflow-trainer │ Unique: agentgateway-crds, │ +│ │ agentgateway, │ │ │ dynamo-crds, dynamo-platform │ ├─────────────────────────────────────┴─────────────────────────────────────┤ │ Shared (base + eks): cert-manager, kube-prometheus-stack, gpu-operator, │ @@ -253,7 +254,7 @@ http://127.0.0.1:9090/chat.html │ │ │ toolkit, DCGM, validator) │ │ │ 4 │ accelerator_metrics │ gpu-operator (DCGM exporter) │ base │ │ 5 │ ai_service_metrics │ kube-prometheus-stack, prometheus-adapter│ base │ -│ 6 │ ai_inference │ kgateway-crds, kgateway │ eks-inf │ +│ 6 │ ai_inference │ agentgateway-crds, agentgateway │ eks-inf │ │ 7 │ robust_controller │ dynamo-crds, dynamo-platform │ dynamo │ │ 8 │ pod_autoscaling │ prometheus-adapter + HPA │ base │ │ 9 │ cluster_autoscaling │ EKS Auto Scaling Group (ASG) │ infra │ @@ -263,7 +264,7 @@ http://127.0.0.1:9090/chat.html │ DRA, gang scheduling, secure access, accelerator metrics, │ │ AI service metrics, pod autoscaling │ │ │ -│ eks-inference layer (+1): inference gateway (kgateway) │ +│ eks-inference layer (+1): inference gateway (agentgateway) │ │ dynamo layer (+1): robust controller (Dynamo operator) │ │ infra layer (+1): cluster autoscaling (EKS ASG) │ │ │ diff --git a/demos/cuj2-eks.md b/demos/cuj2-eks.md index e36b4bb3a..c9dad431b 100644 --- a/demos/cuj2-eks.md +++ b/demos/cuj2-eks.md @@ -85,7 +85,7 @@ kubectl get dynamographdeployments -n dynamo-workload kubectl get pods -n dynamo-workload -o wide -w # Verify the inference gateway routes to the workload -kubectl get gateway inference-gateway -n kgateway-system +kubectl get gateway inference-gateway -n agentgateway-system kubectl get inferencepool -n dynamo-workload ``` diff --git a/demos/cuj2-gke.md b/demos/cuj2-gke.md index 949333b79..fceec7cbf 100644 --- a/demos/cuj2-gke.md +++ b/demos/cuj2-gke.md @@ -83,7 +83,7 @@ kubectl get dynamographdeployments -n dynamo-workload kubectl get pods -n dynamo-workload -o wide -w # Verify the inference gateway routes to the workload -kubectl get gateway inference-gateway -n kgateway-system +kubectl get gateway inference-gateway -n agentgateway-system kubectl get inferencepool -n dynamo-workload ``` diff --git a/demos/images/meta.md b/demos/images/meta.md index a39269bdc..1bceb1f72 100644 --- a/demos/images/meta.md +++ b/demos/images/meta.md @@ -82,8 +82,8 @@ Visual: Single input forking into two divergent paths │ Unique: │ │ Unique: │ │ kubeflow-trainer │ │ dynamo-crds │ │ │ │ dynamo-platform │ -│ GPU Operator: │ │ kgateway-crds │ -│ CDI=true │ │ kgateway │ +│ GPU Operator: │ │ agentgateway-crds │ +│ CDI=true │ │ agentgateway │ │ gdrcopy=true │ │ │ │ │ │ DRA driver: │ │ │ │ gpuResources=true │ diff --git a/demos/query.md b/demos/query.md index 42e6cbce6..4da28059a 100644 --- a/demos/query.md +++ b/demos/query.md @@ -129,7 +129,7 @@ aicr query --service eks --accelerator h100 --intent inference --os ubuntu \ diff /tmp/training.txt /tmp/inference.txt ``` -> `> kgateway` and `> kgateway-crds` — the Inference Gateway is added only +> `> agentgateway` and `> agentgateway-crds` — the Inference Gateway is added only > when `--intent inference`. CDI defaults also flip: diff --git a/docs/user/api-reference.md b/docs/user/api-reference.md index 7bb93f9f9..16eb016e4 100644 --- a/docs/user/api-reference.md +++ b/docs/user/api-reference.md @@ -359,8 +359,8 @@ Bundler names correspond to component names in [`recipes/registry.yaml`](https:/ | `kai-scheduler` | DRA-aware gang scheduler with topology-aware placement | | `grove` | Dynamo pod lifecycle management | | `dynamo-platform` | NVIDIA Dynamo inference serving platform | -| `kgateway-crds` | Kubernetes Gateway API CRDs | -| `kgateway` | Kubernetes Gateway API implementation | +| `agentgateway-crds` | Kubernetes Gateway API CRDs for AI/ML inference (Gateway API + Inference Extension) | +| `agentgateway` | Kubernetes Gateway API implementation for AI/ML inference (InferencePool routing) | | `k8s-nim-operator` | NVIDIA NIM Operator for inference microservice deployments | | `kueue` | Kubernetes-native job queuing for batch and AI workloads | | `kubeflow-trainer` | Kubeflow Training Operator for distributed training | diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index 719cd9c8f..2f0b8b18e 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -1111,7 +1111,7 @@ aicr bundle --recipe recipe.yaml \ This results in: - **GPU daemonsets** (driver, device-plugin, toolkit, dcgm): `nodeSelector=nodeGroup=gpu-worker` + tolerations for `dedicated=worker-workload` with both `NoSchedule` and `NoExecute` - **NFD workers**: no nodeSelector (runs on all nodes) + tolerations for `dedicated=worker-workload` with both `NoSchedule` and `NoExecute` -- **System components** (gpu-operator controller, NFD gc/master, dynamo grove, kgateway proxy): `nodeSelector=nodeGroup=system-worker` + tolerations for `dedicated=system-workload` with both `NoSchedule` and `NoExecute` +- **System components** (gpu-operator controller, NFD gc/master, dynamo grove, agentgateway proxy): `nodeSelector=nodeGroup=system-worker` + tolerations for `dedicated=system-workload` with both `NoSchedule` and `NoExecute` **Behavior:** - All components from the recipe are bundled automatically diff --git a/docs/user/component-catalog.md b/docs/user/component-catalog.md index b67a46f8d..b6392ee84 100644 --- a/docs/user/component-catalog.md +++ b/docs/user/component-catalog.md @@ -27,8 +27,8 @@ The source of truth is [`recipes/registry.yaml`](https://github.com/NVIDIA/aicr/ | **kai-scheduler** | DRA-aware gang scheduler with hierarchical queues and topology-aware placement. Ensures distributed training jobs land on nodes with optimal interconnect topology. | [KAI Scheduler](https://github.com/kai-scheduler/KAI-Scheduler) | | **grove** | Pod lifecycle management for Dynamo inference platform. Installed as a standalone component. | [Grove](https://github.com/ai-dynamo/grove) | | **dynamo-platform** | NVIDIA Dynamo inference serving platform with bundled CRDs. Distributed inference with prefix-cache-aware routing and disaggregated prefill/decode. | [Dynamo](https://github.com/ai-dynamo/dynamo) | -| **kgateway-crds** | Custom Resource Definitions for kgateway (Kubernetes Gateway API implementation). | [kgateway](https://github.com/kgateway-dev/kgateway) | -| **kgateway** | Kubernetes Gateway API implementation. Provides model-aware ingress routing for inference workloads. | [kgateway](https://github.com/kgateway-dev/kgateway) | +| **agentgateway-crds** | Custom Resource Definitions for agentgateway (Kubernetes Gateway API implementation for AI/ML inference). | [agentgateway](https://github.com/agentgateway/agentgateway) | +| **agentgateway** | Kubernetes Gateway API implementation for AI/ML inference. Implements the Gateway API Inference Extension for model-aware ingress routing to InferencePool backends. | [agentgateway](https://github.com/agentgateway/agentgateway) | | **k8s-nim-operator** | NVIDIA NIM Operator for managing NIM (NVIDIA Inference Microservices) deployments on Kubernetes. | [K8s NIM Operator](https://github.com/NVIDIA/k8s-nim-operator) | | **kueue** | Kubernetes-native job queuing system. Manages quotas and admits jobs for batch and AI workloads. | [Kueue](https://github.com/kubernetes-sigs/kueue) | | **kubeflow-trainer** | Kubeflow Training Operator for distributed training jobs (PyTorch, etc.). Manages multi-node training job lifecycle with JobSet integration. | [Kubeflow Trainer](https://github.com/kubeflow/trainer) | @@ -41,7 +41,7 @@ Not every component appears in every recipe. The recipe engine selects component - **Base components** (cert-manager, kube-prometheus-stack) appear in most recipes. - **Cloud-specific components** (aws-efa, aws-ebs-csi-driver) are added when the service matches. -- **Intent-specific components** (kgateway, kgateway-crds) are added based on workload intent (e.g., inference recipes include the inference gateway). +- **Intent-specific components** (agentgateway, agentgateway-crds) are added based on workload intent (e.g., inference recipes include the inference gateway). - **Platform-specific components** (slinky-slurm-operator, kubeflow-trainer, dynamo-platform) are added when the recipe selects a matching `--platform`. - **Accelerator/OS-specific tuning** (nodewright-customizations, nvidia-dra-driver-gpu) varies by hardware and OS combination. diff --git a/docs/user/container-images.md b/docs/user/container-images.md index e63b5eb33..144418991 100644 --- a/docs/user/container-images.md +++ b/docs/user/container-images.md @@ -23,12 +23,14 @@ A machine-readable **CycloneDX 1.6 JSON** companion to this page is produced by - Unique images: **71** - Distinct registries: **11** -Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.kgateway.dev`, `docker.io`, `gcr.io`, `ghcr.io`, `gke.gcr.io`, `nvcr.io`, `public.ecr.aws`, `quay.io`, `registry.k8s.io`, `us-docker.pkg.dev` +Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.agentgateway.dev`, `docker.io`, `gcr.io`, `ghcr.io`, `gke.gcr.io`, `nvcr.io`, `public.ecr.aws`, `quay.io`, `registry.k8s.io`, `us-docker.pkg.dev` ## Components | Component | Type | Chart | Pinned Version | Images | |-----------|------|-------|----------------|--------| +| agentgateway | helm | agentgateway | v2.2.1 | 1 | +| agentgateway-crds | helm | agentgateway-crds | v2.2.1 | 0 | | aws-ebs-csi-driver | helm | aws-ebs-csi-driver/aws-ebs-csi-driver | 2.59.0 | 6 | | aws-efa | helm | aws-efa-k8s-device-plugin | v0.5.26 | 1 | | cert-manager | helm | jetstack/cert-manager | v1.20.2 | 4 | @@ -39,8 +41,6 @@ Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.kgateway.dev`, ` | k8s-ephemeral-storage-metrics | helm | k8s-ephemeral-storage-metrics/k8s-ephemeral-storage-metrics | 1.19.2 | 1 | | k8s-nim-operator | helm | k8s-nim-operator | 3.1.0 | 1 | | kai-scheduler | helm | kai-scheduler | v0.14.1 | 2 | -| kgateway | helm | kgateway | v2.0.0 | 1 | -| kgateway-crds | helm | kgateway-crds | v2.0.0 | 0 | | kube-prometheus-stack | helm | prometheus-community/kube-prometheus-stack | 84.4.0 | 8 | | kubeflow-trainer | helm | kubeflow-trainer | 2.2.0 | 3 | | kueue | helm | kueue | 0.17.1 | 1 | @@ -56,6 +56,14 @@ Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.kgateway.dev`, ` ## Images by component +### agentgateway + +- `cr.agentgateway.dev/controller:v2.2.1` + +### agentgateway-crds + +_No images extracted._ + ### aws-ebs-csi-driver - `public.ecr.aws/csi-components/csi-attacher:v4.11.0-eksbuild.4` @@ -121,14 +129,6 @@ Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.kgateway.dev`, ` - `ghcr.io/kai-scheduler/kai-scheduler/crd-upgrader:v0.14.1` - `ghcr.io/kai-scheduler/kai-scheduler/operator:v0.14.1` -### kgateway - -- `cr.kgateway.dev/kgateway-dev/kgateway:v2.0.0` - -### kgateway-crds - -_No images extracted._ - ### kube-prometheus-stack - `docker.io/grafana/grafana:13.0.1` @@ -222,7 +222,7 @@ AICR pulls from a deliberately diverse set of registries: - **`public.ecr.aws`** — AWS public artifacts (aws-ebs-csi-driver). - **Regional ECR** (`.dkr.ecr..amazonaws.com`) — EKS-internal add-ons. The `aws-efa` entry below shows `us-west-2` because that is the in-tree default; deployments in other regions override `awsefa:image.repository` at bundle or install time. See [Regional registry overrides](../integrator/recipe-development.md#regional-registry-overrides) for the pattern. - **`gcr.io`, `gke.gcr.io`, `us-docker.pkg.dev`** — GCP/GKE add-ons (gke-nccl-tcpxo). -- **`cr.kgateway.dev`** — kgateway. +- **`cr.agentgateway.dev`** — agentgateway (AI inference gateway). - **`docker.io`** — assorted upstream images (`busybox`, `pytorch`, etc.). Customers running in air-gapped or private-registry environments need to mirror every registry above. A dedicated mirroring guide is tracked under [#743](https://github.com/NVIDIA/aicr/issues/743). diff --git a/pkg/bundler/deployer/helm/helm_test.go b/pkg/bundler/deployer/helm/helm_test.go index 73ff977b9..8b4ad4e5f 100644 --- a/pkg/bundler/deployer/helm/helm_test.go +++ b/pkg/bundler/deployer/helm/helm_test.go @@ -1428,14 +1428,14 @@ func TestUndeployScript_PreflightSkipListCoversManifestDeletedReleases(t *testin }{Version: "v0.1.0"}, ComponentRefs: []recipe.ComponentRef{ {Name: "cert-manager", Namespace: "cert-manager", Chart: "cert-manager", Version: "v1.17.2", Source: "https://charts.jetstack.io"}, - {Name: "kgateway", Namespace: "kgateway-system", Chart: "kgateway", Version: "v0.1.0", Source: "https://example.invalid/charts"}, + {Name: "agentgateway", Namespace: "agentgateway-system", Chart: "agentgateway", Version: "v0.1.0", Source: "https://example.invalid/charts"}, {Name: "nodewright-operator", Namespace: "skyhook", Chart: "nodewright-operator", Version: "v0.1.0", Source: "https://example.invalid/charts"}, }, - DeploymentOrder: []string{"cert-manager", "kgateway", "nodewright-operator"}, + DeploymentOrder: []string{"cert-manager", "agentgateway", "nodewright-operator"}, }, ComponentValues: map[string]map[string]any{ "cert-manager": {}, - "kgateway": {}, + "agentgateway": {}, "nodewright-operator": {}, }, Version: "v1.0.0", @@ -1449,7 +1449,7 @@ func TestUndeployScript_PreflightSkipListCoversManifestDeletedReleases(t *testin snippet=$(sed -n '/^skip_preflight_for_release()/,/^}/p' "$UNDEPLOY") eval "$snippet" skip_preflight_for_release "nodewright-operator" && echo "skip:nodewright-operator" - skip_preflight_for_release "kgateway" && echo "skip:kgateway" + skip_preflight_for_release "agentgateway" && echo "skip:agentgateway" if skip_preflight_for_release "cert-manager"; then echo "unexpected:cert-manager" exit 1 @@ -1470,7 +1470,7 @@ func TestUndeployScript_PreflightSkipListCoversManifestDeletedReleases(t *testin } out := stdout.String() - for _, want := range []string{"skip:nodewright-operator", "skip:kgateway", "check:cert-manager"} { + for _, want := range []string{"skip:nodewright-operator", "skip:agentgateway", "check:cert-manager"} { if !strings.Contains(out, want) { t.Errorf("expected %q in output; stdout=%q stderr=%q", want, out, stderr.String()) } diff --git a/pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl b/pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl index 1752609ce..af65311a9 100644 --- a/pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl +++ b/pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/undeploy.sh b/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/undeploy.sh index 2e1f0633c..16d13e6f6 100644 --- a/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/kai_scheduler_present/undeploy.sh @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/bundler/deployer/helm/testdata/manifest_only/undeploy.sh b/pkg/bundler/deployer/helm/testdata/manifest_only/undeploy.sh index ca77d9e4a..40f7b539b 100644 --- a/pkg/bundler/deployer/helm/testdata/manifest_only/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/manifest_only/undeploy.sh @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/bundler/deployer/helm/testdata/mixed_gpu_operator/undeploy.sh b/pkg/bundler/deployer/helm/testdata/mixed_gpu_operator/undeploy.sh index 215172f8f..b5b7b6c99 100644 --- a/pkg/bundler/deployer/helm/testdata/mixed_gpu_operator/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/mixed_gpu_operator/undeploy.sh @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/bundler/deployer/helm/testdata/mixed_with_pre/undeploy.sh b/pkg/bundler/deployer/helm/testdata/mixed_with_pre/undeploy.sh index 6df40f551..1ba4d15a0 100644 --- a/pkg/bundler/deployer/helm/testdata/mixed_with_pre/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/mixed_with_pre/undeploy.sh @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/bundler/deployer/helm/testdata/nodewright_present/undeploy.sh b/pkg/bundler/deployer/helm/testdata/nodewright_present/undeploy.sh index 870719568..e625c2883 100644 --- a/pkg/bundler/deployer/helm/testdata/nodewright_present/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/nodewright_present/undeploy.sh @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/bundler/deployer/helm/testdata/upstream_helm_only/undeploy.sh b/pkg/bundler/deployer/helm/testdata/upstream_helm_only/undeploy.sh index d8844bfc8..5108629f3 100644 --- a/pkg/bundler/deployer/helm/testdata/upstream_helm_only/undeploy.sh +++ b/pkg/bundler/deployer/helm/testdata/upstream_helm_only/undeploy.sh @@ -248,7 +248,7 @@ extra_crds_for_release() { # deleted from manifests before the controller is uninstalled. skip_preflight_for_release() { case "$1" in - nodewright-operator|kgateway) return 0 ;; + nodewright-operator|agentgateway) return 0 ;; *) return 1 ;; esac } diff --git a/pkg/evidence/cncf/requirements.go b/pkg/evidence/cncf/requirements.go index 05cbd7454..ea7377229 100644 --- a/pkg/evidence/cncf/requirements.go +++ b/pkg/evidence/cncf/requirements.go @@ -59,7 +59,7 @@ var requirements = map[string]requirementMeta{ }, featureInferenceGateway: { RequirementID: "ai_inference", - Title: "Inference API Gateway (kgateway)", + Title: "Inference API Gateway (agentgateway)", Description: "Demonstrates that the cluster supports Kubernetes Gateway API for AI/ML inference routing with an operational GatewayClass and Gateway.", File: "inference-gateway.md", }, diff --git a/pkg/evidence/cncf/scripts/collect-evidence.sh b/pkg/evidence/cncf/scripts/collect-evidence.sh index 1e789f8ae..200ab03cf 100755 --- a/pkg/evidence/cncf/scripts/collect-evidence.sh +++ b/pkg/evidence/cncf/scripts/collect-evidence.sh @@ -1302,13 +1302,13 @@ collect_gateway() { EVIDENCE_FILE="${EVIDENCE_DIR}/inference-gateway.md" log_info "Collecting Inference API Gateway evidence → ${EVIDENCE_FILE}" - # Skip if kgateway is not installed (training clusters don't have inference gateway) - if ! kubectl get deploy -n kgateway-system --no-headers 2>/dev/null | grep -q .; then - log_info "Inference gateway evidence collection skipped — kgateway not installed." + # Skip if agentgateway is not installed (training clusters don't have inference gateway) + if ! kubectl get deploy -n agentgateway-system --no-headers 2>/dev/null | grep -q .; then + log_info "Inference gateway evidence collection skipped — agentgateway not installed." return fi - write_section_header "Inference API Gateway (kgateway)" + write_section_header "Inference API Gateway (agentgateway)" cat >> "${EVIDENCE_FILE}" <<'EOF' Demonstrates CNCF AI Conformance requirement for Kubernetes Gateway API support @@ -1316,19 +1316,19 @@ with an implementation for advanced traffic management for inference services. ## Summary -1. **kgateway controller** — Running in `kgateway-system` +1. **agentgateway controller** — Running in `agentgateway-system` 2. **inference-gateway deployment** — Running (the inference extension controller) 3. **Gateway API CRDs** — All present (GatewayClass, Gateway, HTTPRoute, GRPCRoute, ReferenceGrant) -4. **Active Gateway** — `inference-gateway` with class `kgateway`, programmed with an AWS ELB address -5. **Inference Extension CRDs** — InferencePool, InferenceModelRewrite, InferenceObjective installed +4. **Active Gateway** — `inference-gateway` with class `agentgateway`, programmed with a load balancer address +5. **Inference Extension CRDs** — InferencePool, InferenceObjective, InferenceModelRewrite installed 6. **Result: PASS** --- -## kgateway Controller +## agentgateway Controller EOF - capture "kgateway deployments" kubectl get deploy -n kgateway-system - capture "kgateway pods" kubectl get pods -n kgateway-system + capture "agentgateway deployments" kubectl get deploy -n agentgateway-system + capture "agentgateway pods" kubectl get pods -n agentgateway-system cat >> "${EVIDENCE_FILE}" <<'EOF' @@ -1352,7 +1352,7 @@ EOF ## Active Gateway EOF capture "Gateways" kubectl get gateways -A - capture "Gateway details" kubectl get gateway inference-gateway -n kgateway-system -o yaml + capture "Gateway details" kubectl get gateway inference-gateway -n agentgateway-system -o yaml cat >> "${EVIDENCE_FILE}" <<'EOF' @@ -1364,14 +1364,14 @@ EOF echo "" >> "${EVIDENCE_FILE}" echo "**GatewayClass conditions**" >> "${EVIDENCE_FILE}" echo '```' >> "${EVIDENCE_FILE}" - kubectl get gatewayclass kgateway -o jsonpath='{range .status.conditions[*]}{.type}: {.status} ({.reason}){"\n"}{end}' >> "${EVIDENCE_FILE}" 2>&1 + kubectl get gatewayclass agentgateway -o jsonpath='{range .status.conditions[*]}{.type}: {.status} ({.reason}){"\n"}{end}' >> "${EVIDENCE_FILE}" 2>&1 echo '```' >> "${EVIDENCE_FILE}" # Check Gateway Programmed condition echo "" >> "${EVIDENCE_FILE}" echo "**Gateway conditions**" >> "${EVIDENCE_FILE}" echo '```' >> "${EVIDENCE_FILE}" - kubectl get gateway inference-gateway -n kgateway-system -o jsonpath='{range .status.conditions[*]}{.type}: {.status} ({.reason}){"\n"}{end}' >> "${EVIDENCE_FILE}" 2>&1 + kubectl get gateway inference-gateway -n agentgateway-system -o jsonpath='{range .status.conditions[*]}{.type}: {.status} ({.reason}){"\n"}{end}' >> "${EVIDENCE_FILE}" 2>&1 echo '```' >> "${EVIDENCE_FILE}" cat >> "${EVIDENCE_FILE}" <<'EOF' @@ -1388,10 +1388,10 @@ EOF # Verdict — check both GatewayClass Accepted and Gateway Programmed echo "" >> "${EVIDENCE_FILE}" local gw_accepted gw_programmed - gw_accepted=$(kubectl get gatewayclass kgateway -o jsonpath='{.status.conditions[?(@.type=="Accepted")].status}' 2>/dev/null) - gw_programmed=$(kubectl get gateway inference-gateway -n kgateway-system -o jsonpath='{.status.conditions[?(@.type=="Programmed")].status}' 2>/dev/null) + gw_accepted=$(kubectl get gatewayclass agentgateway -o jsonpath='{.status.conditions[?(@.type=="Accepted")].status}' 2>/dev/null) + gw_programmed=$(kubectl get gateway inference-gateway -n agentgateway-system -o jsonpath='{.status.conditions[?(@.type=="Programmed")].status}' 2>/dev/null) if [ "${gw_accepted}" = "True" ] && [ "${gw_programmed}" = "True" ]; then - echo "**Result: PASS** — kgateway controller running, GatewayClass Accepted, Gateway Programmed, inference CRDs installed." >> "${EVIDENCE_FILE}" + echo "**Result: PASS** — agentgateway controller running, GatewayClass Accepted, Gateway Programmed, inference CRDs installed." >> "${EVIDENCE_FILE}" else echo "**Result: FAIL** — No active Gateway found." >> "${EVIDENCE_FILE}" fi diff --git a/pkg/recipe/conformance_test.go b/pkg/recipe/conformance_test.go index 601e61150..fdcdab520 100644 --- a/pkg/recipe/conformance_test.go +++ b/pkg/recipe/conformance_test.go @@ -54,8 +54,8 @@ func TestConformanceRecipeInvariants(t *testing.T) { "prometheus-adapter", "nvidia-dra-driver-gpu", "kai-scheduler", - "kgateway-crds", - "kgateway", + "agentgateway-crds", + "agentgateway", }, requiredChecks: []string{ "platform-health", @@ -88,8 +88,8 @@ func TestConformanceRecipeInvariants(t *testing.T) { "prometheus-adapter", "nvidia-dra-driver-gpu", "kai-scheduler", - "kgateway-crds", - "kgateway", + "agentgateway-crds", + "agentgateway", "grove", "dynamo-platform", }, @@ -189,8 +189,8 @@ func TestConformanceRecipeInvariants(t *testing.T) { "prometheus-adapter", "nvidia-dra-driver-gpu", "kai-scheduler", - "kgateway-crds", - "kgateway", + "agentgateway-crds", + "agentgateway", "grove", "dynamo-platform", }, @@ -285,8 +285,8 @@ func TestConformanceRecipeInvariants(t *testing.T) { "prometheus-adapter", "nvidia-dra-driver-gpu", "kai-scheduler", - "kgateway-crds", - "kgateway", + "agentgateway-crds", + "agentgateway", }, requiredChecks: []string{ "platform-health", @@ -318,8 +318,8 @@ func TestConformanceRecipeInvariants(t *testing.T) { "prometheus-adapter", "nvidia-dra-driver-gpu", "kai-scheduler", - "kgateway-crds", - "kgateway", + "agentgateway-crds", + "agentgateway", "grove", "dynamo-platform", }, diff --git a/pkg/recipe/metadata_test.go b/pkg/recipe/metadata_test.go index 02f99afeb..e04026e25 100644 --- a/pkg/recipe/metadata_test.go +++ b/pkg/recipe/metadata_test.go @@ -845,7 +845,7 @@ func TestOverlayMergeDoesNotLoseBaseComponents(t *testing.T) { builder := NewBuilder() // Build H100 EKS inference recipe with dynamo platform - // Matches overlay chain that adds kgateway, dynamo-platform, kai-scheduler, etc. + // Matches overlay chain that adds agentgateway, dynamo-platform, kai-scheduler, etc. criteria := NewCriteria() criteria.Service = CriteriaServiceEKS criteria.Accelerator = CriteriaAcceleratorH100 diff --git a/recipes/checks/kgateway/health-check.yaml b/recipes/checks/agentgateway/health-check.yaml similarity index 77% rename from recipes/checks/kgateway/health-check.yaml rename to recipes/checks/agentgateway/health-check.yaml index 0065cda76..ad79282b8 100644 --- a/recipes/checks/kgateway/health-check.yaml +++ b/recipes/checks/agentgateway/health-check.yaml @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -# KGateway Health Check +# Agentgateway Health Check # -# Validates that KGateway is running and healthy in the kgateway-system -# namespace. Checks that the kgateway deployment has at least one available -# replica and that no pods in the namespace are stuck in Pending, Failed, -# or Unknown phases. +# Validates that agentgateway is running and healthy in the agentgateway-system +# namespace. Checks that the agentgateway deployment has at least one available +# replica and that no pods in the namespace are stuck in Pending, Failed, or +# Unknown phases. apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test metadata: - name: kgateway-health-check + name: agentgateway-health-check spec: timeouts: assert: 5m @@ -29,14 +29,14 @@ spec: - name: validate-deployment-exists try: # Guard against vacuous pass on empty namespace: verify the - # kgateway deployment exists and has at least one ready replica. + # agentgateway deployment exists and has at least one ready replica. - assert: resource: apiVersion: apps/v1 kind: Deployment metadata: - name: kgateway - namespace: kgateway-system + name: agentgateway + namespace: agentgateway-system status: (availableReplicas > `0`): true - name: validate-all-pods-healthy @@ -49,7 +49,7 @@ spec: apiVersion: v1 kind: Pod metadata: - namespace: kgateway-system + namespace: agentgateway-system status: phase: Pending - error: @@ -57,7 +57,7 @@ spec: apiVersion: v1 kind: Pod metadata: - namespace: kgateway-system + namespace: agentgateway-system status: phase: Failed - error: @@ -65,6 +65,6 @@ spec: apiVersion: v1 kind: Pod metadata: - namespace: kgateway-system + namespace: agentgateway-system status: phase: Unknown diff --git a/recipes/components/kgateway-crds/manifests/gateway-api-crds.yaml b/recipes/components/agentgateway-crds/manifests/gateway-api-crds.yaml similarity index 99% rename from recipes/components/kgateway-crds/manifests/gateway-api-crds.yaml rename to recipes/components/agentgateway-crds/manifests/gateway-api-crds.yaml index 600edfcf3..71a2566d0 100644 --- a/recipes/components/kgateway-crds/manifests/gateway-api-crds.yaml +++ b/recipes/components/agentgateway-crds/manifests/gateway-api-crds.yaml @@ -1,8 +1,8 @@ # Standard Kubernetes Gateway API CRDs (v1.2.1) # Source: https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.1/standard-install.yaml -# Required by kgateway for Gateway, HTTPRoute, and GRPCRoute resources. +# Required by agentgateway for Gateway, HTTPRoute, and GRPCRoute resources. # -# These CRDs are not included in the kgateway-crds Helm chart and must +# These CRDs are not included in the agentgateway-crds Helm chart and must # be installed separately. Vendored here for fully automated deployment. # # aicr/skip-hook-validation: "true" diff --git a/recipes/components/kgateway-crds/manifests/inference-extension-crds.yaml b/recipes/components/agentgateway-crds/manifests/inference-extension-crds.yaml similarity index 99% rename from recipes/components/kgateway-crds/manifests/inference-extension-crds.yaml rename to recipes/components/agentgateway-crds/manifests/inference-extension-crds.yaml index d87d5fc3b..b03890aa4 100644 --- a/recipes/components/kgateway-crds/manifests/inference-extension-crds.yaml +++ b/recipes/components/agentgateway-crds/manifests/inference-extension-crds.yaml @@ -14,9 +14,9 @@ # Gateway API Inference Extension CRDs (v1.3.0) # Source: https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.3.0/manifests.yaml -# Required by kgateway for InferencePool and InferenceModel resources. +# Required by agentgateway for InferencePool and InferenceObjective resources. # -# These CRDs are not included in the kgateway Helm chart and must +# These CRDs are not included in the agentgateway-crds Helm chart and must # be installed separately. Vendored here for fully automated deployment. # # aicr/skip-hook-validation: "true" diff --git a/recipes/components/kgateway-crds/values.yaml b/recipes/components/agentgateway-crds/values.yaml similarity index 70% rename from recipes/components/kgateway-crds/values.yaml rename to recipes/components/agentgateway-crds/values.yaml index c6ad4ba61..e67501e93 100644 --- a/recipes/components/kgateway-crds/values.yaml +++ b/recipes/components/agentgateway-crds/values.yaml @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -# kgateway CRDs Helm values -# Installs kgateway-specific CRDs (Backends, DirectResponses, GatewayExtensions, -# GatewayParameters, HTTPListenerPolicies, TrafficPolicies). +# agentgateway CRDs Helm values +# Installs agentgateway-specific CRDs (AgentgatewayBackend, AgentgatewayPolicy, +# AgentgatewayParameters) in the agentgateway.dev/v1alpha1 API group. # -# Note: Standard Gateway API CRDs (GatewayClass, Gateway, HTTPRoute) and -# Inference Extension CRDs (InferencePool, InferenceModel) must be installed -# separately before deploying kgateway. See the kgateway getting started guide: +# Note: Standard Gateway API CRDs (GatewayClass, Gateway, HTTPRoute, GRPCRoute, +# ReferenceGrant) and Gateway API Inference Extension CRDs (InferencePool, +# InferenceObjective) must be installed separately before deploying agentgateway. +# See the upstream guide: # https://gateway-api-inference-extension.sigs.k8s.io/guides/ # This chart has no configurable values — it only installs CRDs. diff --git a/recipes/components/kgateway/manifests/inference-gateway.yaml b/recipes/components/agentgateway/manifests/inference-gateway.yaml similarity index 52% rename from recipes/components/kgateway/manifests/inference-gateway.yaml rename to recipes/components/agentgateway/manifests/inference-gateway.yaml index 310e2958e..9b292b347 100644 --- a/recipes/components/kgateway/manifests/inference-gateway.yaml +++ b/recipes/components/agentgateway/manifests/inference-gateway.yaml @@ -14,52 +14,62 @@ # Inference Gateway — provides external access to inference services # via the Kubernetes Gateway API. Any inference workload (dynamo, vLLM, -# TGI, etc.) can route through this gateway using HTTPRoute or -# InferenceModel resources. -{{- $kgw := index .Values "kgateway" }} -# GatewayParameters configures the proxy pod scheduling to match the -# system node tolerations/nodeSelector injected by the bundler into -# the kgateway controller. Without this, proxy pods land on any -# untainted node instead of system infrastructure nodes. +# TGI, etc.) can route through this gateway using HTTPRoute backendRefs +# to InferencePool resources or plain Service backends. +{{- $agw := index .Values "agentgateway" }} +# AgentgatewayParameters configures the proxy pod scheduling to match the +# system node tolerations/nodeSelector injected by the bundler into the +# agentgateway controller. Without this, proxy pods land on any untainted +# node instead of system infrastructure nodes. The new API uses a +# strategic-merge-patch into the generated Deployment spec. --- -apiVersion: gateway.kgateway.dev/v1alpha1 -kind: GatewayParameters +apiVersion: agentgateway.dev/v1alpha1 +kind: AgentgatewayParameters metadata: name: system-proxy - namespace: kgateway-system + namespace: agentgateway-system annotations: "helm.sh/hook": post-install,post-upgrade "helm.sh/hook-weight": "5" "helm.sh/hook-delete-policy": before-hook-creation spec: - kube: - podTemplate: - extraLabels: + deployment: + metadata: + labels: app.kubernetes.io/managed-by: aicr - {{- if $kgw.nodeSelector }} - nodeSelector: - {{- toYaml $kgw.nodeSelector | nindent 8 }} - {{- end }} - {{- if $kgw.tolerations }} - tolerations: - {{- toYaml $kgw.tolerations | nindent 8 }} - {{- end }} + spec: + template: + metadata: + labels: + # Mirrors the old kgateway GatewayParameters podTemplate.extraLabels + # behavior so the proxy pods themselves carry the AICR ownership + # label, not just the Deployment. + app.kubernetes.io/managed-by: aicr + spec: + {{- if $agw.nodeSelector }} + nodeSelector: + {{- toYaml $agw.nodeSelector | nindent 12 }} + {{- end }} + {{- if $agw.tolerations }} + tolerations: + {{- toYaml $agw.tolerations | nindent 12 }} + {{- end }} --- apiVersion: gateway.networking.k8s.io/v1 kind: Gateway metadata: name: inference-gateway - namespace: kgateway-system + namespace: agentgateway-system annotations: "helm.sh/hook": post-install,post-upgrade "helm.sh/hook-weight": "10" "helm.sh/hook-delete-policy": before-hook-creation spec: - gatewayClassName: kgateway + gatewayClassName: agentgateway infrastructure: parametersRef: - group: gateway.kgateway.dev - kind: GatewayParameters + group: agentgateway.dev + kind: AgentgatewayParameters name: system-proxy listeners: - name: http diff --git a/recipes/components/kgateway/values.yaml b/recipes/components/agentgateway/values.yaml similarity index 75% rename from recipes/components/kgateway/values.yaml rename to recipes/components/agentgateway/values.yaml index 82b3232ce..f6a20e981 100644 --- a/recipes/components/kgateway/values.yaml +++ b/recipes/components/agentgateway/values.yaml @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -# kgateway Helm values -# Gateway API and Inference Gateway v1.0.0 conformant implementation. +# agentgateway Helm values +# Kubernetes Gateway API + Gateway API Inference Extension implementation. # Satisfies CNCF AI Conformance Advanced Ingress for AI/ML Inference requirement. -# Provides model-aware routing, weighted traffic splitting, and header-based routing. +# Provides model-aware routing, weighted traffic splitting, and header-based routing +# for InferencePool backends. # Override release name prefix to avoid aicr-stack- prefix -fullnameOverride: kgateway +fullnameOverride: agentgateway resources: requests: @@ -28,6 +29,7 @@ resources: cpu: 500m memory: 256Mi -# Enable Gateway API Inference Extension for model-aware routing +# Enable Gateway API Inference Extension support in the agentgateway controller. +# Required for routing to InferencePool backends. inferenceExtension: enabled: true diff --git a/recipes/mixins/platform-inference.yaml b/recipes/mixins/platform-inference.yaml index 71b02c7b1..1135cfb7c 100644 --- a/recipes/mixins/platform-inference.yaml +++ b/recipes/mixins/platform-inference.yaml @@ -18,22 +18,22 @@ metadata: name: platform-inference spec: componentRefs: - - name: kgateway-crds + - name: agentgateway-crds type: Helm - source: oci://cr.kgateway.dev/kgateway-dev/charts - version: v2.0.0 - valuesFile: components/kgateway-crds/values.yaml + source: oci://cr.agentgateway.dev/charts + version: v2.2.1 + valuesFile: components/agentgateway-crds/values.yaml manifestFiles: - - components/kgateway-crds/manifests/gateway-api-crds.yaml - - components/kgateway-crds/manifests/inference-extension-crds.yaml + - components/agentgateway-crds/manifests/gateway-api-crds.yaml + - components/agentgateway-crds/manifests/inference-extension-crds.yaml - - name: kgateway + - name: agentgateway type: Helm - source: oci://cr.kgateway.dev/kgateway-dev/charts - version: v2.0.0 - valuesFile: components/kgateway/values.yaml + source: oci://cr.agentgateway.dev/charts + version: v2.2.1 + valuesFile: components/agentgateway/values.yaml manifestFiles: - - components/kgateway/manifests/inference-gateway.yaml + - components/agentgateway/manifests/inference-gateway.yaml dependencyRefs: - - kgateway-crds + - agentgateway-crds - cert-manager diff --git a/recipes/registry.yaml b/recipes/registry.yaml index 009362efc..1b892f7ee 100644 --- a/recipes/registry.yaml +++ b/recipes/registry.yaml @@ -425,27 +425,27 @@ components: tolerationPaths: - dynamo-operator.controllerManager.tolerations - - name: kgateway-crds - displayName: kgateway-crds + - name: agentgateway-crds + displayName: agentgateway-crds valueOverrideKeys: - - kgatewaycrds + - agentgatewaycrds helm: - defaultRepository: oci://cr.kgateway.dev/kgateway-dev/charts - defaultChart: kgateway-crds - defaultVersion: v2.0.0 - defaultNamespace: kgateway-system + defaultRepository: oci://cr.agentgateway.dev/charts + defaultChart: agentgateway-crds + defaultVersion: v2.2.1 + defaultNamespace: agentgateway-system - - name: kgateway - displayName: kgateway + - name: agentgateway + displayName: agentgateway valueOverrideKeys: - - kgateway + - agentgateway healthCheck: - assertFile: checks/kgateway/health-check.yaml + assertFile: checks/agentgateway/health-check.yaml helm: - defaultRepository: oci://cr.kgateway.dev/kgateway-dev/charts - defaultChart: kgateway - defaultVersion: v2.0.0 - defaultNamespace: kgateway-system + defaultRepository: oci://cr.agentgateway.dev/charts + defaultChart: agentgateway + defaultVersion: v2.2.1 + defaultNamespace: agentgateway-system nodeScheduling: system: nodeSelectorPaths: diff --git a/recipes/validators/README.md b/recipes/validators/README.md index cd43075a9..073efa212 100644 --- a/recipes/validators/README.md +++ b/recipes/validators/README.md @@ -58,7 +58,7 @@ Applied by `catalog.Load` (`pkg/validator/catalog/catalog.go`) in order: | `gang-scheduling` | Verify gang scheduling with KAI scheduler using CPU-only workers | 10m | | `accelerator-metrics` | Verify accelerator metrics from DCGM exporter | 5m | | `ai-service-metrics` | Verify AI service metrics via Prometheus | 5m | -| `inference-gateway` | Verify inference gateway (kgateway) is operational | 5m | +| `inference-gateway` | Verify inference gateway (agentgateway) is operational | 5m | | `pod-autoscaling` | Verify HPA-driven pod autoscaling with GPU metrics | 10m | | `cluster-autoscaling` | Verify cluster autoscaling with Karpenter | 10m | | `robust-controller` | Verify Dynamo operator controller and webhooks | 5m | diff --git a/recipes/validators/catalog.yaml b/recipes/validators/catalog.yaml index 37697a1df..2ebc2d49a 100644 --- a/recipes/validators/catalog.yaml +++ b/recipes/validators/catalog.yaml @@ -109,7 +109,7 @@ validators: env: [] - name: inference-gateway phase: conformance - description: "Verify inference gateway (kgateway) is operational" + description: "Verify inference gateway (agentgateway) is operational" image: ghcr.io/nvidia/aicr-validators/conformance:latest timeout: 5m args: ["inference-gateway"] diff --git a/tests/chainsaw/ai-conformance/README.md b/tests/chainsaw/ai-conformance/README.md index 90a13d54a..39a84e513 100644 --- a/tests/chainsaw/ai-conformance/README.md +++ b/tests/chainsaw/ai-conformance/README.md @@ -9,7 +9,7 @@ Chainsaw suites validating AI conformance flows across environments: - `common/` — assertions shared by `cluster/` and both Kind GPU suites - `kind-common/` — assertions shared only by Kind GPU suites -The `cluster/` suite validates the NVIDIA AI-conformance inference stack: KAI Scheduler (GPU scheduling), kgateway with Gateway API Inference Extension (inference routing), and the NVIDIA Dynamo serving platform. +The `cluster/` suite validates the NVIDIA AI-conformance inference stack: KAI Scheduler (GPU scheduling), agentgateway with Gateway API Inference Extension (inference routing), and the NVIDIA Dynamo serving platform. ## Cluster Inference Recipe @@ -56,8 +56,8 @@ The Kind GPU workflows use these leaf recipes instead: | prometheus-adapter | monitoring | Helm | Deployment | | aws-ebs-csi-driver | kube-system | Helm | **Disabled by default** (EKS managed addon) | | aws-efa | kube-system | Helm | Device plugin DaemonSet | -| kgateway-crds | kgateway-system | Helm | CRDs only (Gateway API + Inference Extension) | -| kgateway | kgateway-system | Helm | Controller Deployment | +| agentgateway-crds | agentgateway-system | Helm | CRDs only (Gateway API + Inference Extension) | +| agentgateway | agentgateway-system | Helm | Controller Deployment | | nodewright-customizations | skyhook | Manifest | No workloads (NodeConfiguration CRs) | | nvidia-dra-driver-gpu | nvidia-dra-driver | Helm | Controller Deployment, kubelet-plugin DaemonSet | | kai-scheduler | kai-scheduler | Helm | Scheduler Deployment | @@ -84,7 +84,7 @@ tests/chainsaw/ai-conformance/ │ ├── chainsaw-test.yaml # Inference leaf health check orchestration │ ├── assert-crds.yaml # Inference-specific CRDs installed │ ├── assert-dynamo.yaml # Dynamo platform healthy on kind -│ ├── assert-kgateway.yaml # kgateway healthy on kind +│ ├── assert-agentgateway.yaml # agentgateway healthy on kind │ └── assert-namespaces.yaml # Inference-specific namespaces exist ├── kind-training-kubeflow/ # Kind + H100 + training + kubeflow leaf suite │ ├── chainsaw-test.yaml # Training leaf health check orchestration @@ -100,7 +100,7 @@ tests/chainsaw/ai-conformance/ ├── assert-crds.yaml # Critical CRDs installed ├── assert-gpu-operator.yaml # GPU operator + DaemonSets healthy ├── assert-kube-system.yaml # AWS EFA healthy - ├── assert-kgateway.yaml # kgateway healthy + ├── assert-agentgateway.yaml # agentgateway healthy ├── assert-nvsentinel.yaml # NVSentinel healthy └── assert-dynamo.yaml # Dynamo platform healthy ``` @@ -159,7 +159,7 @@ chainsaw test \ | Component Group | Timeout | Reason | |-----------------|---------|--------| | Namespaces, CRDs | 2m | Should exist immediately after deployment | -| cert-manager, kgateway, skyhook, monitoring, kai-scheduler | 5m | Standard Deployment rollout | +| cert-manager, agentgateway, skyhook, monitoring, kai-scheduler | 5m | Standard Deployment rollout | | gpu-operator, nvidia-dra-driver-gpu | 10m | GPU driver compilation on nodes is slow | | dynamo-platform | 5m | Operator + etcd + NATS startup | diff --git a/tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml b/tests/chainsaw/ai-conformance/cluster/assert-agentgateway.yaml similarity index 75% rename from tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml rename to tests/chainsaw/ai-conformance/cluster/assert-agentgateway.yaml index aebdf961d..be151ce4a 100644 --- a/tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml +++ b/tests/chainsaw/ai-conformance/cluster/assert-agentgateway.yaml @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Assert kgateway controller is available. -# fullnameOverride: kgateway (from values.yaml) -# Chart: kgateway v2.0.0 (oci://cr.kgateway.dev/kgateway-dev/charts) +# Assert agentgateway controller is available. +# fullnameOverride: agentgateway (from values.yaml) +# Chart: agentgateway v2.2.1 (oci://cr.agentgateway.dev/charts) # Satisfies CNCF AI Conformance Advanced Ingress for AI/ML Inference. -# Implements Gateway API + Inference Extension for model-aware routing. +# Implements Gateway API + Inference Extension for InferencePool routing. apiVersion: apps/v1 kind: Deployment metadata: - name: kgateway - namespace: kgateway-system + name: agentgateway + namespace: agentgateway-system status: (conditions[?type == 'Available']): - status: "True" @@ -31,7 +31,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: inference-gateway - namespace: kgateway-system + namespace: agentgateway-system status: (conditions[?type == 'Available']): - status: "True" diff --git a/tests/chainsaw/ai-conformance/cluster/assert-crds.yaml b/tests/chainsaw/ai-conformance/cluster/assert-crds.yaml index e5524c5cf..e2cc501b8 100644 --- a/tests/chainsaw/ai-conformance/cluster/assert-crds.yaml +++ b/tests/chainsaw/ai-conformance/cluster/assert-crds.yaml @@ -13,7 +13,7 @@ # limitations under the License. # Assert that critical CRDs are installed by the AI-conformance inference stack. -# Covers CRD-only components (kgateway-crds) and operator-managed CRDs (dynamo-platform). +# Covers CRD-only components (agentgateway-crds) and operator-managed CRDs (dynamo-platform). # ── GPU Operator ─────────────────────────────────────────────────────── # ClusterPolicy CRD — the GPU operator's primary configuration object @@ -38,7 +38,7 @@ kind: CustomResourceDefinition metadata: name: clusterissuers.cert-manager.io --- -# ── kgateway-crds (Gateway API + Inference Extension) ───────────────── +# ── agentgateway-crds (Gateway API + Inference Extension) ──────────── apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: diff --git a/tests/chainsaw/ai-conformance/cluster/assert-namespaces.yaml b/tests/chainsaw/ai-conformance/cluster/assert-namespaces.yaml index 39319adae..78d069bf2 100644 --- a/tests/chainsaw/ai-conformance/cluster/assert-namespaces.yaml +++ b/tests/chainsaw/ai-conformance/cluster/assert-namespaces.yaml @@ -53,7 +53,7 @@ status: apiVersion: v1 kind: Namespace metadata: - name: kgateway-system + name: agentgateway-system status: phase: Active --- diff --git a/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml index 2c78cba60..17049f90f 100644 --- a/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml @@ -40,7 +40,7 @@ spec: # ── CRDs ─────────────────────────────────────────────────────────── - name: assert-crds - description: Verify critical CRDs are installed (kgateway, dynamo, GPU operator, cert-manager). + description: Verify critical CRDs are installed (agentgateway, dynamo, GPU operator, cert-manager). timeouts: assert: 120s try: @@ -77,12 +77,12 @@ spec: - assert: file: assert-kube-system.yaml - # ── kgateway ─────────────────────────────────────────────────────── - - name: assert-kgateway - description: Verify kgateway controller is available. + # ── agentgateway ─────────────────────────────────────────────────── + - name: assert-agentgateway + description: Verify agentgateway controller is available. try: - assert: - file: assert-kgateway.yaml + file: assert-agentgateway.yaml # ── Skyhook ──────────────────────────────────────────────────────── - name: assert-skyhook diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-agentgateway.yaml similarity index 83% rename from tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml rename to tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-agentgateway.yaml index ee45021e9..889b8c5ea 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-agentgateway.yaml @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Assert kgateway controller is available on the kind inference + Dynamo stack. +# Assert agentgateway controller is available on the kind inference + Dynamo stack. apiVersion: apps/v1 kind: Deployment metadata: - name: kgateway - namespace: kgateway-system + name: agentgateway + namespace: agentgateway-system status: (conditions[?type == 'Available']): - status: "True" @@ -26,7 +26,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: inference-gateway - namespace: kgateway-system + namespace: agentgateway-system status: (conditions[?type == 'Available']): - status: "True" diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml index fe0e152d6..7791b3711 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml @@ -36,7 +36,7 @@ kind: CustomResourceDefinition metadata: name: clusterissuers.cert-manager.io --- -# kgateway-crds (Gateway API + Inference Extension) +# agentgateway-crds (Gateway API + Inference Extension) apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-namespaces.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-namespaces.yaml index cd65aebc6..c8b853f33 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-namespaces.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-namespaces.yaml @@ -54,7 +54,7 @@ status: apiVersion: v1 kind: Namespace metadata: - name: kgateway-system + name: agentgateway-system status: phase: Active --- diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml index a9e2510bd..8711c88a4 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml @@ -40,7 +40,7 @@ spec: # ── CRDs ─────────────────────────────────────────────────────────── - name: assert-crds - description: Verify critical CRDs are installed (kgateway, dynamo, GPU operator, cert-manager). + description: Verify critical CRDs are installed (agentgateway, dynamo, GPU operator, cert-manager). timeouts: assert: 120s try: @@ -70,12 +70,12 @@ spec: - assert: file: ../kind-common/assert-monitoring.yaml - # ── kgateway ─────────────────────────────────────────────────────── - - name: assert-kgateway - description: Verify kgateway controller is available. + # ── agentgateway ─────────────────────────────────────────────────── + - name: assert-agentgateway + description: Verify agentgateway controller is available. try: - assert: - file: assert-kgateway.yaml + file: assert-agentgateway.yaml # ── Skyhook ──────────────────────────────────────────────────────── - name: assert-skyhook diff --git a/tests/chainsaw/ai-conformance/offline/assert-recipe.yaml b/tests/chainsaw/ai-conformance/offline/assert-recipe.yaml index 1483d9de9..2a730dc02 100644 --- a/tests/chainsaw/ai-conformance/offline/assert-recipe.yaml +++ b/tests/chainsaw/ai-conformance/offline/assert-recipe.yaml @@ -16,7 +16,7 @@ # # Validates that `aicr recipe --service eks --accelerator h100 --intent inference # --os ubuntu --platform dynamo` produces a valid recipe with correct criteria, -# inference-specific components (kgateway, DRA driver, KAI scheduler, Dynamo), +# inference-specific components (agentgateway, DRA driver, KAI scheduler, Dynamo), # and standard GPU stack components. kind: RecipeResult apiVersion: aicr.nvidia.com/v1alpha1 @@ -36,6 +36,8 @@ constraints: - name: OS.sysctl./proc/sys/kernel/osrelease value: '>= 6.8' componentRefs: ## alphabetically sorted + - name: agentgateway + - name: agentgateway-crds - name: aws-ebs-csi-driver - name: aws-efa - name: cert-manager @@ -44,8 +46,6 @@ componentRefs: ## alphabetically sorted - name: grove - name: k8s-ephemeral-storage-metrics - name: kai-scheduler - - name: kgateway - - name: kgateway-crds - name: kube-prometheus-stack - name: nfd - name: nodewright-customizations @@ -54,12 +54,12 @@ componentRefs: ## alphabetically sorted - name: nvsentinel - name: prometheus-adapter deploymentOrder: + - agentgateway-crds - aws-ebs-csi-driver - aws-efa - cert-manager + - agentgateway - grove - - kgateway-crds - - kgateway - kube-prometheus-stack - k8s-ephemeral-storage-metrics - nfd diff --git a/tests/chainsaw/ai-conformance/offline/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/offline/chainsaw-test.yaml index 6bf6d7b56..fdf0d1dff 100644 --- a/tests/chainsaw/ai-conformance/offline/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/offline/chainsaw-test.yaml @@ -90,7 +90,7 @@ spec: aws-efa cert-manager \ dynamo-platform grove gpu-operator \ k8s-ephemeral-storage-metrics kai-scheduler \ - kgateway kgateway-crds kube-prometheus-stack \ + agentgateway agentgateway-crds kube-prometheus-stack \ nvidia-dra-driver-gpu nvsentinel prometheus-adapter \ nodewright-customizations nodewright-operator; do match=$(ls -d "${WORK}"/bundle/[0-9][0-9][0-9]-"${component}" 2>/dev/null | head -1) diff --git a/tests/chainsaw/bundle-templates/kgateway/chainsaw-test.yaml b/tests/chainsaw/bundle-templates/agentgateway/chainsaw-test.yaml similarity index 66% rename from tests/chainsaw/bundle-templates/kgateway/chainsaw-test.yaml rename to tests/chainsaw/bundle-templates/agentgateway/chainsaw-test.yaml index 17730c934..77bf2f919 100644 --- a/tests/chainsaw/bundle-templates/kgateway/chainsaw-test.yaml +++ b/tests/chainsaw/bundle-templates/agentgateway/chainsaw-test.yaml @@ -16,23 +16,23 @@ apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test metadata: - name: cli-bundle-kgateway-templates + name: cli-bundle-agentgateway-templates spec: description: | - Validates that kgateway manifest templates render correctly. - Tests inference-gateway GatewayParameters scheduling (nodeSelector/tolerations). - Run with: AICR_BIN=$(pwd)/dist/e2e/aicr chainsaw test --no-cluster --test-dir tests/chainsaw/bundle-templates/kgateway + Validates that agentgateway manifest templates render correctly. + Tests inference-gateway AgentgatewayParameters scheduling (nodeSelector/tolerations). + Run with: AICR_BIN=$(pwd)/dist/e2e/aicr chainsaw test --no-cluster --test-dir tests/chainsaw/bundle-templates/agentgateway timeouts: exec: 30s steps: - name: generate-recipe - description: Generate an EKS H100 inference recipe (kgateway is inference-only). + description: Generate an EKS H100 inference recipe (agentgateway is inference-only). try: - script: content: | AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" - WORK="/tmp/chainsaw-bundle-kgateway-templates" + WORK="/tmp/chainsaw-bundle-agentgateway-templates" rm -rf "${WORK}" && mkdir -p "${WORK}" ${AICR_BIN} recipe \ --service eks \ @@ -49,26 +49,26 @@ spec: - script: content: | AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" - WORK="/tmp/chainsaw-bundle-kgateway-templates" + WORK="/tmp/chainsaw-bundle-agentgateway-templates" rm -rf "${WORK}/bundle-defaults" ${AICR_BIN} bundle \ --recipe "${WORK}/recipe.yaml" \ --output "${WORK}/bundle-defaults" - name: assert-gateway-defaults - description: Verify GatewayParameters and Gateway resources exist. + description: Verify AgentgatewayParameters and Gateway resources exist. try: - script: content: | - WORK="/tmp/chainsaw-bundle-kgateway-templates" - ## kgateway is a mixed component (upstream Helm + raw manifests), - ## so its raw manifests render into an injected NNN-kgateway-post/ + WORK="/tmp/chainsaw-bundle-agentgateway-templates" + ## agentgateway is a mixed component (upstream Helm + raw manifests), + ## so its raw manifests render into an injected NNN-agentgateway-post/ ## wrapped chart's templates/ folder under #662's layout. - MANIFEST=$(ls "${WORK}"/bundle-defaults/[0-9][0-9][0-9]-kgateway-post/templates/inference-gateway.yaml 2>/dev/null | head -1) - [ -n "${MANIFEST}" ] || { echo "kgateway inference-gateway.yaml not found" >&2; exit 1; } - ## The manifest contains two resources (GatewayParameters + Gateway). + MANIFEST=$(ls "${WORK}"/bundle-defaults/[0-9][0-9][0-9]-agentgateway-post/templates/inference-gateway.yaml 2>/dev/null | head -1) + [ -n "${MANIFEST}" ] || { echo "agentgateway inference-gateway.yaml not found" >&2; exit 1; } + ## The manifest contains two resources (AgentgatewayParameters + Gateway). ## Verify both are present. - grep -q 'kind: GatewayParameters' "${MANIFEST}" + grep -q 'kind: AgentgatewayParameters' "${MANIFEST}" grep -q 'kind: Gateway' "${MANIFEST}" ## ── With system node scheduling ──────────────────────────────────── @@ -79,7 +79,7 @@ spec: - script: content: | AICR_BIN="${AICR_BIN:?Set AICR_BIN to the path of the aicr binary}" - WORK="/tmp/chainsaw-bundle-kgateway-templates" + WORK="/tmp/chainsaw-bundle-agentgateway-templates" rm -rf "${WORK}/bundle-scheduling" ${AICR_BIN} bundle \ --recipe "${WORK}/recipe.yaml" \ @@ -87,17 +87,17 @@ spec: --system-node-selector nodeGroup=system-pool - name: assert-gateway-scheduling - description: Verify GatewayParameters has nodeSelector for proxy pods. + description: Verify AgentgatewayParameters has nodeSelector for proxy pods. try: - script: content: | - WORK="/tmp/chainsaw-bundle-kgateway-templates" - MANIFEST=$(ls "${WORK}"/bundle-scheduling/[0-9][0-9][0-9]-kgateway-post/templates/inference-gateway.yaml 2>/dev/null | head -1) - [ -n "${MANIFEST}" ] || { echo "kgateway inference-gateway.yaml not found in scheduling bundle" >&2; exit 1; } - ## Verify nodeSelector was injected into GatewayParameters + WORK="/tmp/chainsaw-bundle-agentgateway-templates" + MANIFEST=$(ls "${WORK}"/bundle-scheduling/[0-9][0-9][0-9]-agentgateway-post/templates/inference-gateway.yaml 2>/dev/null | head -1) + [ -n "${MANIFEST}" ] || { echo "agentgateway inference-gateway.yaml not found in scheduling bundle" >&2; exit 1; } + ## Verify nodeSelector was injected into AgentgatewayParameters grep -q 'nodeSelector:' "${MANIFEST}" grep -q 'nodeGroup: system-pool' "${MANIFEST}" cleanup: - script: content: | - rm -rf /tmp/chainsaw-bundle-kgateway-templates + rm -rf /tmp/chainsaw-bundle-agentgateway-templates diff --git a/tests/uat/aws/tests/cuj2-inference/assert-recipe.yaml b/tests/uat/aws/tests/cuj2-inference/assert-recipe.yaml index 2633b1283..1affdd007 100644 --- a/tests/uat/aws/tests/cuj2-inference/assert-recipe.yaml +++ b/tests/uat/aws/tests/cuj2-inference/assert-recipe.yaml @@ -16,7 +16,7 @@ # # Validates that `aicr recipe --service eks --accelerator h100 --intent inference # --os ubuntu --platform dynamo` produces a valid recipe with correct criteria, -# inference-specific components (kgateway, DRA driver, KAI scheduler, Dynamo), +# inference-specific components (agentgateway, DRA driver, KAI scheduler, Dynamo), # and standard GPU stack components. kind: RecipeResult apiVersion: aicr.nvidia.com/v1alpha1 @@ -36,17 +36,16 @@ constraints: - name: OS.sysctl./proc/sys/kernel/osrelease value: '>= 6.8' componentRefs: ## alphabetically sorted + - name: agentgateway + - name: agentgateway-crds - name: aws-ebs-csi-driver - name: aws-efa - name: cert-manager - name: dynamo-platform - - name: grove - name: gpu-operator - name: grove - name: k8s-ephemeral-storage-metrics - name: kai-scheduler - - name: kgateway - - name: kgateway-crds - name: kube-prometheus-stack - name: nfd - name: nodewright-customizations @@ -55,15 +54,15 @@ componentRefs: ## alphabetically sorted - name: nvsentinel - name: prometheus-adapter deploymentOrder: + - agentgateway-crds - aws-ebs-csi-driver - aws-efa - cert-manager + - agentgateway - grove - - kgateway-crds - - kgateway - kube-prometheus-stack - k8s-ephemeral-storage-metrics - - prometheus-adapter + - nfd - nodewright-operator - nodewright-customizations - gpu-operator @@ -71,3 +70,4 @@ deploymentOrder: - dynamo-platform - nvidia-dra-driver-gpu - nvsentinel + - prometheus-adapter diff --git a/tests/uat/azure/tests/cuj2-inference/assert-recipe.yaml b/tests/uat/azure/tests/cuj2-inference/assert-recipe.yaml index 55851ac73..e151184fb 100644 --- a/tests/uat/azure/tests/cuj2-inference/assert-recipe.yaml +++ b/tests/uat/azure/tests/cuj2-inference/assert-recipe.yaml @@ -16,7 +16,7 @@ # # Validates that `aicr recipe --service aks --accelerator h100 --intent inference # --os ubuntu --platform dynamo` produces a valid recipe with correct criteria, -# inference-specific components (kgateway, DRA driver, KAI scheduler, Dynamo), +# inference-specific components (agentgateway, DRA driver, KAI scheduler, Dynamo), # and standard GPU stack components. kind: RecipeResult apiVersion: aicr.nvidia.com/v1alpha1 @@ -36,15 +36,14 @@ constraints: - name: OS.sysctl./proc/sys/kernel/osrelease value: '>= 6.8' componentRefs: ## alphabetically sorted + - name: agentgateway + - name: agentgateway-crds - name: cert-manager - name: dynamo-platform - - name: grove - name: gpu-operator - name: grove - name: k8s-ephemeral-storage-metrics - name: kai-scheduler - - name: kgateway - - name: kgateway-crds - name: kube-prometheus-stack - name: network-operator - name: nfd @@ -53,17 +52,18 @@ componentRefs: ## alphabetically sorted - name: nvsentinel - name: prometheus-adapter deploymentOrder: + - agentgateway-crds - cert-manager + - agentgateway - grove - - kgateway-crds - - kgateway - kube-prometheus-stack - - gpu-operator - k8s-ephemeral-storage-metrics + - nfd + - network-operator + - gpu-operator - kai-scheduler - dynamo-platform - - network-operator + - nodewright-operator - nvidia-dra-driver-gpu - nvsentinel - prometheus-adapter - - nodewright-operator diff --git a/tests/uat/gcp/tests/cuj2-inference/assert-recipe.yaml b/tests/uat/gcp/tests/cuj2-inference/assert-recipe.yaml index b17ba261a..9c1ad23b6 100644 --- a/tests/uat/gcp/tests/cuj2-inference/assert-recipe.yaml +++ b/tests/uat/gcp/tests/cuj2-inference/assert-recipe.yaml @@ -16,7 +16,7 @@ # # Validates that `aicr recipe --service gke --accelerator h100 --intent inference # --os cos --platform dynamo` produces a valid recipe with correct criteria, -# inference-specific components (kgateway, DRA driver, KAI scheduler, Dynamo), +# inference-specific components (agentgateway, DRA driver, KAI scheduler, Dynamo), # and standard GPU stack components. kind: RecipeResult apiVersion: aicr.nvidia.com/v1alpha1 @@ -30,15 +30,14 @@ constraints: - name: K8s.server.version value: '>= 1.34' componentRefs: ## alphabetically sorted + - name: agentgateway + - name: agentgateway-crds - name: cert-manager - name: dynamo-platform - - name: grove - name: gpu-operator - name: grove - name: k8s-ephemeral-storage-metrics - name: kai-scheduler - - name: kgateway - - name: kgateway-crds - name: kube-prometheus-stack - name: nfd - name: nodewright-customizations @@ -47,13 +46,13 @@ componentRefs: ## alphabetically sorted - name: nvsentinel - name: prometheus-adapter deploymentOrder: + - agentgateway-crds - cert-manager + - agentgateway - grove - - kgateway-crds - - kgateway - kube-prometheus-stack - k8s-ephemeral-storage-metrics - - prometheus-adapter + - nfd - nodewright-operator - nodewright-customizations - gpu-operator @@ -61,3 +60,4 @@ deploymentOrder: - dynamo-platform - nvidia-dra-driver-gpu - nvsentinel + - prometheus-adapter diff --git a/validators/conformance/inference_gateway_check.go b/validators/conformance/inference_gateway_check.go index 12246dc14..758c8648d 100644 --- a/validators/conformance/inference_gateway_check.go +++ b/validators/conformance/inference_gateway_check.go @@ -39,13 +39,13 @@ type gatewayDataPlaneReport struct { } // CheckInferenceGateway validates CNCF requirement #6: Inference Gateway. -// Verifies GatewayClass "kgateway" is accepted, Gateway "inference-gateway" is programmed, +// Verifies GatewayClass "agentgateway" is accepted, Gateway "inference-gateway" is programmed, // and required Gateway API + InferencePool CRDs exist. func CheckInferenceGateway(ctx *validators.Context) error { - // Skip if the recipe does not include kgateway (inference gateway component). + // Skip if the recipe does not include agentgateway (inference gateway component). // Training clusters typically don't have an inference gateway. - if !recipeHasComponent(ctx, "kgateway") { - return validators.Skip("kgateway not in recipe — inference gateway check applies to inference clusters only") + if !recipeHasComponent(ctx, "agentgateway") { + return validators.Skip("agentgateway not in recipe — inference gateway check applies to inference clusters only") } dynClient, err := getDynamicClient(ctx) @@ -55,13 +55,13 @@ func CheckInferenceGateway(ctx *validators.Context) error { collectGatewayControlPlaneArtifacts(ctx) - // 1. GatewayClass "kgateway" accepted + // 1. GatewayClass "agentgateway" accepted gcGVR := schema.GroupVersionResource{ Group: apiGroupGateway, Version: "v1", Resource: "gatewayclasses", } - gc, err := dynClient.Resource(gcGVR).Get(ctx.Ctx, "kgateway", metav1.GetOptions{}) + gc, err := dynClient.Resource(gcGVR).Get(ctx.Ctx, "agentgateway", metav1.GetOptions{}) if err != nil { - return errors.Wrap(errors.ErrCodeNotFound, "GatewayClass 'kgateway' not found", err) + return errors.Wrap(errors.ErrCodeNotFound, "GatewayClass 'agentgateway' not found", err) } gcCond, condErr := getConditionObservation(gc, "Accepted") if condErr != nil { @@ -74,7 +74,7 @@ func CheckInferenceGateway(ctx *validators.Context) error { } controllerName, _, _ := unstructured.NestedString(gc.Object, "spec", "controllerName") recordRawTextArtifact(ctx, "GatewayClass", - "kubectl get gatewayclass kgateway -o yaml", + "kubectl get gatewayclass agentgateway -o yaml", fmt.Sprintf("Name: %s\nControllerName: %s\nAccepted: %s\nReason: %s\nMessage: %s", gc.GetName(), valueOrUnknown(controllerName), gcCond.Status, gcCond.Reason, gcCond.Message)) @@ -82,7 +82,7 @@ func CheckInferenceGateway(ctx *validators.Context) error { gwGVR := schema.GroupVersionResource{ Group: apiGroupGateway, Version: "v1", Resource: "gateways", } - gw, err := dynClient.Resource(gwGVR).Namespace("kgateway-system").Get( + gw, err := dynClient.Resource(gwGVR).Namespace("agentgateway-system").Get( ctx.Ctx, "inference-gateway", metav1.GetOptions{}) if err != nil { return errors.Wrap(errors.ErrCodeNotFound, "Gateway 'inference-gateway' not found", err) @@ -106,7 +106,7 @@ func CheckInferenceGateway(ctx *validators.Context) error { fmt.Sprintf("Name: %s/%s\nProgrammed: %s\nReason: %s\nMessage: %s\nAddressCount: %d", gw.GetNamespace(), gw.GetName(), gwCond.Status, gwCond.Reason, gwCond.Message, addressCount)) recordObjectYAMLArtifact(ctx, "Gateway details", - "kubectl get gateway inference-gateway -n kgateway-system -o yaml", gw.Object) + "kubectl get gateway inference-gateway -n agentgateway-system -o yaml", gw.Object) // 3. Required CRDs exist crdGVR := schema.GroupVersionResource{ @@ -133,7 +133,7 @@ func CheckInferenceGateway(ctx *validators.Context) error { return err } recordRawTextArtifact(ctx, "Gateway Data Plane", - "kubectl get endpointslices -n kgateway-system", + "kubectl get endpointslices -n agentgateway-system", fmt.Sprintf("Listeners: %d\nAttached HTTPRoutes: %d\nHTTPRoutes (all): %d\nMatching EndpointSlices: %d\nReady endpoints: %d", report.ListenerCount, report.AttachedHTTPRoutes, report.TotalHTTPRoutes, report.MatchingEndpointSlice, report.ReadyEndpoints)) @@ -159,7 +159,7 @@ func validateGatewayDataPlane(ctx *validators.Context) (*gatewayDataPlaneReport, gwGVR := schema.GroupVersionResource{ Group: apiGroupGateway, Version: "v1", Resource: "gateways", } - gw, gwErr := dynClient.Resource(gwGVR).Namespace("kgateway-system").Get( + gw, gwErr := dynClient.Resource(gwGVR).Namespace("agentgateway-system").Get( ctx.Ctx, "inference-gateway", metav1.GetOptions{}) if gwErr == nil { listeners, found, _ := unstructured.NestedSlice(gw.Object, "status", "listeners") @@ -204,11 +204,11 @@ func validateGatewayDataPlane(ctx *validators.Context) (*gatewayDataPlaneReport, // 3. Endpoint readiness (hard requirement): verify inference-gateway proxy has ready endpoints. // Filter by kubernetes.io/service-name containing "inference-gateway" to avoid matching // unrelated services in the namespace (e.g. controller manager, webhooks). - slices, err := ctx.Clientset.DiscoveryV1().EndpointSlices("kgateway-system").List( + slices, err := ctx.Clientset.DiscoveryV1().EndpointSlices("agentgateway-system").List( ctx.Ctx, metav1.ListOptions{}) if err != nil { return nil, errors.Wrap(errors.ErrCodeInternal, - "failed to list EndpointSlices in kgateway-system", err) + "failed to list EndpointSlices in agentgateway-system", err) } for _, slice := range slices.Items { @@ -226,7 +226,7 @@ func validateGatewayDataPlane(ctx *validators.Context) (*gatewayDataPlaneReport, if report.ReadyEndpoints == 0 { return nil, errors.New(errors.ErrCodeInternal, - "no ready endpoints for inference-gateway proxy in kgateway-system") + "no ready endpoints for inference-gateway proxy in agentgateway-system") } return report, nil @@ -237,10 +237,10 @@ func collectGatewayControlPlaneArtifacts(ctx *validators.Context) { return } - deploys, deployErr := ctx.Clientset.AppsV1().Deployments("kgateway-system").List( + deploys, deployErr := ctx.Clientset.AppsV1().Deployments("agentgateway-system").List( ctx.Ctx, metav1.ListOptions{}) if deployErr != nil { - recordRawTextArtifact(ctx, "kgateway deployments", "kubectl get deploy -n kgateway-system", + recordRawTextArtifact(ctx, "agentgateway deployments", "kubectl get deploy -n agentgateway-system", fmt.Sprintf("failed to list deployments: %v", deployErr)) } else { var deploymentSummary strings.Builder @@ -252,12 +252,12 @@ func collectGatewayControlPlaneArtifacts(ctx *validators.Context) { fmt.Fprintf(&deploymentSummary, "%-40s available=%d/%d image=%s\n", d.Name, d.Status.AvailableReplicas, expected, firstContainerImage(d.Spec.Template.Spec.Containers)) } - recordRawTextArtifact(ctx, "kgateway deployments", "kubectl get deploy -n kgateway-system", deploymentSummary.String()) + recordRawTextArtifact(ctx, "agentgateway deployments", "kubectl get deploy -n agentgateway-system", deploymentSummary.String()) } - pods, podErr := ctx.Clientset.CoreV1().Pods("kgateway-system").List(ctx.Ctx, metav1.ListOptions{}) + pods, podErr := ctx.Clientset.CoreV1().Pods("agentgateway-system").List(ctx.Ctx, metav1.ListOptions{}) if podErr != nil { - recordRawTextArtifact(ctx, "kgateway pods", "kubectl get pods -n kgateway-system", + recordRawTextArtifact(ctx, "agentgateway pods", "kubectl get pods -n agentgateway-system", fmt.Sprintf("failed to list pods: %v", podErr)) return } @@ -266,5 +266,5 @@ func collectGatewayControlPlaneArtifacts(ctx *validators.Context) { fmt.Fprintf(&podSummary, "%-48s ready=%s phase=%s node=%s\n", pod.Name, podReadyCount(pod), pod.Status.Phase, valueOrUnknown(pod.Spec.NodeName)) } - recordRawTextArtifact(ctx, "kgateway pods", "kubectl get pods -n kgateway-system", podSummary.String()) + recordRawTextArtifact(ctx, "agentgateway pods", "kubectl get pods -n agentgateway-system", podSummary.String()) }