Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/scripts/gpu-debug-diagnostics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -259,9 +259,9 @@ print_dynamo_diagnostics() {
kubectl_kind -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
}

print_kgateway_diagnostics() {
echo "=== kgateway pods ==="
kubectl_kind -n kgateway-system get pods -o wide 2>/dev/null || true
print_agentgateway_diagnostics() {
echo "=== agentgateway pods ==="
kubectl_kind -n agentgateway-system get pods -o wide 2>/dev/null || true
echo "=== GatewayClass status ==="
kubectl_kind get gatewayclass -o yaml 2>/dev/null || true
echo "=== Gateway status ==="
Expand All @@ -280,9 +280,9 @@ case "${mode}" in
print_kubeflow_diagnostics
;;
inference)
print_h100_common_diagnostics dynamo-system kgateway-system
print_h100_common_diagnostics dynamo-system agentgateway-system
print_dynamo_diagnostics
print_kgateway_diagnostics
print_agentgateway_diagnostics
;;
*)
echo "::error::unknown GPU_TEST_DIAGNOSTIC_MODE: ${mode}"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/gpu-h100-inference-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ jobs:
- 'recipes/components/nodewright-operator/**'
- 'recipes/components/nvidia-dra-driver-gpu/**'
- 'recipes/components/nvsentinel/**'
- 'recipes/components/kgateway/**'
- 'recipes/components/kgateway-crds/**'
- 'recipes/components/agentgateway/**'
- 'recipes/components/agentgateway-crds/**'
- 'recipes/components/grove/**'
- 'recipes/components/dynamo-platform/**'
- 'recipes/components/prometheus-adapter/**'
Expand Down
19 changes: 10 additions & 9 deletions demos/cuj2-demo.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@
│ ├── gpu-operator/ (GPU driver, device-plugin, DCGM) │
│ ├── nvidia-dra-driver-gpu/ (Dynamic Resource Allocation) │
│ ├── kai-scheduler/ (gang scheduling) │
│ ├── kgateway-crds/ (Gateway API + inference CRDs) │
│ ├── kgateway/ (inference gateway controller) │
│ ├── agentgateway-crds/ (Gateway API + inference CRDs) │
│ ├── agentgateway/ (inference gateway controller) │
│ ├── nvsentinel/ (security/compliance) │
│ ├── nodewright-operator/ (node configuration) │
│ ├── nodewright-customizations/ (H100 tuning) │
Expand All @@ -60,13 +60,13 @@
│ $ cd bundle && ./deploy.sh │
│ │
│ cert-manager ──▶ kube-prometheus-stack ──▶ gpu-operator ──▶ │
│ kai-scheduler ──▶ kgateway ──▶ nvidia-dra-driver ──▶
│ kai-scheduler ──▶ agentgateway ──▶ nvidia-dra-driver ──▶ │
│ dynamo-platform ──▶ nodewright ──▶ nvsentinel ──▶ ... │
│ │
│ Result: Fully configured GPU cluster │
│ • 8x H100 GPUs advertised via DRA │
│ • Gang scheduling (KAI Scheduler) │
│ • Inference gateway (kgateway)
│ • Inference gateway (agentgateway)
│ • GPU metrics (DCGM → Prometheus → HPA) │
│ • Dynamo inference platform │
└────────────────────────────────────────────────────────────────────────┘
Expand Down Expand Up @@ -114,8 +114,8 @@
│ └── aws-efa │ └── aws-efa │
│ │ │ │ │
│ eks-training.yaml │ eks-inference.yaml │
│ (no new components) │ ├── kgateway-crds ◀── NEW │
│ │ │ └── kgateway ◀── NEW │
│ (no new components) │ ├── agentgateway-crds ◀── NEW │
│ │ │ └── agentgateway ◀── NEW │
│ │ │ │ │
│ h100-eks-training.yaml │ h100-eks-inference.yaml │
│ ├── gpu-operator (CDI, gdrcopy) │ └── nodewright-customizations │
Expand All @@ -130,7 +130,8 @@
│ │ └── dynamo-platform ◀─ NEW │
│ │ │
├─────────────────────────────────────┼─────────────────────────────────────┤
│ Unique: kubeflow-trainer │ Unique: kgateway-crds, kgateway, │
│ Unique: kubeflow-trainer │ Unique: agentgateway-crds, │
│ │ agentgateway, │
│ │ dynamo-crds, dynamo-platform │
├─────────────────────────────────────┴─────────────────────────────────────┤
│ Shared (base + eks): cert-manager, kube-prometheus-stack, gpu-operator, │
Expand Down Expand Up @@ -253,7 +254,7 @@ http://127.0.0.1:9090/chat.html
│ │ │ toolkit, DCGM, validator) │ │
│ 4 │ accelerator_metrics │ gpu-operator (DCGM exporter) │ base │
│ 5 │ ai_service_metrics │ kube-prometheus-stack, prometheus-adapter│ base │
│ 6 │ ai_inference │ kgateway-crds, kgateway │ eks-inf │
│ 6 │ ai_inference │ agentgateway-crds, agentgateway │ eks-inf │
│ 7 │ robust_controller │ dynamo-crds, dynamo-platform │ dynamo │
│ 8 │ pod_autoscaling │ prometheus-adapter + HPA │ base │
│ 9 │ cluster_autoscaling │ EKS Auto Scaling Group (ASG) │ infra │
Expand All @@ -263,7 +264,7 @@ http://127.0.0.1:9090/chat.html
│ DRA, gang scheduling, secure access, accelerator metrics, │
│ AI service metrics, pod autoscaling │
│ │
│ eks-inference layer (+1): inference gateway (kgateway)
│ eks-inference layer (+1): inference gateway (agentgateway)
│ dynamo layer (+1): robust controller (Dynamo operator) │
│ infra layer (+1): cluster autoscaling (EKS ASG) │
│ │
Expand Down
2 changes: 1 addition & 1 deletion demos/cuj2-eks.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ kubectl get dynamographdeployments -n dynamo-workload
kubectl get pods -n dynamo-workload -o wide -w

# Verify the inference gateway routes to the workload
kubectl get gateway inference-gateway -n kgateway-system
kubectl get gateway inference-gateway -n agentgateway-system
kubectl get inferencepool -n dynamo-workload
```

Expand Down
2 changes: 1 addition & 1 deletion demos/cuj2-gke.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ kubectl get dynamographdeployments -n dynamo-workload
kubectl get pods -n dynamo-workload -o wide -w

# Verify the inference gateway routes to the workload
kubectl get gateway inference-gateway -n kgateway-system
kubectl get gateway inference-gateway -n agentgateway-system
kubectl get inferencepool -n dynamo-workload
```

Expand Down
4 changes: 2 additions & 2 deletions demos/images/meta.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ Visual: Single input forking into two divergent paths
│ Unique: │ │ Unique: │
│ kubeflow-trainer │ │ dynamo-crds │
│ │ │ dynamo-platform │
│ GPU Operator: │ │ kgateway-crds
│ CDI=true │ │ kgateway
│ GPU Operator: │ │ agentgateway-crds │
│ CDI=true │ │ agentgateway
│ gdrcopy=true │ │ │
│ │ │ DRA driver: │
│ │ │ gpuResources=true │
Expand Down
2 changes: 1 addition & 1 deletion demos/query.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ aicr query --service eks --accelerator h100 --intent inference --os ubuntu \
diff /tmp/training.txt /tmp/inference.txt
```

> `> kgateway` and `> kgateway-crds` — the Inference Gateway is added only
> `> agentgateway` and `> agentgateway-crds` — the Inference Gateway is added only
> when `--intent inference`.

CDI defaults also flip:
Expand Down
4 changes: 2 additions & 2 deletions docs/user/api-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -359,8 +359,8 @@ Bundler names correspond to component names in [`recipes/registry.yaml`](https:/
| `kai-scheduler` | DRA-aware gang scheduler with topology-aware placement |
| `grove` | Dynamo pod lifecycle management |
| `dynamo-platform` | NVIDIA Dynamo inference serving platform |
| `kgateway-crds` | Kubernetes Gateway API CRDs |
| `kgateway` | Kubernetes Gateway API implementation |
| `agentgateway-crds` | Kubernetes Gateway API CRDs for AI/ML inference (Gateway API + Inference Extension) |
| `agentgateway` | Kubernetes Gateway API implementation for AI/ML inference (InferencePool routing) |
| `k8s-nim-operator` | NVIDIA NIM Operator for inference microservice deployments |
| `kueue` | Kubernetes-native job queuing for batch and AI workloads |
| `kubeflow-trainer` | Kubeflow Training Operator for distributed training |
Expand Down
2 changes: 1 addition & 1 deletion docs/user/cli-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -1111,7 +1111,7 @@ aicr bundle --recipe recipe.yaml \
This results in:
- **GPU daemonsets** (driver, device-plugin, toolkit, dcgm): `nodeSelector=nodeGroup=gpu-worker` + tolerations for `dedicated=worker-workload` with both `NoSchedule` and `NoExecute`
- **NFD workers**: no nodeSelector (runs on all nodes) + tolerations for `dedicated=worker-workload` with both `NoSchedule` and `NoExecute`
- **System components** (gpu-operator controller, NFD gc/master, dynamo grove, kgateway proxy): `nodeSelector=nodeGroup=system-worker` + tolerations for `dedicated=system-workload` with both `NoSchedule` and `NoExecute`
- **System components** (gpu-operator controller, NFD gc/master, dynamo grove, agentgateway proxy): `nodeSelector=nodeGroup=system-worker` + tolerations for `dedicated=system-workload` with both `NoSchedule` and `NoExecute`

**Behavior:**
- All components from the recipe are bundled automatically
Expand Down
6 changes: 3 additions & 3 deletions docs/user/component-catalog.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ The source of truth is [`recipes/registry.yaml`](https://github.com/NVIDIA/aicr/
| **kai-scheduler** | DRA-aware gang scheduler with hierarchical queues and topology-aware placement. Ensures distributed training jobs land on nodes with optimal interconnect topology. | [KAI Scheduler](https://github.com/kai-scheduler/KAI-Scheduler) |
| **grove** | Pod lifecycle management for Dynamo inference platform. Installed as a standalone component. | [Grove](https://github.com/ai-dynamo/grove) |
| **dynamo-platform** | NVIDIA Dynamo inference serving platform with bundled CRDs. Distributed inference with prefix-cache-aware routing and disaggregated prefill/decode. | [Dynamo](https://github.com/ai-dynamo/dynamo) |
| **kgateway-crds** | Custom Resource Definitions for kgateway (Kubernetes Gateway API implementation). | [kgateway](https://github.com/kgateway-dev/kgateway) |
| **kgateway** | Kubernetes Gateway API implementation. Provides model-aware ingress routing for inference workloads. | [kgateway](https://github.com/kgateway-dev/kgateway) |
| **agentgateway-crds** | Custom Resource Definitions for agentgateway (Kubernetes Gateway API implementation for AI/ML inference). | [agentgateway](https://github.com/agentgateway/agentgateway) |
| **agentgateway** | Kubernetes Gateway API implementation for AI/ML inference. Implements the Gateway API Inference Extension for model-aware ingress routing to InferencePool backends. | [agentgateway](https://github.com/agentgateway/agentgateway) |
| **k8s-nim-operator** | NVIDIA NIM Operator for managing NIM (NVIDIA Inference Microservices) deployments on Kubernetes. | [K8s NIM Operator](https://github.com/NVIDIA/k8s-nim-operator) |
| **kueue** | Kubernetes-native job queuing system. Manages quotas and admits jobs for batch and AI workloads. | [Kueue](https://github.com/kubernetes-sigs/kueue) |
| **kubeflow-trainer** | Kubeflow Training Operator for distributed training jobs (PyTorch, etc.). Manages multi-node training job lifecycle with JobSet integration. | [Kubeflow Trainer](https://github.com/kubeflow/trainer) |
Expand All @@ -41,7 +41,7 @@ Not every component appears in every recipe. The recipe engine selects component

- **Base components** (cert-manager, kube-prometheus-stack) appear in most recipes.
- **Cloud-specific components** (aws-efa, aws-ebs-csi-driver) are added when the service matches.
- **Intent-specific components** (kgateway, kgateway-crds) are added based on workload intent (e.g., inference recipes include the inference gateway).
- **Intent-specific components** (agentgateway, agentgateway-crds) are added based on workload intent (e.g., inference recipes include the inference gateway).
- **Platform-specific components** (slinky-slurm-operator, kubeflow-trainer, dynamo-platform) are added when the recipe selects a matching `--platform`.
- **Accelerator/OS-specific tuning** (nodewright-customizations, nvidia-dra-driver-gpu) varies by hardware and OS combination.

Expand Down
24 changes: 12 additions & 12 deletions docs/user/container-images.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@ A machine-readable **CycloneDX 1.6 JSON** companion to this page is produced by
- Unique images: **71**
- Distinct registries: **11**

Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.kgateway.dev`, `docker.io`, `gcr.io`, `ghcr.io`, `gke.gcr.io`, `nvcr.io`, `public.ecr.aws`, `quay.io`, `registry.k8s.io`, `us-docker.pkg.dev`
Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.agentgateway.dev`, `docker.io`, `gcr.io`, `ghcr.io`, `gke.gcr.io`, `nvcr.io`, `public.ecr.aws`, `quay.io`, `registry.k8s.io`, `us-docker.pkg.dev`

## Components

| Component | Type | Chart | Pinned Version | Images |
|-----------|------|-------|----------------|--------|
| agentgateway | helm | agentgateway | v2.2.1 | 1 |
| agentgateway-crds | helm | agentgateway-crds | v2.2.1 | 0 |
| aws-ebs-csi-driver | helm | aws-ebs-csi-driver/aws-ebs-csi-driver | 2.59.0 | 6 |
| aws-efa | helm | aws-efa-k8s-device-plugin | v0.5.26 | 1 |
| cert-manager | helm | jetstack/cert-manager | v1.20.2 | 4 |
Expand All @@ -39,8 +41,6 @@ Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.kgateway.dev`, `
| k8s-ephemeral-storage-metrics | helm | k8s-ephemeral-storage-metrics/k8s-ephemeral-storage-metrics | 1.19.2 | 1 |
| k8s-nim-operator | helm | k8s-nim-operator | 3.1.0 | 1 |
| kai-scheduler | helm | kai-scheduler | v0.14.1 | 2 |
| kgateway | helm | kgateway | v2.0.0 | 1 |
| kgateway-crds | helm | kgateway-crds | v2.0.0 | 0 |
| kube-prometheus-stack | helm | prometheus-community/kube-prometheus-stack | 84.4.0 | 8 |
| kubeflow-trainer | helm | kubeflow-trainer | 2.2.0 | 3 |
| kueue | helm | kueue | 0.17.1 | 1 |
Expand All @@ -56,6 +56,14 @@ Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.kgateway.dev`, `

## Images by component

### agentgateway

- `cr.agentgateway.dev/controller:v2.2.1`

### agentgateway-crds

_No images extracted._

### aws-ebs-csi-driver

- `public.ecr.aws/csi-components/csi-attacher:v4.11.0-eksbuild.4`
Expand Down Expand Up @@ -121,14 +129,6 @@ Registries: `602401143452.dkr.ecr.us-west-2.amazonaws.com`, `cr.kgateway.dev`, `
- `ghcr.io/kai-scheduler/kai-scheduler/crd-upgrader:v0.14.1`
- `ghcr.io/kai-scheduler/kai-scheduler/operator:v0.14.1`

### kgateway

- `cr.kgateway.dev/kgateway-dev/kgateway:v2.0.0`

### kgateway-crds

_No images extracted._

### kube-prometheus-stack

- `docker.io/grafana/grafana:13.0.1`
Expand Down Expand Up @@ -222,7 +222,7 @@ AICR pulls from a deliberately diverse set of registries:
- **`public.ecr.aws`** — AWS public artifacts (aws-ebs-csi-driver).
- **Regional ECR** (`<account>.dkr.ecr.<region>.amazonaws.com`) — EKS-internal add-ons. The `aws-efa` entry below shows `us-west-2` because that is the in-tree default; deployments in other regions override `awsefa:image.repository` at bundle or install time. See [Regional registry overrides](../integrator/recipe-development.md#regional-registry-overrides) for the pattern.
- **`gcr.io`, `gke.gcr.io`, `us-docker.pkg.dev`** — GCP/GKE add-ons (gke-nccl-tcpxo).
- **`cr.kgateway.dev`** — kgateway.
- **`cr.agentgateway.dev`** — agentgateway (AI inference gateway).
- **`docker.io`** — assorted upstream images (`busybox`, `pytorch`, etc.).

Customers running in air-gapped or private-registry environments need to mirror every registry above. A dedicated mirroring guide is tracked under [#743](https://github.com/NVIDIA/aicr/issues/743).
Expand Down
10 changes: 5 additions & 5 deletions pkg/bundler/deployer/helm/helm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1428,14 +1428,14 @@ func TestUndeployScript_PreflightSkipListCoversManifestDeletedReleases(t *testin
}{Version: "v0.1.0"},
ComponentRefs: []recipe.ComponentRef{
{Name: "cert-manager", Namespace: "cert-manager", Chart: "cert-manager", Version: "v1.17.2", Source: "https://charts.jetstack.io"},
{Name: "kgateway", Namespace: "kgateway-system", Chart: "kgateway", Version: "v0.1.0", Source: "https://example.invalid/charts"},
{Name: "agentgateway", Namespace: "agentgateway-system", Chart: "agentgateway", Version: "v0.1.0", Source: "https://example.invalid/charts"},
{Name: "nodewright-operator", Namespace: "skyhook", Chart: "nodewright-operator", Version: "v0.1.0", Source: "https://example.invalid/charts"},
},
DeploymentOrder: []string{"cert-manager", "kgateway", "nodewright-operator"},
DeploymentOrder: []string{"cert-manager", "agentgateway", "nodewright-operator"},
},
ComponentValues: map[string]map[string]any{
"cert-manager": {},
"kgateway": {},
"agentgateway": {},
"nodewright-operator": {},
},
Version: "v1.0.0",
Expand All @@ -1449,7 +1449,7 @@ func TestUndeployScript_PreflightSkipListCoversManifestDeletedReleases(t *testin
snippet=$(sed -n '/^skip_preflight_for_release()/,/^}/p' "$UNDEPLOY")
eval "$snippet"
skip_preflight_for_release "nodewright-operator" && echo "skip:nodewright-operator"
skip_preflight_for_release "kgateway" && echo "skip:kgateway"
skip_preflight_for_release "agentgateway" && echo "skip:agentgateway"
if skip_preflight_for_release "cert-manager"; then
echo "unexpected:cert-manager"
exit 1
Expand All @@ -1470,7 +1470,7 @@ func TestUndeployScript_PreflightSkipListCoversManifestDeletedReleases(t *testin
}

out := stdout.String()
for _, want := range []string{"skip:nodewright-operator", "skip:kgateway", "check:cert-manager"} {
for _, want := range []string{"skip:nodewright-operator", "skip:agentgateway", "check:cert-manager"} {
if !strings.Contains(out, want) {
t.Errorf("expected %q in output; stdout=%q stderr=%q", want, out, stderr.String())
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/bundler/deployer/helm/templates/undeploy.sh.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ extra_crds_for_release() {
# deleted from manifests before the controller is uninstalled.
skip_preflight_for_release() {
case "$1" in
nodewright-operator|kgateway) return 0 ;;
nodewright-operator|agentgateway) return 0 ;;
*) return 1 ;;
esac
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ extra_crds_for_release() {
# deleted from manifests before the controller is uninstalled.
skip_preflight_for_release() {
case "$1" in
nodewright-operator|kgateway) return 0 ;;
nodewright-operator|agentgateway) return 0 ;;
*) return 1 ;;
esac
}
Expand Down
Loading
Loading