From 927d91aedc5f491ccb22dd195f201e55150a4234 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 20 Mar 2026 17:03:55 +0100 Subject: [PATCH 1/4] feat(sandbox): switch device plugin to CDI injection mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Configure the NVIDIA device plugin to use deviceListStrategy=cdi-cri so that GPU devices are injected via direct CDI device requests in the CRI. Sandbox pods now only require the nvidia.com/gpu resource request — runtimeClassName is no longer set on GPU pods. Signed-off-by: Evan Lezar --- architecture/gateway-single-node.md | 8 +- crates/openshell-server/src/sandbox/mod.rs | 87 +++++++++++++------ .../nvidia-device-plugin-helmchart.yaml | 8 ++ 3 files changed, 73 insertions(+), 30 deletions(-) diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 57aebd3a..1f4e04cd 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -300,7 +300,7 @@ GPU support is part of the single-node gateway bootstrap path rather than a sepa - When enabled, the cluster container is created with Docker `DeviceRequests`, which is the API equivalent of `docker run --gpus all`. - `deploy/docker/Dockerfile.images` installs NVIDIA Container Toolkit packages in a dedicated Ubuntu stage and copies the runtime binaries, config, and `libnvidia-container` shared libraries into the final Ubuntu-based cluster image. - `deploy/docker/cluster-entrypoint.sh` checks `GPU_ENABLED=true` and copies GPU-only manifests from `/opt/openshell/gpu-manifests/` into k3s's manifests directory. -- `deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml` installs the NVIDIA device plugin chart, currently pinned to `0.18.2`. NFD and GFD are disabled; the device plugin's default `nodeAffinity` (which requires `feature.node.kubernetes.io/pci-10de.present=true` or `nvidia.com/gpu.present=true` from NFD/GFD) is overridden to empty so the DaemonSet schedules on the single-node cluster without requiring those labels. +- `deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml` installs the NVIDIA device plugin chart, currently pinned to `0.18.2`. NFD and GFD are disabled; the device plugin's default `nodeAffinity` (which requires `feature.node.kubernetes.io/pci-10de.present=true` or `nvidia.com/gpu.present=true` from NFD/GFD) is overridden to empty so the DaemonSet schedules on the single-node cluster without requiring those labels. The chart is configured with `deviceListStrategy: cdi-cri` so the device plugin injects devices via direct CDI device requests in the CRI. - k3s auto-detects `nvidia-container-runtime` on `PATH`, registers the `nvidia` containerd runtime, and creates the `nvidia` `RuntimeClass` automatically. - The OpenShell Helm chart grants the gateway service account cluster-scoped read access to `node.k8s.io/runtimeclasses` and core `nodes` so GPU sandbox admission can verify both the `nvidia` `RuntimeClass` and allocatable GPU capacity before creating a sandbox. 
@@ -311,10 +311,12 @@ Host GPU drivers & NVIDIA Container Toolkit └─ Docker: --gpus all (DeviceRequests in bollard API) └─ k3s/containerd: nvidia-container-runtime on PATH -> auto-detected └─ k8s: nvidia-device-plugin DaemonSet advertises nvidia.com/gpu - └─ Pods: request nvidia.com/gpu in resource limits + └─ Pods: request nvidia.com/gpu in resource limits (CDI injection — no runtimeClassName needed) ``` -The expected smoke test is a plain pod requesting `nvidia.com/gpu: 1` with `runtimeClassName: nvidia` and running `nvidia-smi`. +Device injection uses CDI (`deviceListStrategy: cdi-cri`): the device plugin injects devices via direct CDI device requests in the CRI. Sandbox pods only need `nvidia.com/gpu: 1` in their resource limits — no `runtimeClassName` field is set on GPU pods. + +The expected smoke test is a plain pod requesting `nvidia.com/gpu: 1` (without `runtimeClassName`) and running `nvidia-smi`. ## Remote Image Transfer diff --git a/crates/openshell-server/src/sandbox/mod.rs b/crates/openshell-server/src/sandbox/mod.rs index e10b33d0..3dca6649 100644 --- a/crates/openshell-server/src/sandbox/mod.rs +++ b/crates/openshell-server/src/sandbox/mod.rs @@ -31,7 +31,6 @@ pub const SANDBOX_KIND: &str = "Sandbox"; const SANDBOX_ID_LABEL: &str = "openshell.ai/sandbox-id"; const SANDBOX_MANAGED_LABEL: &str = "openshell.ai/managed-by"; const SANDBOX_MANAGED_VALUE: &str = "openshell"; -const GPU_RUNTIME_CLASS_NAME: &str = "nvidia"; const GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; const GPU_RESOURCE_QUANTITY: &str = "1"; @@ -127,25 +126,6 @@ impl SandboxClient { } pub async fn validate_gpu_support(&self) -> Result<(), tonic::Status> { - let runtime_classes: Api = Api::all_with( - self.client.clone(), - &ApiResource::from_gvk(&GroupVersionKind::gvk("node.k8s.io", "v1", "RuntimeClass")), - ); - - let runtime_class_exists = runtime_classes - .get_opt(GPU_RUNTIME_CLASS_NAME) - .await - .map_err(|err| { - tonic::Status::internal(format!("check GPU runtime class failed: {err}")) - })? - .is_some(); - - if !runtime_class_exists { - return Err(tonic::Status::failed_precondition( - "GPU sandbox requested, but the active gateway is not GPU-enabled. 
To start a gateway with GPU support run: `openshell gateway start --gpu`", - )); - } - let nodes: Api = Api::all(self.client.clone()); let node_list = nodes.list(&ListParams::default()).await.map_err(|err| { tonic::Status::internal(format!("check GPU node capacity failed: {err}")) @@ -869,12 +849,7 @@ fn sandbox_template_to_k8s( } let mut spec = serde_json::Map::new(); - if gpu { - spec.insert( - "runtimeClassName".to_string(), - serde_json::json!(GPU_RUNTIME_CLASS_NAME), - ); - } else if !template.runtime_class_name.is_empty() { + if !template.runtime_class_name.is_empty() { spec.insert( "runtimeClassName".to_string(), serde_json::json!(template.runtime_class_name), @@ -1660,7 +1635,7 @@ mod tests { assert_eq!( pod_template["spec"]["runtimeClassName"], - serde_json::json!(GPU_RUNTIME_CLASS_NAME) + serde_json::Value::Null ); assert_eq!( pod_template["spec"]["containers"][0]["resources"]["limits"][GPU_RESOURCE_NAME], @@ -1668,6 +1643,64 @@ mod tests { ); } + #[test] + fn gpu_sandbox_uses_template_runtime_class_name_when_set() { + let template = SandboxTemplate { + runtime_class_name: "kata-containers".to_string(), + ..SandboxTemplate::default() + }; + + let pod_template = sandbox_template_to_k8s( + &template, + true, + "openshell/sandbox:latest", + "", + "sandbox-id", + "sandbox-name", + "https://gateway.example.com", + "0.0.0.0:2222", + "secret", + 300, + &std::collections::HashMap::new(), + "", + "", + ); + + assert_eq!( + pod_template["spec"]["runtimeClassName"], + serde_json::json!("kata-containers") + ); + } + + #[test] + fn non_gpu_sandbox_uses_template_runtime_class_name_when_set() { + let template = SandboxTemplate { + runtime_class_name: "kata-containers".to_string(), + ..SandboxTemplate::default() + }; + + let pod_template = sandbox_template_to_k8s( + &template, + false, + "openshell/sandbox:latest", + "", + "sandbox-id", + "sandbox-name", + "https://gateway.example.com", + "0.0.0.0:2222", + "secret", + 300, + &std::collections::HashMap::new(), + "", + "", + ); + + assert_eq!( + pod_template["spec"]["runtimeClassName"], + serde_json::json!("kata-containers") + ); + } + #[test] fn gpu_sandbox_preserves_existing_resource_limits() { let template = SandboxTemplate { diff --git a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml index 088562ac..4ad6512a 100644 --- a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml +++ b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml @@ -12,6 +12,10 @@ # (which requires nvidia.com/gpu.present=true) is overridden to empty # so it schedules on any node without requiring NFD/GFD labels. # +# CDI injection mode: the device plugin uses deviceListStrategy=cdi-cri so that +# devices are injected via CDI hooks before container start. Sandbox pods only +# need the nvidia.com/gpu resource request — no runtimeClassName is required. +# # k3s auto-detects nvidia-container-runtime on PATH and registers the "nvidia" # RuntimeClass automatically, so no manual RuntimeClass manifest is needed. 
@@ -28,6 +32,10 @@ spec: createNamespace: true valuesContent: |- runtimeClassName: nvidia + deviceListStrategy: cdi-cri + cdi: + nvidiaHookPath: /usr/bin/nvidia-cdi-hook + nvidiaDriverRoot: "/" gfd: enabled: false nfd: From dae37a202f7ca49f0c5e637e51355a2f17f426aa Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 20 Mar 2026 17:54:52 +0100 Subject: [PATCH 2/4] docs(debug-skill): add CDI device plugin diagnostics for GPU gateways Signed-off-by: Evan Lezar --- .../skills/debug-openshell-cluster/SKILL.md | 44 ++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 4d0e4659..5b3b6375 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -256,7 +256,43 @@ Look for: - `OOMKilled` — memory limits too low - `FailedMount` — volume issues -### Step 8: Check DNS Resolution +### Step 8: Check GPU Device Plugin and CDI (GPU gateways only) + +Skip this step for non-GPU gateways. + +The NVIDIA device plugin DaemonSet must be running and healthy before GPU sandboxes can be created. It uses CDI injection (`deviceListStrategy: cdi-cri`) to inject GPU devices into sandbox pods — no `runtimeClassName` is set on sandbox pods. + +```bash +# DaemonSet status — numberReady must be >= 1 +openshell doctor exec -- kubectl get daemonset -n nvidia-device-plugin + +# Device plugin pod logs — look for "CDI" lines confirming CDI mode is active +openshell doctor exec -- kubectl logs -n nvidia-device-plugin -l app.kubernetes.io/name=nvidia-device-plugin --tail=50 + +# List CDI devices registered by the device plugin (requires nvidia-ctk in the cluster image). +# Device plugin CDI entries use the vendor string "k8s.device-plugin.nvidia.com" so entries +# will be prefixed "k8s.device-plugin.nvidia.com/gpu=". If the list is empty, CDI spec +# generation has not completed yet. +openshell doctor exec -- nvidia-ctk cdi list + +# Verify CDI spec files were generated on the node +openshell doctor exec -- ls /var/run/cdi/ + +# Helm install job logs for the device plugin chart +openshell doctor exec -- kubectl -n kube-system logs -l job-name=helm-install-nvidia-device-plugin --tail=100 + +# Confirm a GPU sandbox pod has no runtimeClassName (CDI injection, not runtime class) +openshell doctor exec -- kubectl get pod -n openshell -o jsonpath='{range .items[*]}{.metadata.name}{" runtimeClassName="}{.spec.runtimeClassName}{"\n"}{end}' +``` + +Common issues: + +- **DaemonSet 0/N ready**: The device plugin chart may still be deploying (k3s Helm controller can take 1–2 min) or the pod is crashing. Check pod logs. +- **`nvidia-ctk cdi list` returns no `k8s.device-plugin.nvidia.com/gpu=` entries**: CDI spec generation has not completed. The device plugin may still be starting or the `cdi-cri` strategy isn't active. Verify `deviceListStrategy: cdi-cri` is in the rendered Helm values. +- **No CDI spec files at `/var/run/cdi/`**: Same as above — device plugin hasn't written CDI specs yet. +- **`HEALTHCHECK_GPU_DEVICE_PLUGIN_NOT_READY` in health check logs**: Device plugin has no ready pods. Check DaemonSet events and pod logs. 
+
+### Step 9: Check DNS Resolution
 
 DNS misconfiguration is a common root cause, especially on remote/Linux hosts:
 
@@ -315,6 +351,7 @@ If DNS is broken, all image pulls from the distribution registry will fail, as w
 | gRPC `UNIMPLEMENTED` for newer RPCs in push mode | Helm values still point at older pulled images instead of the pushed refs | Verify rendered `openshell-helmchart.yaml` uses the expected push refs (`server`, `sandbox`, `pki-job`) and not `:latest` |
 | Sandbox pods crash with `/opt/openshell/bin/openshell-sandbox: no such file or directory` | Supervisor binary missing from cluster image | The cluster image was built/published without the `supervisor-builder` target in `deploy/docker/Dockerfile.images`. Rebuild with `mise run docker:build:cluster` and recreate gateway. Bootstrap auto-detects via `HEALTHCHECK_MISSING_SUPERVISOR` marker |
 | `HEALTHCHECK_MISSING_SUPERVISOR` in health check logs | `/opt/openshell/bin/openshell-sandbox` not found in gateway container | Rebuild cluster image: `mise run docker:build:cluster`, then `openshell gateway destroy && openshell gateway start` |
+| `nvidia-ctk cdi list` returns no `k8s.device-plugin.nvidia.com/gpu=` entries | CDI specs not yet generated by device plugin | Device plugin may still be starting; wait and retry, or check pod logs (Step 8) |
 
 ## Full Diagnostic Dump
 
@@ -368,4 +405,9 @@ openshell doctor exec -- ls -la /opt/openshell/bin/openshell-sandbox
 
 echo "=== DNS Configuration ==="
 openshell doctor exec -- cat /etc/rancher/k3s/resolv.conf
+
+# GPU gateways only
+echo "=== GPU Device Plugin ==="
+openshell doctor exec -- kubectl get daemonset -n nvidia-device-plugin
+openshell doctor exec -- nvidia-ctk cdi list
 ```

From d5c1b9cce1724f9ef7b2ad86f341558b49c889ff Mon Sep 17 00:00:00 2001
From: Evan Lezar
Date: Fri, 20 Mar 2026 23:36:07 +0100
Subject: [PATCH 3/4] feat(gpu): set deviceIDStrategy=index in device plugin Helm values

Using index-based device IDs improves compatibility across platforms,
including Jetson/Tegra-based and WSL2-based systems.

Signed-off-by: Evan Lezar
---
 deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml
index 4ad6512a..1cb0ca70 100644
--- a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml
+++ b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml
@@ -33,6 +33,7 @@ spec:
   valuesContent: |-
     runtimeClassName: nvidia
     deviceListStrategy: cdi-cri
+    deviceIDStrategy: index
     cdi:
       nvidiaHookPath: /usr/bin/nvidia-cdi-hook
       nvidiaDriverRoot: "/"

From 9555515abb3fb4fedac663c36469a45509b14186 Mon Sep 17 00:00:00 2001
From: Evan Lezar
Date: Wed, 25 Mar 2026 15:24:31 +0100
Subject: [PATCH 4/4] feat(gateway): Reduce nvidia-container-toolkit installation

For newer NVIDIA Container Toolkit versions, the components installed
through the nvidia-container-toolkit, libnvidia-container-tools, and
libnvidia-container1 packages are considered legacy.

In CDI mode -- or when native CDI is used -- only the
nvidia-container-toolkit-base package is required. Its notable
components are:

* nvidia-ctk - The general-purpose NVIDIA Container Toolkit CLI. It
  includes functionality such as nvidia-ctk cdi generate to generate
  CDI specifications and nvidia-ctk cdi list to show available CDI
  devices.
* nvidia-cdi-hook - Implements container lifecycle hooks that ensure a
  container is set up correctly for GPU access after device nodes and
  driver files have been injected using CDI. This CLI is also available
  as the `nvidia-ctk hook` subcommand.
* nvidia-container-runtime - A wrapper around runc that adds GPU support
  in environments where direct CDI device requests are not possible.
  This includes k3s, where the nvidia RuntimeClass is added automatically
  if the nvidia-container-runtime is detected and is used to inject the
  device nodes and libraries needed by the k8s-device-plugin containers.

This change also renames the Docker build stage from nvidia-toolkit to
nvidia-container-toolkit for clarity.

Signed-off-by: Evan Lezar
---
 deploy/docker/Dockerfile.images | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images
index 9cc50085..6bda277b 100644
--- a/deploy/docker/Dockerfile.images
+++ b/deploy/docker/Dockerfile.images
@@ -201,7 +201,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certifi
     chmod +x /tmp/helm && \
     rm -rf /var/lib/apt/lists/*
 
-FROM ubuntu:24.04 AS nvidia-toolkit
+FROM ubuntu:24.04 AS nvidia-container-toolkit
 ARG NVIDIA_CONTAINER_TOOLKIT_VERSION
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -213,10 +213,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list && \
     apt-get update && \
     apt-get install -y --no-install-recommends \
-        "nvidia-container-toolkit=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \
-        "nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \
-        "libnvidia-container-tools=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" \
-        "libnvidia-container1=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" && \
+        "nvidia-container-toolkit-base=${NVIDIA_CONTAINER_TOOLKIT_VERSION}" && \
     rm -rf /var/lib/apt/lists/*
 
 # ---------------------------------------------------------------------------
@@ -240,13 +237,10 @@ COPY --from=k3s /usr/share/zoneinfo/ /usr/share/zoneinfo/
 ENV PATH="/var/lib/rancher/k3s/data/cni:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/bin/aux" \
     CRI_CONFIG_FILE="/var/lib/rancher/k3s/agent/etc/crictl.yaml"
 
-COPY --from=nvidia-toolkit /usr/bin/nvidia-cdi-hook /usr/bin/
-COPY --from=nvidia-toolkit /usr/bin/nvidia-container-runtime /usr/bin/
-COPY --from=nvidia-toolkit /usr/bin/nvidia-container-runtime-hook /usr/bin/
-COPY --from=nvidia-toolkit /usr/bin/nvidia-container-cli /usr/bin/
-COPY --from=nvidia-toolkit /usr/bin/nvidia-ctk /usr/bin/
-COPY --from=nvidia-toolkit /etc/nvidia-container-runtime /etc/nvidia-container-runtime
-COPY --from=nvidia-toolkit /usr/lib/*-linux-gnu/libnvidia-container*.so* /usr/lib/
+COPY --from=nvidia-container-toolkit /usr/bin/nvidia-cdi-hook /usr/bin/
+COPY --from=nvidia-container-toolkit /usr/bin/nvidia-container-runtime /usr/bin/
+COPY --from=nvidia-container-toolkit /usr/bin/nvidia-ctk /usr/bin/
+COPY --from=nvidia-container-toolkit /etc/nvidia-container-runtime /etc/nvidia-container-runtime
 
 COPY --from=supervisor-builder /build/out/openshell-sandbox /opt/openshell/bin/openshell-sandbox
 RUN mkdir -p /var/lib/rancher/k3s/server/manifests \