From 575e6da1c8e4c76a342567d66bae79f2e1cfbed0 Mon Sep 17 00:00:00 2001 From: Nathan Hensley Date: Fri, 24 Apr 2026 16:01:42 -0700 Subject: [PATCH] chore(recipes): drop NCCL perf checks from gb200-eks-inference overlay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the validation.performance block (nccl-all-reduce-bw-net and nccl-all-reduce-bw-nvls) from the gb200-eks-inference overlay. Inference deployments on this platform don't need the NCCL fabric health gate by default — single-node serving skips these checks anyway, and multi-node serving can opt in via its own overlay. The inheriting gb200-eks-ubuntu-inference overlay picks up the removal automatically. --- recipes/overlays/gb200-eks-inference.yaml | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/recipes/overlays/gb200-eks-inference.yaml b/recipes/overlays/gb200-eks-inference.yaml index a6d9dac0d..4d2925aac 100644 --- a/recipes/overlays/gb200-eks-inference.yaml +++ b/recipes/overlays/gb200-eks-inference.yaml @@ -72,20 +72,3 @@ spec: intent: inference dependencyRefs: - nodewright-operator - - # NCCL fabric health checks. NET exercises EFA (inter-node), NVLS exercises - # MNNVL (intra-NVL72). Both matter for multi-node inference that spans the - # fabric (tensor-parallel serving, MoE expert parallelism); single-node - # deployments hit the WorkerCount < 2 skip path gracefully. Thresholds sized - # for a 2-node GB200 pair — will be raised once production NVL72 data is - # available. - validation: - performance: - checks: - - nccl-all-reduce-bw-net - - nccl-all-reduce-bw-nvls - constraints: - - name: nccl-all-reduce-bw-net - value: ">= 40" - - name: nccl-all-reduce-bw-nvls - value: ">= 500"