diff --git a/README.md b/README.md index 53bcd51..542223d 100644 --- a/README.md +++ b/README.md @@ -231,13 +231,13 @@ NicNodePolicy (await each), all remaining CRs in one batch (controllers reconcile concurrently), then verify every manifest reached a terminal state. Example workload manifests (`*example*`) are **not** applied by `l8k deploy` — they're fixtures consumed by `l8k validate --connectivity` or -`l8k deploy --verify` for the data-plane phase. It auto-prefers +`l8k deploy --test-connectivity` for the data-plane phase. It auto-prefers `/network-operator/` (the layout `l8k generate` produces) and falls back to `` itself. `--dry-run` does a server-side dry run. `--deploy-timeout` caps the whole apply+reconcile phase end-to-end (e.g. `--deploy-timeout 90m`); without it, deploy polls indefinitely — right for SR-IOV on large clusters -where reconciliation can take an hour. `--verify` chains the connectivity -matrix straight after a successful apply. +where reconciliation can take an hour. `--test-connectivity` chains the +connectivity matrix straight after a successful apply. Verify the deployment end-to-end: @@ -262,7 +262,7 @@ a per-pair cross-rail canary. The matrix is on by default (`--connectivity=false` to skip), runs concurrent pings capped at 16, and cleans up the test DaemonSet unless `--keep` is set. -A self-contained HTML report lands at `/verify-report.html` +A self-contained HTML report lands at `/k8s-launch-kit-validation-report.html` by default (override with `--report-path`, disable with `--report-path=-`). The report has: header (l8k version, kubeconfig context, API-server version), profile, **Node groups** (per-`clusterConfig[]` entry with east-west / north- diff --git a/pkg/cmd/deploy.go b/pkg/cmd/deploy.go index 3a2392f..3e31eb2 100644 --- a/pkg/cmd/deploy.go +++ b/pkg/cmd/deploy.go @@ -40,10 +40,11 @@ var ( // until every manifest reaches a terminal state or the user // cancels. deployTimeout time.Duration - // deployVerify chains a `l8k validate --connectivity` run right - // after a successful deploy. Off by default; opt-in for + // deployTestConnectivity chains the data-plane connectivity + // matrix (the same one `l8k validate --connectivity` runs) + // right after a successful deploy. Off by default; opt-in for // pipelines that want end-to-end verification in one command. - deployVerify bool + deployTestConnectivity bool ) // DefaultDeploymentDir is the default directory `l8k generate` writes @@ -148,7 +149,7 @@ is used as the manifest directory.`, uiOutput.Success("Deployment completed") } - if deployVerify && !dryRunFlag { + if deployTestConnectivity && !dryRunFlag { // Pipeline the connectivity matrix straight after a // successful apply. We deliberately do NOT re-run the // version check here — deploy just landed manifests @@ -157,16 +158,15 @@ is used as the manifest directory.`, matrix, err := connectivity.RunMatrix(ctx, k8sClient, restConfig, uiOutput, connectivity.Options{ ManifestDir: manifestDir, Timeout: 0, // RunMatrix defaults to 5m - PingCount: 0, // RunMatrix defaults to 3 }) if err != nil { exitWithError(apperrors.NewClusterError( - "deploy succeeded but --verify failed", + "deploy succeeded but --test-connectivity failed", err, "Inspect the test DaemonSet via --keep + kubectl get pods", ), outputFormat) } - if matrix != nil && (matrix.Summary.Failed > 0 || matrix.Summary.ExecErrors > 0) { + if matrix != nil && matrix.Summary.Failed > 0 { exitWithError(apperrors.NewDeploymentError( fmt.Sprintf("connectivity matrix failed: %d/%d passed", matrix.Summary.Passed, matrix.Summary.TotalTests), nil, @@ -201,11 +201,11 @@ func init() { deployCmd.Flags().StringVar(&deploymentFiles, "deployment-files", DefaultDeploymentDir, "Directory containing the manifests to apply") deployCmd.Flags().BoolVar(&dryRunFlag, "dry-run", false, "Preview the deployment via server-side dry-run without persisting changes") deployCmd.Flags().DurationVar(&deployTimeout, "deploy-timeout", 0, "Maximum end-to-end wall-clock budget for the deploy phase (e.g. 45m, 2h). 0 (the default) means no deadline; the deploy polls until every manifest reaches a terminal state. Useful for matching a maintenance window when SR-IOV reconciliation on a large cluster can take an hour or more.") - deployCmd.Flags().BoolVar(&deployVerify, "verify", false, "After a successful deploy, run the connectivity matrix (apply example DaemonSet → wait Ready → ping matrix → cleanup) to verify the data plane end-to-end.") + deployCmd.Flags().BoolVar(&deployTestConnectivity, "test-connectivity", false, "After a successful deploy, run the connectivity matrix (apply example DaemonSet → wait Ready → RDMA matrix → cleanup) to verify the data plane end-to-end.") setFlagGroup(deployCmd, "kubeconfig", GroupCommon) setFlagGroup(deployCmd, "deployment-files", GroupGeneration) setFlagGroup(deployCmd, "deploy-timeout", GroupDeploy) setFlagGroup(deployCmd, "dry-run", GroupDeploy) - setFlagGroup(deployCmd, "verify", GroupDeploy) + setFlagGroup(deployCmd, "test-connectivity", GroupDeploy) } diff --git a/pkg/cmd/validate.go b/pkg/cmd/validate.go index 9d69675..fb78647 100644 --- a/pkg/cmd/validate.go +++ b/pkg/cmd/validate.go @@ -41,26 +41,26 @@ import ( "github.com/nvidia/k8s-launch-kit/pkg/networkoperatorplugin" "github.com/nvidia/k8s-launch-kit/pkg/networkoperatorplugin/connectivity" "github.com/nvidia/k8s-launch-kit/pkg/networkoperatorplugin/crstate" + "github.com/nvidia/k8s-launch-kit/pkg/presetmatch" + "github.com/nvidia/k8s-launch-kit/pkg/presets" "github.com/nvidia/k8s-launch-kit/pkg/ui" ) // Phase 2 connectivity-test flags. `--connectivity` defaults to ON — // every `l8k validate` verifies the data plane (apply example DS, -// wait Ready, ping matrix, cleanup) unless the caller passes +// wait Ready, RDMA matrix, cleanup) unless the caller passes // `--connectivity=false`. The other flags tune matrix behaviour -// (--ping-count, --connectivity-timeout, --keep) or extend the -// validate semantics (--wait blocks until in-progress manifests reach -// a terminal state). +// (--connectivity-timeout, --keep) or extend the validate semantics +// (--wait blocks until in-progress manifests reach a terminal state). // -// Phase 3 adds --report-path: emit an HTML verify-report alongside +// Phase 3 adds --report-path: emit an HTML validation report alongside // the text/JSON output. Empty default means "auto-place at -// /verify-report.html"; the literal "-" disables -// report writing. +// /k8s-launch-kit-validation-report.html"; the +// literal "-" disables report writing. var ( validateConnectivity bool validateKeep bool validateConnectivityTimeout time.Duration - validatePingCount int validateWait time.Duration validateReportPath string ) @@ -130,6 +130,7 @@ connectivity-matrix failure.`, matrix *connectivity.MatrixResult warnings []string presetDeviations []groupDeviationReport + presetResults []presetmatch.Result reportClient ctrlclient.Client reportRestConfig *rest.Config reportManifestDir string @@ -145,10 +146,19 @@ connectivity-matrix failure.`, if err != nil { warnings = append(warnings, err.Error()) } + // Compute the overall verdict from whatever inputs we + // have at this exit point — verdict may not have been + // computed yet on an early-error path, so build a + // pessimistic FAIL with the error as the reason. + overall := connectivity.OverallVerdict{Pass: false} + if err != nil { + overall.Reasons = append(overall.Reasons, err.Error()) + } + emitVerdictBanner(overall, outputFormat) writeHTMLReportIfWanted(context.Background(), reportClient, reportRestConfig, reportManifestDir, deploymentFiles, operatorNamespace, versionCheck, componentCheck, results, &matrix, &warnings, - userConfigPath(), outputFormat) + presetResults, overall, userConfigPath(), outputFormat) exitWithError(err, outputFormat) } @@ -192,6 +202,14 @@ connectivity-matrix failure.`, Deviations: g.PresetDeviation, }) } + // Re-run preset matching at validate-time. This + // catches drift between the cluster-config.yaml's + // stored hardware view and the certified preset + // even when discover wasn't re-run. Results stay + // informational — a deviation doesn't fail + // validate (matches the historical behaviour of + // presetDeviation), so the exit code is unchanged. + presetResults = presetmatch.MatchAll(cfg) } else if err != nil { log.Log.V(1).Info("user-config not loaded; version check will be skipped", "path", path, "error", err.Error()) @@ -259,6 +277,7 @@ connectivity-matrix failure.`, verdict := emitValidationReport(versionCheck, results, presetDeviations, outputFormat) emitComponentVersionReport(componentCheck, outputFormat) + emitPresetMatchReport(presetResults, outputFormat) warnings = append(warnings, collectVerdictWarnings(verdict)...) if componentCheck != nil && !componentCheck.Skipped && !componentCheck.AllMatch { warnings = append(warnings, "Component versions in NicClusterPolicy / NicNodePolicy diverge from the selectedRelease catalog — see the report's components section.") @@ -269,18 +288,23 @@ connectivity-matrix failure.`, // connectivity failure) since Go's defer doesn't run on // os.Exit and exitWithError does os.Exit. emitReport := func() { + overall := computeOverallVerdict(verdict, componentCheck, matrix, presetResults) + emitVerdictBanner(overall, outputFormat) writeHTMLReportIfWanted(ctx, k8sClient, restConfig, manifestDir, deploymentFiles, operatorNamespace, versionCheck, componentCheck, results, &matrix, &warnings, - userConfigPath(), outputFormat) + presetResults, overall, userConfigPath(), outputFormat) } - // `--connectivity` runs the data-plane ping matrix only when - // every CR is reconciled — otherwise we'd just produce noise. + // `--connectivity` runs the data-plane matrix when no manifest + // is broken (error or missing). Version mismatches (Helm + // release or per-component) are *not* fatal here — the cluster + // is still up, so the connectivity tests are still meaningful; + // the final verdict picks up the mismatch as a fail reason. // In-progress (without errors) prints a warning and exits 0 // so CI/operators can re-run later. componentMismatch := componentCheck != nil && !componentCheck.Skipped && !componentCheck.AllMatch switch { - case verdict.HasError || verdict.HasMissing || !verdict.VersionOK || componentMismatch: + case verdict.HasError || verdict.HasMissing: emitReport() os.Exit(apperrors.ExitDeployment) case verdict.HasInProgress: @@ -292,6 +316,9 @@ connectivity-matrix failure.`, } warnings = append(warnings, "Connectivity matrix skipped — cluster has in-progress manifests.") emitReport() + if !verdict.VersionOK || componentMismatch { + os.Exit(apperrors.ExitDeployment) + } return } @@ -301,7 +328,6 @@ connectivity-matrix failure.`, m, err := connectivity.RunMatrix(ctxWithUI, k8sClient, restConfig, uiOutput, connectivity.Options{ ManifestDir: manifestDir, Timeout: validateConnectivityTimeout, - PingCount: validatePingCount, Keep: validateKeep, }) matrix = m @@ -320,16 +346,187 @@ connectivity-matrix failure.`, if matrix != nil && matrix.Skipped != nil { warnings = append(warnings, "Connectivity matrix skipped: "+matrix.Skipped.Reason) } - if matrix != nil && (matrix.Summary.Failed > 0 || matrix.Summary.ExecErrors > 0) { - emitReport() - os.Exit(apperrors.ExitDeployment) - } } emitReport() + // Final exit code unifies every gating signal: matrix failures, + // version/component mismatches, AND preset deviations all fail + // the verdict, but the matrix still gets a chance to run when + // the only problem is a stale Helm release / catalog mismatch + // / hardware drift from the catalog preset. + matrixFailed := matrix != nil && matrix.Summary.Failed > 0 + if matrixFailed || !verdict.VersionOK || componentMismatch || hasPresetDeviation(presetResults) { + os.Exit(apperrors.ExitDeployment) + } }, } +// hasPresetDeviation reports whether any group deviated from its +// matched preset. Drives the non-zero exit code in addition to the +// PASS/FAIL banner. +func hasPresetDeviation(results []presetmatch.Result) bool { + for _, r := range results { + if r.Status == presetmatch.StatusDeviation { + return true + } + } + return false +} + +// computeOverallVerdict folds every input from the validate run into +// a single pass/fail outcome — the same logic used to decide the +// process exit code, expressed as structured Reasons / Notes so the +// banner can list what failed (and what was merely informational). +func computeOverallVerdict( + verdict validationVerdict, + componentCheck *networkoperatorplugin.ComponentVersionCheck, + matrix *connectivity.MatrixResult, + presetResults []presetmatch.Result, +) connectivity.OverallVerdict { + out := connectivity.OverallVerdict{Pass: true} + if !verdict.VersionOK { + out.Pass = false + out.Reasons = append(out.Reasons, "Network Operator Helm release version does not match the selectedRelease in cluster-config.yaml") + } + if verdict.HasMissing { + out.Pass = false + out.Reasons = append(out.Reasons, fmt.Sprintf("%d manifest(s) not found in the cluster — `l8k deploy` not run or partial", verdict.MissingCount)) + } + if verdict.HasError { + out.Pass = false + out.Reasons = append(out.Reasons, fmt.Sprintf("%d manifest(s) reported an error state", verdict.ErrorCount)) + } + if componentCheck != nil && !componentCheck.Skipped && !componentCheck.AllMatch { + mismatches := 0 + for _, c := range componentCheck.Components { + if !c.Match { + mismatches++ + } + } + out.Pass = false + out.Reasons = append(out.Reasons, fmt.Sprintf("%d component version(s) in NicClusterPolicy/NicNodePolicy diverge from the selectedRelease catalog", mismatches)) + } + if matrix != nil { + if matrix.Summary.Failed > 0 { + out.Pass = false + out.Reasons = append(out.Reasons, fmt.Sprintf("%d RDMA test(s) failed in the connectivity matrix", matrix.Summary.Failed)) + } + if matrix.Skipped != nil { + out.Notes = append(out.Notes, "Connectivity matrix skipped: "+matrix.Skipped.Reason) + } + } + if verdict.HasInProgress { + out.Notes = append(out.Notes, fmt.Sprintf("%d manifest(s) still reconciling — re-run later or use --wait to block (does not gate the verdict)", verdict.InProgressCount)) + } + // Platform topology mismatches fail the verdict but do NOT block + // the other stages (same pattern as version mismatch): when the + // discovered hardware drifts from the certified topology for the + // server type, it's a real validation failure, but the data + // plane is still testable. The HTML report's Node groups section + // shows the in-place diff per PF row — the banner just names the + // affected server type so operators know where to look. + for _, r := range presetResults { + if r.Status != presetmatch.StatusDeviation { + continue + } + out.Pass = false + out.Reasons = append(out.Reasons, + fmt.Sprintf("The detected platform topology does not match the certified topology for %s server type (see Node groups section for the per-device diff)", + platformLabel(r))) + } + return out +} + +// platformLabel renders the server-type identifier used in user +// messages as `--`, dropping any +// empty segments. The manufacturer is the leading segment when the +// matched preset's topology.yaml declares it (e.g. "Dell-PowerEdge- +// XE9680-H200"); on a not-found match it's omitted because we have +// no manufacturer signal outside the catalog. Falls back to the +// group identifier if nothing useful is set. +func platformLabel(r presetmatch.Result) string { + parts := make([]string, 0, 3) + if r.Manufacturer != "" { + parts = append(parts, r.Manufacturer) + } + if r.MachineType != "" { + parts = append(parts, r.MachineType) + } + if r.GPUType != "" { + parts = append(parts, r.GPUType) + } + if len(parts) == 0 { + return r.Group + } + return strings.Join(parts, "-") +} + + +// emitVerdictBanner prints the PASS/FAIL banner at the top of text +// output (above the per-check details that follow). JSON mode skips +// this — the structured Verdict field travels with the report data. +func emitVerdictBanner(v connectivity.OverallVerdict, format string) { + if format == "json" { + return + } + if v.Pass { + fmt.Println("\n══════════════════════════════════════════════════════════") + fmt.Println(" ✓ VALIDATION PASSED") + fmt.Println("══════════════════════════════════════════════════════════") + } else { + fmt.Println("\n══════════════════════════════════════════════════════════") + fmt.Println(" ✗ VALIDATION FAILED") + fmt.Println("══════════════════════════════════════════════════════════") + for _, r := range v.Reasons { + fmt.Printf(" • %s\n", r) + } + } + for _, n := range v.Notes { + fmt.Printf(" ⓘ %s\n", n) + } +} + +// emitPresetMatchReport prints the per-group platform-topology check +// in text mode. JSON mode skips this — the structured field rides +// downstream on the HTML/JSON envelope. +func emitPresetMatchReport(results []presetmatch.Result, format string) { + if len(results) == 0 || format == "json" { + return + } + fmt.Println() + fmt.Println("Platform topology validation") + for _, r := range results { + var status string + switch r.Status { + case presetmatch.StatusMatch: + status = "MATCH " + case presetmatch.StatusDeviation: + status = "MISMATCH " + case presetmatch.StatusNotFound: + status = "NOT CERTIFIED" + case presetmatch.StatusSkipped: + status = "SKIPPED " + default: + status = "UNKNOWN " + } + label := r.Group + if r.MachineType != "" || r.GPUType != "" { + label = fmt.Sprintf("%s (%s/%s)", r.Group, r.MachineType, r.GPUType) + } + detail := r.Reason + switch r.Status { + case presetmatch.StatusMatch: + detail = fmt.Sprintf("matches certified topology for %s server type", platformLabel(r)) + case presetmatch.StatusDeviation: + detail = fmt.Sprintf("does not match certified topology for %s server type · %s", + platformLabel(r), r.Reason) + case presetmatch.StatusNotFound: + detail = fmt.Sprintf("no certified topology available for %s server type", platformLabel(r)) + } + fmt.Printf(" [%s] %s — %s\n", status, label, detail) + } +} + // emitComponentVersionReport prints the component-version cross-check // in text mode (the JSON consumer reads the structured field added by // the HTML/JSON writers downstream). Skipped checks are surfaced as a @@ -402,7 +599,7 @@ func resolveReportPath(flag, deploymentDir string) string { case "-": return "" case "": - return filepath.Join(deploymentDir, "verify-report.html") + return filepath.Join(deploymentDir, "k8s-launch-kit-validation-report.html") default: return flag } @@ -424,6 +621,8 @@ func writeHTMLReportIfWanted( results []networkoperatorplugin.ValidationResult, matrix **connectivity.MatrixResult, warnings *[]string, + presetResults []presetmatch.Result, + overall connectivity.OverallVerdict, userCfgPath string, outputFormat string, ) { @@ -433,6 +632,7 @@ func writeHTMLReportIfWanted( } data := connectivity.ReportData{ + Verdict: overall, Cluster: connectivity.ClusterInfo{ L8kVersion: Version, GeneratedAt: time.Now().UTC(), @@ -441,10 +641,11 @@ func writeHTMLReportIfWanted( OperatorNamespace: operatorNamespace, }, Profile: loadProfileInfo(userCfgPath, manifestDir), - NodeGroups: loadNodeGroups(userCfgPath), + NodeGroups: loadNodeGroups(userCfgPath, presetResults), Nodes: listNodesForReport(ctx, c), Release: versionCheck, ComponentCheck: componentCheck, + PresetMatches: presetResults, Manifests: results, Matrix: *matrix, Warnings: *warnings, @@ -672,7 +873,17 @@ func splitYAMLDocs(s string) []string { // loadNodeGroups projects every cluster-config.yaml `clusterConfig[]` // entry into a NodeGroupInfo for the report. Empty result when the // config wasn't found / parsed — the section just renders empty. -func loadNodeGroups(userCfgPath string) []connectivity.NodeGroupInfo { +// +// presetResults carries the runtime-fresh per-group preset match +// (computed by presetmatch.MatchAll at validate-time). Its +// Deviations are preferred over the snapshot stored in +// `cluster-config.yaml.clusterConfig[].presetDeviation`, which only +// gets populated by `l8k discover` and goes stale as soon as the +// catalog changes or the hardware is swapped. Falling back to the +// stored snapshot when the runtime entry is absent keeps the older +// flow working (e.g. validate run without l8k discover access to +// the cluster). +func loadNodeGroups(userCfgPath string, presetResults []presetmatch.Result) []connectivity.NodeGroupInfo { if userCfgPath == "" { return nil } @@ -680,6 +891,20 @@ func loadNodeGroups(userCfgPath string) []connectivity.NodeGroupInfo { if err != nil || cfg == nil { return nil } + // Build a per-group index of fresh match results so the + // PF-table markers reflect what the banner is currently + // complaining about. The full Result is indexed (not just + // Deviations) because the marker pass also needs Preset.PFs + // to enrich "missing PCI" rows with the expected + // deviceID / rail / netdev that the certified topology + // declares for those addresses. + freshResultsByGroup := map[string]*presetmatch.Result{} + for i := range presetResults { + r := &presetResults[i] + if r.Status == presetmatch.StatusDeviation { + freshResultsByGroup[r.Group] = r + } + } out := make([]connectivity.NodeGroupInfo, 0, len(cfg.ClusterConfig)) for _, g := range cfg.ClusterConfig { ng := connectivity.NodeGroupInfo{ @@ -696,7 +921,19 @@ func loadNodeGroups(userCfgPath string) []connectivity.NodeGroupInfo { ng.RdmaCapable = g.Capabilities.Nodes.Rdma ng.IbCapable = g.Capabilities.Nodes.Ib } - for _, d := range g.PresetDeviation { + // Pick the runtime-fresh result when available, falling back + // to whatever `l8k discover` stamped into cluster-config. + // The runtime path also carries the matched Preset (used to + // enrich missing-PCI rows below); the fallback only has + // stored deviations so missing rows there can't be enriched. + freshResult := freshResultsByGroup[g.Identifier] + var deviations []config.PresetDeviationEntry + if freshResult != nil { + deviations = freshResult.Deviations + } else { + deviations = g.PresetDeviation + } + for _, d := range deviations { ng.PresetDeviations = append(ng.PresetDeviations, connectivity.PresetDeviation{ Field: d.Field, Expected: d.Expected, Got: d.Got, Detail: d.Detail, }) @@ -731,11 +968,149 @@ func loadNodeGroups(userCfgPath string) []connectivity.NodeGroupInfo { ng.EastWestPFs = append(ng.EastWestPFs, row) } } + applyPlatformTopologyMarkers(&ng, deviations, freshResult) out = append(out, ng) } return out } +// applyPlatformTopologyMarkers populates the report's two paired +// PF views (Actual = discovered hardware, Expected = certified +// topology) and flags mismatched rows in each. +// +// - Actual row is Mismatched when its (PCI, deviceID) pair has no +// matching entry in the preset. +// - Expected row is Mismatched when its (PCI, deviceID) pair has +// no matching entry on the cluster. +// - PFCountMismatch is set from the pfCount deviation. +// +// matchResult may be nil when we're falling back to stored +// cluster-config deviations (no preset object available); in that +// case only Mismatched markers based on raw deviations are applied +// and the Expected tables stay empty. +func applyPlatformTopologyMarkers(ng *connectivity.NodeGroupInfo, deviations []config.PresetDeviationEntry, matchResult *presetmatch.Result) { + for _, d := range deviations { + if d.Field == "pfCount" { + var got, want int + _, _ = fmt.Sscanf(d.Got, "%d", &got) + _, _ = fmt.Sscanf(d.Expected, "%d", &want) + ng.PFCountMismatch = &connectivity.PFCountMismatch{Expected: want, Got: got} + } + } + + if matchResult == nil || matchResult.Preset == nil { + // No live preset to compare against — fall back to flagging + // Actual rows whose PCI appears in any pciAddress/deviceID + // deviation from the stored snapshot. Expected tables stay + // empty. + flagged := map[string]bool{} + for _, d := range deviations { + switch d.Field { + case "deviceID": + if _, pci := splitDeviceIDAtPCI(d.Got); pci != "" { + flagged[pci] = true + } + case "pciAddress": + if d.Got != "" { + flagged[d.Got] = true + } + } + } + markActualMismatched(ng.EastWestPFs, flagged) + markActualMismatched(ng.NorthSouthPFs, flagged) + return + } + + // Index actuals by PCI so the Expected pass can quickly check + // whether a preset PF has a matching cluster PF (same PCI + + // same deviceID). + type actualEntry struct{ deviceID string } + actualByPCI := map[string]actualEntry{} + for _, pf := range ng.EastWestPFs { + actualByPCI[pf.PciAddress] = actualEntry{deviceID: pf.DeviceID} + } + for _, pf := range ng.NorthSouthPFs { + actualByPCI[pf.PciAddress] = actualEntry{deviceID: pf.DeviceID} + } + // Index preset by PCI for the Actual pass. + presetByPCI := map[string]presets.PresetPF{} + for _, pf := range matchResult.Preset.PFs { + presetByPCI[pf.PciAddress] = pf + } + + // Actual: flag rows whose PCI isn't in the preset, or whose + // deviceID at that PCI doesn't match the preset. + for i := range ng.EastWestPFs { + pf := &ng.EastWestPFs[i] + expected, ok := presetByPCI[pf.PciAddress] + if !ok || expected.DeviceID != pf.DeviceID { + pf.Mismatched = true + } + } + for i := range ng.NorthSouthPFs { + pf := &ng.NorthSouthPFs[i] + expected, ok := presetByPCI[pf.PciAddress] + if !ok || expected.DeviceID != pf.DeviceID { + pf.Mismatched = true + } + } + + // Expected: project the preset's PFs into the same PFInfo + // shape, bucketed by traffic type. Flag rows whose (PCI, + // deviceID) pair isn't present on the cluster. + for _, pf := range matchResult.Preset.PFs { + row := connectivity.PFInfo{ + PciAddress: pf.PciAddress, + DeviceID: pf.DeviceID, + Rail: "—", + Traffic: pf.Traffic, + NetworkInterface: pf.NetworkInterface, + RdmaDevice: pf.RdmaDevice, + PSID: pf.PSID, + PartNumber: pf.PartNumber, + ConnectedGPU: pf.ConnectedGPU, + GPUProximity: pf.GPUProximity, + } + if pf.Rail != nil { + row.Rail = fmt.Sprintf("%d", *pf.Rail) + } + if pf.NumaNode != nil { + row.NumaNode = fmt.Sprintf("%d", *pf.NumaNode) + } + if actual, ok := actualByPCI[pf.PciAddress]; !ok || actual.deviceID != pf.DeviceID { + row.Mismatched = true + } + switch pf.Traffic { + case "north-south": + ng.ExpectedNorthSouthPFs = append(ng.ExpectedNorthSouthPFs, row) + default: + ng.ExpectedEastWestPFs = append(ng.ExpectedEastWestPFs, row) + } + } +} + +// markActualMismatched flags any Actual PFInfo whose PCI is in the +// flagged set. Used as the fallback path when we don't have a live +// preset object to drive the full Actual/Expected pairing. +func markActualMismatched(pfs []connectivity.PFInfo, flagged map[string]bool) { + for i := range pfs { + if flagged[pfs[i].PciAddress] { + pfs[i].Mismatched = true + } + } +} + +// splitDeviceIDAtPCI splits a "@" composite (the +// format ValidatePreset emits for deviceID deviations) back into its +// two parts. Returns ("", "") on malformed input. +func splitDeviceIDAtPCI(s string) (deviceID, pci string) { + at := strings.IndexByte(s, '@') + if at < 0 { + return "", "" + } + return s[:at], s[at+1:] +} + // listNodesForReport pulls the cluster's node list and projects each // into a NodeInfo for the report. Reads l8k's machine/gpu labels // (config.MachineLabelKey / config.GPULabelKey) plus a best-effort @@ -1007,13 +1382,13 @@ func emitValidationReport(vc *networkoperatorplugin.VersionCheck, results []netw if len(presetDeviations) > 0 { fmt.Println() - fmt.Println("Preset deviations (cluster differs from matched preset)") + fmt.Println("Platform topology mismatches (detected hardware differs from the certified topology)") for _, gd := range presetDeviations { label := gd.Group if gd.MachineType != "" || gd.GPUType != "" { label = fmt.Sprintf("%s (%s/%s)", gd.Group, gd.MachineType, gd.GPUType) } - fmt.Printf(" %s — %d deviation(s):\n", label, len(gd.Deviations)) + fmt.Printf(" %s — %d mismatch(es):\n", label, len(gd.Deviations)) for _, d := range gd.Deviations { expected := d.Expected if expected == "" { @@ -1029,7 +1404,7 @@ func emitValidationReport(vc *networkoperatorplugin.VersionCheck, results []netw } fmt.Println() - fmt.Printf("Summary: %d/%d ready, %d in-progress, %d error, %d missing; version: %s; preset deviations: %d group(s)\n", + fmt.Printf("Summary: %d/%d ready, %d in-progress, %d error, %d missing; version: %s; topology mismatches: %d group(s)\n", verdict.SuccessCount, verdict.Total, verdict.InProgressCount, verdict.ErrorCount, verdict.MissingCount, versionStatusText(vc), len(presetDeviations)) @@ -1081,12 +1456,11 @@ func init() { // `l8k validate` exercises the data plane unless explicitly // disabled. Pass `--connectivity=false` to limit validate to // the static manifest-presence + Helm release-version checks. - validateCmd.Flags().BoolVar(&validateConnectivity, "connectivity", true, "Run a ping matrix between pods of the example DaemonSet to verify the data plane. Default true. Pass --connectivity=false to skip when only the static manifest checks are wanted.") + validateCmd.Flags().BoolVar(&validateConnectivity, "connectivity", true, "Run an RDMA matrix (rping + ib_write_bw) between pods of the example DaemonSet to verify the data plane. Default true. Pass --connectivity=false to skip when only the static manifest checks are wanted.") validateCmd.Flags().BoolVar(&validateKeep, "keep", false, "Leave the example DaemonSet running after --connectivity completes (useful for debugging).") - validateCmd.Flags().DurationVar(&validateConnectivityTimeout, "connectivity-timeout", 5*time.Minute, "Wall-clock budget for the connectivity matrix (DaemonSet rollout + ping execs).") - validateCmd.Flags().IntVar(&validatePingCount, "ping-count", 3, "Number of ICMP echoes per src→dst pair when running --connectivity (ping -c N).") + validateCmd.Flags().DurationVar(&validateConnectivityTimeout, "connectivity-timeout", 5*time.Minute, "Wall-clock budget for the connectivity matrix (DaemonSet rollout + rping + ib_write_bw execs).") validateCmd.Flags().DurationVar(&validateWait, "wait", 0, "Block validate up to this duration waiting for in-progress manifests to reach a terminal state. 0 (default) returns immediately on the first snapshot.") - validateCmd.Flags().StringVar(&validateReportPath, "report-path", "", "Write an HTML verify-report to this path. When empty (default), writes to /verify-report.html. Pass '-' to skip the report file entirely.") + validateCmd.Flags().StringVar(&validateReportPath, "report-path", "", "Write the HTML validation report to this path. When empty (default), writes to /k8s-launch-kit-validation-report.html. Pass '-' to skip the report file entirely.") setFlagGroup(validateCmd, "kubeconfig", GroupCommon) setFlagGroup(validateCmd, "user-config", GroupCommon) diff --git a/pkg/networkoperatorplugin/connectivity/connectivity.go b/pkg/networkoperatorplugin/connectivity/connectivity.go index 8a1339c..39931b8 100644 --- a/pkg/networkoperatorplugin/connectivity/connectivity.go +++ b/pkg/networkoperatorplugin/connectivity/connectivity.go @@ -21,7 +21,6 @@ import ( "fmt" "sort" "strings" - "sync" "time" "github.com/nvidia/k8s-launch-kit/pkg/ui" @@ -39,18 +38,11 @@ type Options struct { // validated). ManifestDir string // Timeout caps the whole connectivity phase: apply + DS rollout - // + ping execs + cleanup. 0 falls back to 5 minutes. + // + RDMA test execs + cleanup. 0 falls back to 5 minutes. Timeout time.Duration - // PingCount is the number of ICMP echoes per src→dst pair - // (`ping -c N`). 0 falls back to 3. - PingCount int // Keep leaves the test DaemonSets running after the matrix // completes, for follow-up debugging. Default is to delete. Keep bool - // MaxConcurrentPings caps the number of in-flight `ping` execs. - // 0 falls back to 16 — enough to keep a 4-rail × 16-pod matrix - // busy without overwhelming the apiserver. - MaxConcurrentPings int } // MatrixResult is the aggregate output of one connectivity run. It's @@ -59,9 +51,8 @@ type MatrixResult struct { // DaemonSets is one entry per applied example DS — typically // one per merged group. DaemonSets []DaemonSetReport - // PingResults is the flat list of every executed ping test. - // SameRail and CrossRail are derived views (filtered by Kind) - // for the report layer. + // PingResults is the flat list of every executed RDMA test + // (rping + ib_write_bw, same-rail + cross-rail). PingResults []PingResult // Skipped is non-nil when fewer than 2 schedulable test pods // were available across all DaemonSets. The matrix is treated @@ -72,11 +63,12 @@ type MatrixResult struct { } // MatrixSummary is the rolled-up counts validate prints under "summary". +// Passed counts tests that exited cleanly; Failed counts tests that +// ran but exited non-zero or produced unparseable output. type MatrixSummary struct { TotalTests int Passed int Failed int - ExecErrors int } // DaemonSetReport captures one DS's rollout state and the test pods it @@ -94,10 +86,14 @@ type DaemonSetReport struct { // 3. Waits for the DS rollout (desired > 0 AND ready == desired). // 4. Lists Running+Ready pods, parses each pod's multus annotation, // filters to secondary networks, picks the first IPv4 per rail. -// 5. Builds a test plan via Plan() — same-rail across pods + per-pod -// cross-rail canary; soft-skip if <2 schedulable pods. -// 6. Runs every ping in parallel up to opts.MaxConcurrentPings. -// 7. Deletes the DaemonSets unless opts.Keep. +// 5. Discovers the RDMA device backing each multus interface per pod. +// 6. Builds a test plan via Plan() — same-rail rping + ib_write_bw +// across pods plus per-pair cross-rail canaries; soft-skip if <2 +// schedulable pods. +// 7. Runs rping then ib_write_bw stages sequentially per the +// "spawn fresh server per test" lifecycle (concurrent runs would +// fight over the listener port). +// 8. Deletes the DaemonSets unless opts.Keep. // // All UI output flows through `uiOutput` (caller passes // ui.FromContext(ctx)). Logs go to controller-runtime's logr. @@ -105,12 +101,6 @@ func RunMatrix(ctx context.Context, c client.Client, restConfig *rest.Config, ui if opts.Timeout <= 0 { opts.Timeout = 5 * time.Minute } - if opts.PingCount <= 0 { - opts.PingCount = 3 - } - if opts.MaxConcurrentPings <= 0 { - opts.MaxConcurrentPings = 16 - } ctx, cancel := context.WithTimeout(ctx, opts.Timeout) defer cancel() @@ -187,10 +177,14 @@ func RunMatrix(ctx context.Context, c client.Client, restConfig *rest.Config, ui } } - // Parse multus annotations and build the TestPod list. + // Parse multus annotations and build the TestPod list. Each + // pod's RDMA-device-by-rail map is filled in from a single + // in-pod shell exec that reads + // /sys/class/net//device/infiniband/ per multus iface, + // so the rping + ib_write_bw stages can pass `-d `. testPods := make([]TestPod, 0, len(allPods)) for _, p := range allPods { - tp, err := buildTestPod(p.pod) + tp, ifaceByRail, err := buildTestPod(p.pod) if err != nil { uiOutput.Warning("Pod %s/%s: %v — excluded from matrix", p.pod.Namespace, p.pod.Name, err) log.Log.V(1).Info("buildTestPod failed", "pod", p.pod.Name, "error", err.Error()) @@ -200,6 +194,10 @@ func RunMatrix(ctx context.Context, c client.Client, restConfig *rest.Config, ui uiOutput.Warning("Pod %s/%s has no secondary network IPs — excluded from matrix", p.pod.Namespace, p.pod.Name) continue } + tp.RDMADevsByRail = DiscoverRDMADevices(ctx, restConfig, p.pod.Namespace, p.pod.Name, p.ref.Container, ifaceByRail) + log.Log.V(1).Info("RDMA device discovery", + "pod", p.pod.Name, "rails", tp.RailOrder, + "rdmaDevsByRail", tp.RDMADevsByRail) testPods = append(testPods, tp) } @@ -210,56 +208,74 @@ func RunMatrix(ctx context.Context, c client.Client, restConfig *rest.Config, ui return result, nil } - uiOutput.Info("Plan: %d same-rail tests, %d cross-rail canary tests", len(plan.SameRail), len(plan.CrossRail)) + totalSameRail := len(plan.RDMASameRail) + totalCrossRail := len(plan.RDMACrossRail) + uiOutput.Info("Plan: %d same-rail rping + %d cross-rail rping; %d same-rail ib_write_bw + %d cross-rail ib_write_bw", + totalSameRail, totalCrossRail, len(plan.RDMABwSameRail), len(plan.RDMABwCrossRail)) - // Build a pod-name → container name map so RunPing knows which - // container to exec in (test DSes have a single container today, - // but the orchestrator stays general). + // Build a pod-name → container name map so the test runners + // know which container to exec in (test DSes have a single + // container today, but the orchestrator stays general). containerByPod := map[string]string{} namespaceByPod := map[string]string{} for _, p := range allPods { - // We use the DS's resolved container name for every pod it - // owns — pods of the same DS share the template. containerByPod[p.pod.Name] = p.ref.Container namespaceByPod[p.pod.Name] = p.pod.Namespace } - tests := append([]PingTest{}, plan.SameRail...) - tests = append(tests, plan.CrossRail...) - result.PingResults = make([]PingResult, len(tests)) - - // Bounded concurrency: a semaphore of size MaxConcurrentPings. - sem := make(chan struct{}, opts.MaxConcurrentPings) - var wg sync.WaitGroup - for i := range tests { - wg.Add(1) - sem <- struct{}{} - go func(idx int, t PingTest) { - defer wg.Done() - defer func() { <-sem }() - ns := namespaceByPod[t.SrcPod] - cn := containerByPod[t.SrcPod] - if ns == "" || cn == "" { - result.PingResults[idx] = PingResult{ + // Two stages, each running sequentially per the "spawn fresh + // server per test" decision: every test starts its own server, + // runs the client, kills the server. Concurrent runs would + // fight over the listener port. + stages := []struct { + label string + tests []PingTest + run func(t PingTest) PingResult + }{ + { + label: "RDMA ping (rping)", + tests: append(append([]PingTest{}, plan.RDMASameRail...), plan.RDMACrossRail...), + run: func(t PingTest) PingResult { + return RunRPing(ctx, restConfig, namespaceByPod[t.DstPod], + t.DstPod, containerByPod[t.DstPod], // server side + t.SrcPod, containerByPod[t.SrcPod], // client side + t, 5) + }, + }, + { + label: "RDMA bandwidth (ib_write_bw)", + tests: append(append([]PingTest{}, plan.RDMABwSameRail...), plan.RDMABwCrossRail...), + run: func(t PingTest) PingResult { + return RunIbWriteBw(ctx, restConfig, namespaceByPod[t.DstPod], + t.DstPod, containerByPod[t.DstPod], + t.SrcPod, containerByPod[t.SrcPod], + t, 0) + }, + }, + } + for _, stage := range stages { + if len(stage.tests) == 0 { + continue + } + uiOutput.Info("Stage: %s — %d test(s)", stage.label, len(stage.tests)) + for _, t := range stage.tests { + if namespaceByPod[t.SrcPod] == "" || namespaceByPod[t.DstPod] == "" { + result.PingResults = append(result.PingResults, PingResult{ Test: t, - Err: fmt.Errorf("no namespace/container lookup for src pod %q", t.SrcPod), - } - return + Err: fmt.Errorf("no namespace lookup for pod pair (%s, %s)", t.SrcPod, t.DstPod), + }) + continue } - result.PingResults[idx] = RunPing(ctx, restConfig, ns, cn, t, opts.PingCount) - }(i, tests[i]) + result.PingResults = append(result.PingResults, stage.run(t)) + } } - wg.Wait() // Summary. for _, r := range result.PingResults { result.Summary.TotalTests++ - switch { - case r.OK: + if r.OK { result.Summary.Passed++ - case r.Err != nil && r.PacketLoss < 0: - result.Summary.ExecErrors++ - default: + } else { result.Summary.Failed++ } } @@ -273,7 +289,7 @@ func RunMatrix(ctx context.Context, c client.Client, restConfig *rest.Config, ui // testPodWithDS pairs a Running+Ready pod with the DaemonSetRef that // owns it so the orchestrator can later find the right container to -// `ping` from. +// exec into. type testPodWithDS struct { pod *corev1.Pod ref DaemonSetRef @@ -281,12 +297,14 @@ type testPodWithDS struct { // buildTestPod converts a corev1.Pod into a TestPod by parsing the // multus `network-status` annotation. The first IPv4 per rail is used; -// IPv6 is currently ignored. -func buildTestPod(p *corev1.Pod) (TestPod, error) { +// IPv6 is currently ignored. Also returns the rail→multus interface +// name mapping (e.g. "rail-0" → "net1") so the caller can hand it to +// DiscoverRDMADevices to populate RDMADevsByRail for the RDMA stages. +func buildTestPod(p *corev1.Pod) (TestPod, map[string]string, error) { ann := p.Annotations[MultusAnnotation] nets, err := ParseNetworkStatus(ann) if err != nil { - return TestPod{}, err + return TestPod{}, nil, err } tp := TestPod{ Name: p.Name, @@ -294,6 +312,7 @@ func buildTestPod(p *corev1.Pod) (TestPod, error) { Node: p.Spec.NodeName, IPsByRail: map[string]string{}, } + ifaceByRail := map[string]string{} for _, n := range SecondaryNetworks(nets) { ip := pickIPv4(n.IPs) if ip == "" { @@ -311,13 +330,16 @@ func buildTestPod(p *corev1.Pod) (TestPod, error) { } tp.IPsByRail[rail] = ip tp.RailOrder = append(tp.RailOrder, rail) + if n.Interface != "" { + ifaceByRail[rail] = n.Interface + } } sort.Strings(tp.RailOrder) - return tp, nil + return tp, ifaceByRail, nil } // pickIPv4 returns the first IPv4 address from the multus IP list, or -// "" if none. IPv6 entries are ignored (matrix uses ping, not ping6). +// "" if none. IPv6 entries are ignored. func pickIPv4(ips []string) string { for _, ip := range ips { if strings.Count(ip, ".") == 3 && !strings.Contains(ip, ":") { @@ -339,10 +361,10 @@ func railKey(name string) string { func emitSummary(uiOutput ui.Output, result *MatrixResult) { s := result.Summary - uiOutput.Info("Matrix complete: %d/%d passed, %d failed, %d exec error(s)", - s.Passed, s.TotalTests, s.Failed, s.ExecErrors) - if s.Failed == 0 && s.ExecErrors == 0 && s.TotalTests > 0 { - uiOutput.Success("All %d ping test(s) passed", s.TotalTests) + uiOutput.Info("Matrix complete: %d/%d passed, %d failed", + s.Passed, s.TotalTests, s.Failed) + if s.Failed == 0 && s.TotalTests > 0 { + uiOutput.Success("All %d RDMA test(s) passed", s.TotalTests) return } // Render up to 5 failures so operators see what broke without @@ -362,15 +384,25 @@ func emitSummary(uiOutput ui.Output, result *MatrixResult) { } src := failureLabel(f.Test.SrcNode, f.Test.SrcPod) dst := failureLabel(f.Test.DstNode, f.Test.DstPod) - switch { - case f.Err != nil: - uiOutput.Error("ping %s→%s (%s): %v", src, dst, f.Test.Rail, f.Err) - default: - uiOutput.Error("ping %s→%s (%s): %d%% packet loss", src, dst, f.Test.Rail, f.PacketLoss) + label := stageLabelOf(f.Test.Kind) + if f.Err != nil { + uiOutput.Error("%s %s→%s (%s): %v", label, src, dst, f.Test.Rail, f.Err) + } else { + uiOutput.Error("%s %s→%s (%s): test failed (no result row parsed or non-zero exit)", + label, src, dst, f.Test.Rail) } } } +// stageLabelOf returns the human-friendly label for a test kind used +// in failure-line prefixes ("rping", "ib_write_bw"). +func stageLabelOf(k PingTestKind) string { + if k.IsRDMABw() { + return "ib_write_bw" + } + return "rping" +} + // failureLabel mirrors axisLabel from text_report.go — duplicated here // so emitSummary doesn't have to expose the renderer's helper. func failureLabel(node, pod string) string { diff --git a/pkg/networkoperatorplugin/connectivity/matrix.go b/pkg/networkoperatorplugin/connectivity/matrix.go index 3c7840c..d522d47 100644 --- a/pkg/networkoperatorplugin/connectivity/matrix.go +++ b/pkg/networkoperatorplugin/connectivity/matrix.go @@ -34,6 +34,12 @@ type TestPod struct { // typically have one v4 per attachment). Rails that didn't yield // an IP are absent from the map. IPsByRail map[string]string + // RDMADevsByRail is "" → RDMA device name (e.g. + // "mlx5_0") for the VF attached to that secondary network. + // Populated by DiscoverRDMADevices(); rails without an + // RDMA-capable device or where the lookup failed are absent. + // Used by RunIbWriteBw / RunRPing to pass `-d `. + RDMADevsByRail map[string]string // RailOrder is the deterministic iteration order over IPsByRail // — needed because Go map iteration is randomized and the cross // -rail canary picks "rail 0" vs "rail 1" by position. @@ -41,37 +47,70 @@ type TestPod struct { } // PingTest is one src→dst test the matrix will execute. Kind tags the -// test bucket: same-rail tests verify the rail is reachable end-to-end; -// cross-rail tests are a single canary per pod pair to spot -// misconfigured routes that would otherwise be invisible if every rail -// passed independently. +// test bucket. Two RDMA test families are scheduled in stages by the +// orchestrator: +// +// - Kind = RDMAPingSameRail / RDMAPingCrossRail — `rping -c -a +// ` against an `rping -s -a ` server. Verifies the QP +// pair establishes end-to-end (RDMA-CM TCP handshake + RDMA +// write). +// - Kind = RDMABwSameRail / RDMABwCrossRail — `ib_write_bw -d +// -R --report_gbits []` against an +// `ib_write_bw -d -R --report_gbits` server. Measures +// bandwidth in Gbps. // // SrcPod/DstPod carry the actual k8s pod names — needed by the // orchestrator to issue the SPDY exec against the right pod. SrcNode/ // DstNode are the kubelet's `Spec.NodeName` for each endpoint and are // used everywhere the matrix is rendered for humans (operators // recognize hostnames; DaemonSet-generated pod suffixes are noise). +// SrcRDMADev / DstRDMADev carry the per-pod RDMA device names so the +// test runner can pass `-d ` for ib_write_bw. type PingTest struct { - Kind PingTestKind - SrcPod string - DstPod string - SrcNode string - DstNode string - Rail string // for same-rail tests; "" for cross - SrcIP string - DstIP string - SrcRail string - DstRail string + Kind PingTestKind + SrcPod string + DstPod string + SrcNode string + DstNode string + Rail string // for same-rail tests; "" for cross + SrcIP string + DstIP string + SrcRail string + DstRail string + SrcRDMADev string + DstRDMADev string } -// PingTestKind enumerates the buckets the matrix renders into. +// PingTestKind enumerates the buckets the matrix renders into. Order +// follows the orchestrator's staged-execution flow: rping first +// (QP-establishment canary), ib_write_bw second (bandwidth). type PingTestKind int const ( - PingSameRail PingTestKind = iota - PingCrossRail + RDMAPingSameRail PingTestKind = iota + RDMAPingCrossRail + RDMABwSameRail + RDMABwCrossRail ) +// IsRDMAPing reports whether the test kind is an rping (RDMA-CM QP) +// test. +func (k PingTestKind) IsRDMAPing() bool { + return k == RDMAPingSameRail || k == RDMAPingCrossRail +} + +// IsRDMABw reports whether the test kind is an ib_write_bw bandwidth +// test. +func (k PingTestKind) IsRDMABw() bool { + return k == RDMABwSameRail || k == RDMABwCrossRail +} + +// IsCrossRail reports whether the test kind is one of the cross-rail +// canary variants (regardless of family). +func (k PingTestKind) IsCrossRail() bool { + return k == RDMAPingCrossRail || k == RDMABwCrossRail +} + // MatrixSkip captures the soft-skip case where fewer than 2 schedulable // test pods were found — the matrix returns an empty TestSet plus this // reason for the operator to see. @@ -82,10 +121,17 @@ type MatrixSkip struct { // MatrixPlan is the full set of tests the orchestrator will execute. // Generated by Plan() from a sorted list of TestPods so the order is // deterministic regardless of the cluster's pod-listing order. +// +// Pairs without RDMA device names on both ends (the discovery step +// couldn't resolve the secondary network → mlx5_X mapping) are +// silently dropped: there's no `-d ` to pass and the test +// would fail with a confusing error. type MatrixPlan struct { - SameRail []PingTest - CrossRail []PingTest - Skip *MatrixSkip + RDMASameRail []PingTest + RDMACrossRail []PingTest + RDMABwSameRail []PingTest + RDMABwCrossRail []PingTest + Skip *MatrixSkip } // Plan builds the test plan from the given pods. The matrix generation @@ -93,13 +139,13 @@ type MatrixPlan struct { // fixture pod lists. // // - Same-rail: for every ordered pod pair (A, B) with A≠B, emit one -// test per rail both endpoints share. This is the data-plane -// verification proper. -// - Cross-rail canary: for every ordered pod pair (A, B), emit a -// single test pinging A.rail[0] → B.rail[1] when both pods have -// ≥2 rails. Catches routing misconfigurations that allow -// accidental rail-to-rail leakage (or, when blocked by design, -// confirms the isolation). +// rping + one ib_write_bw test per rail both endpoints share. This +// is the data-plane verification proper. +// - Cross-rail canary: for every ordered pod pair (A, B), emit one +// rping + one ib_write_bw test pinging A.rail[0] → B.rail[1] when +// both pods have ≥2 rails. Catches routing misconfigurations that +// allow accidental rail-to-rail leakage (or, when blocked by +// design, confirms the isolation). // - Soft-skip on <2 pods: no tests are emitted and Skip is set so // the caller can render "matrix skipped: only N schedulable test // pod(s)". @@ -122,25 +168,32 @@ func Plan(pods []TestPod) MatrixPlan { continue } // Same-rail tests: rails the two endpoints both - // attach to. + // attach to, where RDMA devices are known on both + // ends. for _, rail := range src.RailOrder { srcIP, ok1 := src.IPsByRail[rail] dstIP, ok2 := dst.IPsByRail[rail] if !ok1 || !ok2 || srcIP == "" || dstIP == "" { continue } - plan.SameRail = append(plan.SameRail, PingTest{ - Kind: PingSameRail, - SrcPod: src.Name, - DstPod: dst.Name, - SrcNode: src.Node, - DstNode: dst.Node, - Rail: rail, - SrcIP: srcIP, - DstIP: dstIP, - SrcRail: rail, - DstRail: rail, - }) + srcDev, hasSrcDev := src.RDMADevsByRail[rail] + dstDev, hasDstDev := dst.RDMADevsByRail[rail] + if !hasSrcDev || !hasDstDev || srcDev == "" || dstDev == "" { + continue + } + base := PingTest{ + SrcPod: src.Name, DstPod: dst.Name, + SrcNode: src.Node, DstNode: dst.Node, + Rail: rail, SrcIP: srcIP, DstIP: dstIP, + SrcRail: rail, DstRail: rail, + SrcRDMADev: srcDev, DstRDMADev: dstDev, + } + rpingT := base + rpingT.Kind = RDMAPingSameRail + plan.RDMASameRail = append(plan.RDMASameRail, rpingT) + bwT := base + bwT.Kind = RDMABwSameRail + plan.RDMABwSameRail = append(plan.RDMABwSameRail, bwT) } // Cross-rail canary: src.rail[0] → dst.rail[1] when // both pods have ≥2 rails. Skipped otherwise. @@ -148,20 +201,25 @@ func Plan(pods []TestPod) MatrixPlan { srcRail, dstRail := src.RailOrder[0], dst.RailOrder[1] srcIP, ok1 := src.IPsByRail[srcRail] dstIP, ok2 := dst.IPsByRail[dstRail] - if ok1 && ok2 && srcIP != "" && dstIP != "" { - plan.CrossRail = append(plan.CrossRail, PingTest{ - Kind: PingCrossRail, - SrcPod: src.Name, - DstPod: dst.Name, - SrcNode: src.Node, - DstNode: dst.Node, - Rail: fmt.Sprintf("%s→%s", srcRail, dstRail), - SrcIP: srcIP, - DstIP: dstIP, - SrcRail: srcRail, - DstRail: dstRail, - }) + srcDev, hasSrcDev := src.RDMADevsByRail[srcRail] + dstDev, hasDstDev := dst.RDMADevsByRail[dstRail] + if !ok1 || !ok2 || srcIP == "" || dstIP == "" || !hasSrcDev || !hasDstDev { + continue + } + base := PingTest{ + SrcPod: src.Name, DstPod: dst.Name, + SrcNode: src.Node, DstNode: dst.Node, + Rail: fmt.Sprintf("%s→%s", srcRail, dstRail), + SrcIP: srcIP, DstIP: dstIP, + SrcRail: srcRail, DstRail: dstRail, + SrcRDMADev: srcDev, DstRDMADev: dstDev, } + rpingC := base + rpingC.Kind = RDMAPingCrossRail + plan.RDMACrossRail = append(plan.RDMACrossRail, rpingC) + bwC := base + bwC.Kind = RDMABwCrossRail + plan.RDMABwCrossRail = append(plan.RDMABwCrossRail, bwC) } } } diff --git a/pkg/networkoperatorplugin/connectivity/matrix_test.go b/pkg/networkoperatorplugin/connectivity/matrix_test.go index af5366a..b4e1cc3 100644 --- a/pkg/networkoperatorplugin/connectivity/matrix_test.go +++ b/pkg/networkoperatorplugin/connectivity/matrix_test.go @@ -23,16 +23,20 @@ import ( "github.com/stretchr/testify/require" ) +// mkPod builds a TestPod fixture with synthetic IPs *and* an RDMA +// device per rail. Plan() now requires both sides of a pair to have an +// RDMA device for the rail in order to emit tests (since the matrix +// is RDMA-only), so the test fixture must populate RDMADevsByRail to +// produce non-empty plans. func mkPod(name string, rails ...string) TestPod { tp := TestPod{ - Name: name, - IPsByRail: map[string]string{}, + Name: name, + IPsByRail: map[string]string{}, + RDMADevsByRail: map[string]string{}, } - for i, rail := range rails { - // Synthesize an IP deterministic per (pod, rail) so failed - // assertions are readable: pod-A rail-0 → "10.0.." - _ = i + for _, rail := range rails { tp.IPsByRail[rail] = fakeIP(name, rail) + tp.RDMADevsByRail[rail] = fakeRDMADev(name, rail) tp.RailOrder = append(tp.RailOrder, rail) } return tp @@ -70,12 +74,19 @@ func fakeIP(pod, rail string) string { return "0.0.0.0" } +// fakeRDMADev returns a stable mlx5-style device name per (pod, rail). +// The test only needs the strings to be non-empty for Plan() to emit +// tests; the actual device names are never opened. +func fakeRDMADev(pod, rail string) string { + return "mlx5_" + pod + "_" + rail +} + func TestPlan_SoftSkipOnFewerThan2Pods(t *testing.T) { t.Run("zero pods", func(t *testing.T) { plan := Plan(nil) require.NotNil(t, plan.Skip) assert.Contains(t, plan.Skip.Reason, "0 schedulable") - assert.Empty(t, plan.SameRail) + assert.Empty(t, plan.RDMASameRail) }) t.Run("one pod", func(t *testing.T) { plan := Plan([]TestPod{mkPod("pod-a", "rail-0", "rail-1")}) @@ -91,18 +102,28 @@ func TestPlan_TwoPodsOneRail(t *testing.T) { }) require.Nil(t, plan.Skip) - // Same-rail: 2 pods × 1 ordered pair each direction × 1 rail = 2 tests. - assert.Len(t, plan.SameRail, 2) + // Same-rail: 2 pods × 1 ordered pair each direction × 1 rail = 2 + // rping tests and 2 ib_write_bw tests. + assert.Len(t, plan.RDMASameRail, 2) + assert.Len(t, plan.RDMABwSameRail, 2) // Cross-rail canary requires ≥2 rails per pod — skipped here. - assert.Empty(t, plan.CrossRail) + assert.Empty(t, plan.RDMACrossRail) + assert.Empty(t, plan.RDMABwCrossRail) - // Check direction coverage: both A→B and B→A appear. + // Check direction coverage: both A→B and B→A appear in the rping + // slice. dirs := map[string]bool{} - for _, t := range plan.SameRail { + for _, t := range plan.RDMASameRail { dirs[t.SrcPod+"→"+t.DstPod] = true } assert.True(t, dirs["pod-a→pod-b"]) assert.True(t, dirs["pod-b→pod-a"]) + // Every emitted test carries the per-side RDMA device name so + // ib_write_bw can pass `-d `. + for _, tt := range plan.RDMABwSameRail { + assert.NotEmpty(t, tt.SrcRDMADev, "%+v", tt) + assert.NotEmpty(t, tt.DstRDMADev, "%+v", tt) + } } func TestPlan_TwoPodsTwoRails_IncludesCrossRailCanary(t *testing.T) { @@ -112,15 +133,18 @@ func TestPlan_TwoPodsTwoRails_IncludesCrossRailCanary(t *testing.T) { }) require.Nil(t, plan.Skip) - // Same-rail: 2 pods × 1 ordered pair each direction × 2 rails = 4. - assert.Len(t, plan.SameRail, 4) - // Cross-rail canary: one per ordered pod pair = 2. - assert.Len(t, plan.CrossRail, 2) + // Same-rail: 2 pods × 1 ordered pair each direction × 2 rails = 4 + // per family. + assert.Len(t, plan.RDMASameRail, 4) + assert.Len(t, plan.RDMABwSameRail, 4) + // Cross-rail canary: one per ordered pod pair = 2 per family. + assert.Len(t, plan.RDMACrossRail, 2) + assert.Len(t, plan.RDMABwCrossRail, 2) // Each cross-rail test should ping rail-0 → rail-1 (the first two // rails by sorted order). Concrete IP assertions catch // off-by-one in railOrder indexing. - for _, c := range plan.CrossRail { + for _, c := range plan.RDMACrossRail { assert.Equal(t, "rail-0", c.SrcRail) assert.Equal(t, "rail-1", c.DstRail) assert.Equal(t, "rail-0→rail-1", c.Rail) @@ -141,13 +165,15 @@ func TestPlan_ThreePodsThreeRails_FullMatrix(t *testing.T) { // - pod-a ↔ pod-b: 3 rails × 2 dirs = 6 // - pod-a ↔ pod-c: 2 rails × 2 dirs = 4 // - pod-b ↔ pod-c: 2 rails × 2 dirs = 4 - // Total = 14. - assert.Len(t, plan.SameRail, 14) + // Total = 14 per family (rping + ib_write_bw each). + assert.Len(t, plan.RDMASameRail, 14) + assert.Len(t, plan.RDMABwSameRail, 14) // Cross-rail canary requires ≥2 rails on both endpoints. All // three pods qualify (pod-c has 2), so every ordered pair gets - // one canary: 3 × 2 = 6. - assert.Len(t, plan.CrossRail, 6) + // one canary: 3 × 2 = 6 per family. + assert.Len(t, plan.RDMACrossRail, 6) + assert.Len(t, plan.RDMABwCrossRail, 6) } func TestPlan_StableOrderingAcrossRuns(t *testing.T) { @@ -158,10 +184,12 @@ func TestPlan_StableOrderingAcrossRuns(t *testing.T) { } plan1 := Plan(pods) plan2 := Plan(pods) - require.Equal(t, plan1.SameRail, plan2.SameRail) - require.Equal(t, plan1.CrossRail, plan2.CrossRail) + require.Equal(t, plan1.RDMASameRail, plan2.RDMASameRail) + require.Equal(t, plan1.RDMACrossRail, plan2.RDMACrossRail) + require.Equal(t, plan1.RDMABwSameRail, plan2.RDMABwSameRail) + require.Equal(t, plan1.RDMABwCrossRail, plan2.RDMABwCrossRail) // First test should always be from pod-a (sorted-by-name). - assert.Equal(t, "pod-a", plan1.SameRail[0].SrcPod) + assert.Equal(t, "pod-a", plan1.RDMASameRail[0].SrcPod) } func TestPlan_SkipsPairsWithoutSharedRail(t *testing.T) { @@ -171,7 +199,21 @@ func TestPlan_SkipsPairsWithoutSharedRail(t *testing.T) { }) require.Nil(t, plan.Skip) // No shared rail → zero same-rail tests. - assert.Empty(t, plan.SameRail) + assert.Empty(t, plan.RDMASameRail) + assert.Empty(t, plan.RDMABwSameRail) // Each pod only has 1 rail → no cross-rail canary either. - assert.Empty(t, plan.CrossRail) + assert.Empty(t, plan.RDMACrossRail) + assert.Empty(t, plan.RDMABwCrossRail) +} + +func TestPlan_SkipsPairsWithoutRDMADevices(t *testing.T) { + // Pod with an IP but no RDMA device for the shared rail — + // Plan() drops those silently since RunIbWriteBw needs -d . + a := mkPod("pod-a", "rail-0") + b := mkPod("pod-b", "rail-0") + delete(b.RDMADevsByRail, "rail-0") + plan := Plan([]TestPod{a, b}) + require.Nil(t, plan.Skip) + assert.Empty(t, plan.RDMASameRail, "no test should be emitted for the missing-device pair") + assert.Empty(t, plan.RDMABwSameRail) } diff --git a/pkg/networkoperatorplugin/connectivity/ping.go b/pkg/networkoperatorplugin/connectivity/ping.go deleted file mode 100644 index b0baa51..0000000 --- a/pkg/networkoperatorplugin/connectivity/ping.go +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2026 NVIDIA CORPORATION & AFFILIATES -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package connectivity - -import ( - "context" - "fmt" - "regexp" - "strconv" - - "github.com/nvidia/k8s-launch-kit/pkg/kubeclient" - "k8s.io/client-go/rest" -) - -// PingResult carries the outcome of one src→dst ping test. PacketLoss -// (0-100) is parsed from `ping`'s summary line; -1 means the line -// couldn't be parsed (treated as fail). RTT fields are best-effort — -// `ping` omits the rtt summary when every packet is lost. -type PingResult struct { - Test PingTest - OK bool - PacketLoss int // percent, 0-100; -1 when not parseable - RTTAvgMs float64 // 0 when unknown - Stdout string - Stderr string - Err error -} - -// pingSummaryRe matches the trailing summary line of `ping -c N` output, -// e.g.: -// -// 3 packets transmitted, 3 received, 0% packet loss, time 2003ms -// 5 packets transmitted, 0 received, 100% packet loss, time 4099ms -// -// Both BusyBox ping and iputils ping share this format; the -// `mellanox/rping-test` image ships iputils. -var pingSummaryRe = regexp.MustCompile(`(\d+)\s+packets transmitted,\s*(\d+)\s+(?:packets\s+)?received,\s*(\d+)%\s+packet loss`) - -// pingRTTRe matches the optional rtt summary line, e.g.: -// -// rtt min/avg/max/mdev = 0.123/0.456/0.789/0.100 ms -var pingRTTRe = regexp.MustCompile(`rtt min/avg/max/mdev = [\d.]+/([\d.]+)/[\d.]+/[\d.]+ ms`) - -// RunPing execs `ping -c -W 1 -I ` inside the -// source pod and parses the summary line. `-I` is critical for the -// rail-based matrix — without it, ping would use the pod's default-CNI -// interface and never traverse the secondary network we're testing. -// -// Returned PingResult is always populated; PingResult.Err carries any -// exec-level failure (RBAC, pod gone, kubelet timeout). PingResult.OK -// is true only when the exec succeeded *and* packet loss < 100%. -func RunPing(ctx context.Context, restConfig *rest.Config, namespace, container string, test PingTest, count int) PingResult { - if count <= 0 { - count = 3 - } - cmd := []string{ - "ping", - "-c", strconv.Itoa(count), - "-W", "1", - "-I", test.SrcIP, - test.DstIP, - } - res, err := kubeclient.ExecInPod(ctx, restConfig, namespace, test.SrcPod, container, cmd) - - r := PingResult{ - Test: test, - PacketLoss: -1, - Stdout: res.Stdout, - Stderr: res.Stderr, - Err: err, - } - - // Even on exec error we try to parse stdout — ping returns - // non-zero when packet loss is 100%, and the SPDY executor - // surfaces that as an error with stdout still populated. - if m := pingSummaryRe.FindStringSubmatch(res.Stdout); m != nil { - if loss, perr := strconv.Atoi(m[3]); perr == nil { - r.PacketLoss = loss - } - } - if m := pingRTTRe.FindStringSubmatch(res.Stdout); m != nil { - if avg, perr := strconv.ParseFloat(m[1], 64); perr == nil { - r.RTTAvgMs = avg - } - } - - switch { - case err != nil && r.PacketLoss < 0: - // Couldn't even parse output — treat as fail and keep the - // exec error for the report. - r.OK = false - case r.PacketLoss < 0: - // Stdout present but no summary line — unusual; surface - // it. - r.OK = false - r.Err = fmt.Errorf("ping output missing summary line") - default: - r.OK = r.PacketLoss < 100 - } - return r -} diff --git a/pkg/networkoperatorplugin/connectivity/rdma.go b/pkg/networkoperatorplugin/connectivity/rdma.go new file mode 100644 index 0000000..2e511e9 --- /dev/null +++ b/pkg/networkoperatorplugin/connectivity/rdma.go @@ -0,0 +1,261 @@ +// Copyright 2026 NVIDIA CORPORATION & AFFILIATES +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package connectivity + +import ( + "context" + "fmt" + "regexp" + "strconv" + "strings" + "time" + + "github.com/nvidia/k8s-launch-kit/pkg/kubeclient" + "k8s.io/client-go/rest" +) + +// rdmaServerSettleDelay is the pause between starting the server +// process and connecting from the client. ib_write_bw and rping both +// open a TCP listener on the server side before accepting QP connect +// requests; without a small settle window the client races the +// listener and reports "Couldn't connect to server". +const rdmaServerSettleDelay = 2 * time.Second + +// rdmaTestTimeout caps any single RDMA test (server start + client +// run + cleanup). Generous enough to survive a sluggish first +// connection; the orchestrator caps the whole matrix separately via +// Options.Timeout. +const rdmaTestTimeout = 90 * time.Second + +// DiscoverRDMADevices runs once per test pod to map each multus +// secondary-network interface to its RDMA device. Reads +// /sys/class/net//device/infiniband/ inside the pod — that +// directory exists for any mlx5 VF and contains exactly one entry +// (the device name, e.g. "mlx5_4"). +// +// nets is the parsed multus annotation; ifaceByRail tells the +// mapping which net interface (e.g. "net1") backs each rail. +// Returns map[rail]→; rails whose interface couldn't be +// resolved are absent from the map. +func DiscoverRDMADevices(ctx context.Context, restConfig *rest.Config, namespace, pod, container string, ifaceByRail map[string]string) map[string]string { + out := map[string]string{} + if len(ifaceByRail) == 0 { + return out + } + // One shell exec per pod: build a tiny script that prints + // `=` per resolved interface. Cheaper than one + // exec per rail. + var b strings.Builder + for rail, iface := range ifaceByRail { + if iface == "" { + continue + } + // `ls -1` prints one filename per line. Capture the + // first line — the mlx5 VF directory always contains + // exactly one entry, but `head -n 1` is a cheap safety + // belt against future kernel changes. + fmt.Fprintf(&b, "dev=$(ls -1 /sys/class/net/%s/device/infiniband/ 2>/dev/null | head -n 1); ", iface) + fmt.Fprintf(&b, "if [ -n \"$dev\" ]; then echo %q=$dev; fi; ", rail) + } + if b.Len() == 0 { + return out + } + res, err := kubeclient.ExecInPod(ctx, restConfig, namespace, pod, container, []string{"/bin/sh", "-c", b.String()}) + if err != nil { + return out + } + for _, line := range strings.Split(res.Stdout, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + eq := strings.IndexByte(line, '=') + if eq < 0 { + continue + } + rail := strings.Trim(strings.TrimSpace(line[:eq]), `"`) + dev := strings.TrimSpace(line[eq+1:]) + if rail != "" && dev != "" { + out[rail] = dev + } + } + return out +} + +// RunRPing executes the rping server/client dance for one test pair. +// Spawn-fresh-server-per-test (the agreed lifecycle): start the +// server as a background process via `nohup rping -s &`, sleep +// rdmaServerSettleDelay, run the client, then `pkill rping` on the +// server pod. Cleanup runs even if the client errored. +// +// rping returns 0 on success after C iterations. We don't read its +// output for any structured value — the binary OK is enough for +// matrix display. rping verifies the QP pair establishes +// end-to-end; ib_write_bw additionally measures throughput. +func RunRPing(ctx context.Context, restConfig *rest.Config, namespace string, serverPod, serverContainer string, clientPod, clientContainer string, test PingTest, iterations int) PingResult { + if iterations <= 0 { + iterations = 5 + } + r := PingResult{Test: test} + + tctx, cancel := context.WithTimeout(ctx, rdmaTestTimeout) + defer cancel() + + serverCmd := fmt.Sprintf("nohup rping -s -a %s -p 9999 -v >/tmp/rping-server.log 2>&1 & echo $!", test.DstIP) + srvRes, err := kubeclient.ExecInPod(tctx, restConfig, namespace, serverPod, serverContainer, []string{"/bin/sh", "-c", serverCmd}) + if err != nil { + r.Err = fmt.Errorf("rping server start: %w (stderr: %s)", err, srvRes.Stderr) + return r + } + serverPID := strings.TrimSpace(srvRes.Stdout) + defer killRDMAServer(restConfig, namespace, serverPod, serverContainer, "rping", serverPID) + + // Settle window — give the server time to bind. + select { + case <-tctx.Done(): + r.Err = fmt.Errorf("rping settle wait: %w", tctx.Err()) + return r + case <-time.After(rdmaServerSettleDelay): + } + + clientCmd := fmt.Sprintf("rping -c -a %s -p 9999 -C %d -v", test.DstIP, iterations) + cliRes, cliErr := kubeclient.ExecInPod(tctx, restConfig, namespace, clientPod, clientContainer, []string{"/bin/sh", "-c", clientCmd}) + r.Stdout, r.Stderr = cliRes.Stdout, cliRes.Stderr + r.Err = cliErr + // rping exits 0 only on a clean run; non-zero exit propagates + // to cliErr. + r.OK = cliErr == nil + return r +} + +// RunIbWriteBw runs an ib_write_bw bandwidth test for one src→dst pair. +// Uses `-R` (RDMA-CM connection management — same handshake the +// example DS expects) and `-s 65536 --report_gbits` to produce a +// single-line, single-message-size output that's straightforward to +// parse for a matrix cell. Each test allocates a fresh TCP listener +// port (default 18515 + an orchestrator-supplied offset) to avoid +// collisions with concurrent runs against unrelated rails. +func RunIbWriteBw(ctx context.Context, restConfig *rest.Config, namespace string, serverPod, serverContainer string, clientPod, clientContainer string, test PingTest, port int) PingResult { + if port <= 0 { + port = 18515 + } + r := PingResult{Test: test} + if test.SrcRDMADev == "" || test.DstRDMADev == "" { + r.Err = fmt.Errorf("ib_write_bw needs RDMA device names; got src=%q dst=%q", test.SrcRDMADev, test.DstRDMADev) + return r + } + + tctx, cancel := context.WithTimeout(ctx, rdmaTestTimeout) + defer cancel() + + // `ib_write_bw -d -R -s 65536 --report_gbits -p ` + // runs as the server when invoked without a peer IP, as the + // client when invoked with one. The server prints a banner and + // then waits; the client connects, runs the test, and both + // sides print the summary table. + serverCmd := fmt.Sprintf("nohup ib_write_bw -d %s -R -s 65536 --report_gbits -p %d >/tmp/ibwritebw-server.log 2>&1 & echo $!", + test.DstRDMADev, port) + srvRes, err := kubeclient.ExecInPod(tctx, restConfig, namespace, serverPod, serverContainer, []string{"/bin/sh", "-c", serverCmd}) + if err != nil { + r.Err = fmt.Errorf("ib_write_bw server start: %w (stderr: %s)", err, srvRes.Stderr) + return r + } + serverPID := strings.TrimSpace(srvRes.Stdout) + defer killRDMAServer(restConfig, namespace, serverPod, serverContainer, "ib_write_bw", serverPID) + + select { + case <-tctx.Done(): + r.Err = fmt.Errorf("ib_write_bw settle wait: %w", tctx.Err()) + return r + case <-time.After(rdmaServerSettleDelay): + } + + clientCmd := fmt.Sprintf("ib_write_bw -d %s -R -s 65536 --report_gbits -p %d %s", + test.SrcRDMADev, port, test.DstIP) + cliRes, cliErr := kubeclient.ExecInPod(tctx, restConfig, namespace, clientPod, clientContainer, []string{"/bin/sh", "-c", clientCmd}) + r.Stdout, r.Stderr = cliRes.Stdout, cliRes.Stderr + + bw, msgRate, parseOK := parseIbWriteBwOutput(cliRes.Stdout) + r.BandwidthGbps = bw + r.MsgRateMpps = msgRate + switch { + case cliErr != nil && !parseOK: + r.Err = cliErr + r.OK = false + case !parseOK: + r.Err = fmt.Errorf("ib_write_bw output missing summary row") + r.OK = false + default: + // Treat any non-zero bandwidth as success — partial-bw + // numbers are still informative (slow rail, MTU + // mismatch, etc.) and the cell value tells the story. + r.OK = bw > 0 + } + return r +} + +// killRDMAServer is best-effort cleanup. We try the captured PID +// first (precise) and fall back to a wildcard `pkill -f ` for +// the cases where the PID didn't survive the exec round-trip. +// Errors are swallowed because by this point we've already recorded +// the client's result — leaking a stale server is preferable to +// failing the test on a cleanup hiccup. +func killRDMAServer(restConfig *rest.Config, namespace, pod, container, prog, pid string) { + // Use a fresh background context so the cleanup runs even if + // the test's deadline already fired. + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + cmd := fmt.Sprintf("kill %s 2>/dev/null; pkill -f %q 2>/dev/null; true", pid, prog) + _, _ = kubeclient.ExecInPod(ctx, restConfig, namespace, pod, container, []string{"/bin/sh", "-c", cmd}) +} + +// ibWriteBwSummaryRe matches the single-row summary line that +// `ib_write_bw -s 65536 --report_gbits` emits on both client and +// server. Format (whitespace-separated columns, varies slightly +// across perftest versions; this regex matches the common shape): +// +// #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps] +// 65536 5000 194.39 193.21 0.368 +// +// The leading message-size column is always integer; subsequent +// columns are float with optional decimals. We don't anchor the +// regex to start-of-line because perftest sometimes prefixes the +// summary with " ". +var ibWriteBwSummaryRe = regexp.MustCompile(`^\s*\d+\s+\d+\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$`) + +// parseIbWriteBwOutput scans the client's stdout for the summary +// row. Returns the peak Gbps, message rate (Mpps), and ok=true when a +// summary row was found. We return the *peak* rather than average +// because the average can be dragged down by ramp-up cycles on the +// first few iterations; for a "is this rail healthy" matrix the +// peak is the more useful number. +func parseIbWriteBwOutput(stdout string) (peakGbps, msgRateMpps float64, ok bool) { + for _, line := range strings.Split(stdout, "\n") { + m := ibWriteBwSummaryRe.FindStringSubmatch(line) + if m == nil { + continue + } + peak, err1 := strconv.ParseFloat(m[1], 64) + _, err2 := strconv.ParseFloat(m[2], 64) // average — read for the side effect of validating the regex; not surfaced + rate, err3 := strconv.ParseFloat(m[3], 64) + if err1 != nil || err2 != nil || err3 != nil { + continue + } + return peak, rate, true + } + return 0, 0, false +} diff --git a/pkg/networkoperatorplugin/connectivity/rdma_test.go b/pkg/networkoperatorplugin/connectivity/rdma_test.go new file mode 100644 index 0000000..0da5ce0 --- /dev/null +++ b/pkg/networkoperatorplugin/connectivity/rdma_test.go @@ -0,0 +1,111 @@ +// Copyright 2026 NVIDIA CORPORATION & AFFILIATES +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package connectivity + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// realisticIbWriteBwOutput is a captured sample of perftest's +// `ib_write_bw -d -R -s 65536 --report_gbits ` client +// output. Includes the banner, the column header, the summary row, +// and the trailing banner so parseIbWriteBwOutput's regex anchoring +// is exercised against realistic surrounding context. +const realisticIbWriteBwOutput = `--------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_4 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + PCIe relax order: ON + ibv_wr* API : ON + TX depth : 128 + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 3 + Max inline data : 0[B] + rdma_cm QPs : ON + Data ex. method : rdma_cm +--------------------------------------------------------------------------------------- + local address: LID 0000 QPN 0x0094 PSN 0x4d2cb8 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:10:00:00:01 + remote address: LID 0000 QPN 0x00a4 PSN 0xabcdef + GID: 00:00:00:00:00:00:00:00:00:00:255:255:10:00:00:02 +--------------------------------------------------------------------------------------- + #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps] + 65536 5000 194.39 193.21 0.368459 +--------------------------------------------------------------------------------------- +` + +func TestParseIbWriteBwOutput(t *testing.T) { + t.Run("realistic summary row parses", func(t *testing.T) { + peak, rate, ok := parseIbWriteBwOutput(realisticIbWriteBwOutput) + require.True(t, ok) + assert.InDelta(t, 194.39, peak, 0.001) + assert.InDelta(t, 0.368459, rate, 0.0001) + }) + + t.Run("output without summary returns ok=false", func(t *testing.T) { + // Output the server side prints before the client + // finishes — no summary row, just the banner. + short := `------------------------------------------------------ + RDMA_Write BW Test + Device : mlx5_4 +------------------------------------------------------ +` + peak, rate, ok := parseIbWriteBwOutput(short) + assert.False(t, ok) + assert.Equal(t, 0.0, peak) + assert.Equal(t, 0.0, rate) + }) + + t.Run("malformed summary line is ignored", func(t *testing.T) { + bad := ` 65536 5000 not_a_number 193.21 0.368 +` + _, _, ok := parseIbWriteBwOutput(bad) + assert.False(t, ok) + }) + + t.Run("smaller message size sample", func(t *testing.T) { + // perftest sometimes emits sub-1Gbps numbers — the + // regex must accept them. + sample := ` 2 1000 0.05 0.04 3.125 +` + peak, rate, ok := parseIbWriteBwOutput(sample) + require.True(t, ok) + assert.InDelta(t, 0.05, peak, 0.001) + assert.InDelta(t, 3.125, rate, 0.001) + }) +} + +func TestPingTestKind_Predicates(t *testing.T) { + assert.True(t, RDMAPingSameRail.IsRDMAPing()) + assert.True(t, RDMAPingCrossRail.IsRDMAPing()) + assert.False(t, RDMABwSameRail.IsRDMAPing()) + + assert.True(t, RDMABwSameRail.IsRDMABw()) + assert.True(t, RDMABwCrossRail.IsRDMABw()) + assert.False(t, RDMAPingSameRail.IsRDMABw()) + + assert.True(t, RDMAPingCrossRail.IsCrossRail()) + assert.True(t, RDMABwCrossRail.IsCrossRail()) + assert.False(t, RDMAPingSameRail.IsCrossRail()) + assert.False(t, RDMABwSameRail.IsCrossRail()) +} diff --git a/pkg/networkoperatorplugin/connectivity/report.go b/pkg/networkoperatorplugin/connectivity/report.go index 20e2164..e14ee1a 100644 --- a/pkg/networkoperatorplugin/connectivity/report.go +++ b/pkg/networkoperatorplugin/connectivity/report.go @@ -26,6 +26,7 @@ import ( "github.com/nvidia/k8s-launch-kit/pkg/networkoperatorplugin" "github.com/nvidia/k8s-launch-kit/pkg/networkoperatorplugin/crstate" + "github.com/nvidia/k8s-launch-kit/pkg/presetmatch" ) // reportTemplate is the embedded HTML body the validate CLI's @@ -38,10 +39,33 @@ import ( //go:embed report.html.tmpl var reportTemplate string +// OverallVerdict captures the high-level pass/fail outcome rendered +// as a prominent banner at the top of the report. PASS means every +// gating check succeeded (Helm release version, component versions, +// manifest state, connectivity matrix). Preset deviations are +// informational and do NOT downgrade PASS — they're surfaced +// separately in the Node groups section. +type OverallVerdict struct { + // Pass is true when every gating check succeeded. + Pass bool + // Reasons lists each individual gating check that failed (one + // short line per failure). Empty when Pass. + Reasons []string + // Notes lists informational items that don't gate (preset + // deviations, in-progress manifests when --wait wasn't used, + // matrix soft-skip). Surfaced in the banner subtitle when + // non-empty. + Notes []string +} + // ReportData is the input to RenderHTML. The validate CLI populates // it from values it has already computed (Manifests, Matrix) plus a // few small lookups (cluster API version, node labels). type ReportData struct { + // Verdict is the overall pass/fail outcome rendered as a + // prominent banner at the top of the report. Computed by the + // caller (CLI) from the same inputs that drive the exit code. + Verdict OverallVerdict Cluster ClusterInfo Profile ProfileInfo NodeGroups []NodeGroupInfo @@ -52,8 +76,12 @@ type ReportData struct { // sub-table under "Network Operator release" in the HTML // report. Nil when the check couldn't run. ComponentCheck *networkoperatorplugin.ComponentVersionCheck - Manifests []networkoperatorplugin.ValidationResult - Matrix *MatrixResult + // PresetMatches carries one Result per cluster group from + // pkg/presetmatch — surfaced under the "Node groups" section. + // Empty when validate ran without a usable cluster-config.yaml. + PresetMatches []presetmatch.Result + Manifests []networkoperatorplugin.ValidationResult + Matrix *MatrixResult // Warnings is a flat list of one-line strings rendered as a // bulleted rollup at the bottom of the report — typically the // "in-progress manifest re-run later" notes and the matrix @@ -79,13 +107,33 @@ type NodeGroupInfo struct { IbCapable bool PresetApplied bool PresetDeviations []PresetDeviation - EastWestPFs []PFInfo - NorthSouthPFs []PFInfo + // EastWestPFs / NorthSouthPFs are the *actual* (discovered) + // PFs on the cluster. ExpectedEastWestPFs / ExpectedNorthSouthPFs + // mirror the layout but carry the certified topology's PFs from + // the matched preset (nil when no preset matched). When both are + // populated the report renders them as paired Actual / Expected + // sub-tables with mismatched rows highlighted in each. + EastWestPFs []PFInfo + NorthSouthPFs []PFInfo + ExpectedEastWestPFs []PFInfo + ExpectedNorthSouthPFs []PFInfo + // PFCountMismatch is non-nil when the discovered PF count + // differs from the certified topology. Surfaced inline in the + // East-west PFs Actual header. + PFCountMismatch *PFCountMismatch } // PFInfo is one row in a node group's PF table. The fields mirror // PFConfig but as strings so the template doesn't need helpers for // "—" fallbacks on nil pointers. +// +// Mismatched is true when this row diverges from its counterpart in +// the other table (the Actual table flags PCIs whose deviceID drifts +// from the certified topology or PCIs the certified topology +// doesn't list; the Expected table flags PCIs whose deviceID drifts +// or PCIs the cluster doesn't actually have). Rendered as a tinted +// row in both tables — the operator scans down and the diff is +// obvious without inline annotations. type PFInfo struct { PciAddress string DeviceID string @@ -97,7 +145,16 @@ type PFInfo struct { PartNumber string NumaNode string ConnectedGPU string - GPUProximity string + GPUProximity string + Mismatched bool +} + +// PFCountMismatch is set on NodeGroupInfo when the discovered PF +// count differs from the certified topology — rendered next to the +// "East-west PFs (N)" header rather than as a separate row. +type PFCountMismatch struct { + Expected int + Got int } // PresetDeviation is one row in a node group's deviation list — @@ -152,7 +209,7 @@ type NodeInfo struct { // parse / execute failure; the writer is not flushed (callers // typically pass an *os.File which the OS will flush on Close). func RenderHTML(w io.Writer, data ReportData) error { - tmpl, err := template.New("verify-report"). + tmpl, err := template.New("validation-report"). Funcs(reportFuncMap()). Parse(reportTemplate) if err != nil { @@ -184,6 +241,37 @@ func reportFuncMap() template.FuncMap { } return "state-unknown" }, + // presetStatusClass maps presetmatch.Status to the same + // state-color CSS classes manifest rows use, so the visual + // language is consistent across the report. + "presetStatusClass": func(s presetmatch.Status) string { + switch s { + case presetmatch.StatusMatch: + return "state-success" + case presetmatch.StatusDeviation: + return "state-inprogress" + case presetmatch.StatusNotFound: + return "state-missing" + case presetmatch.StatusSkipped: + return "state-missing" + } + return "state-unknown" + }, + // presetStatusLabel renders the human-facing label for the + // per-group platform-topology row's status badge. + "presetStatusLabel": func(s presetmatch.Status) string { + switch s { + case presetmatch.StatusMatch: + return "MATCH" + case presetmatch.StatusDeviation: + return "MISMATCH" + case presetmatch.StatusNotFound: + return "NOT CERTIFIED" + case presetmatch.StatusSkipped: + return "SKIPPED" + } + return "UNKNOWN" + }, // stateLabel renders the human-facing state name. "stateLabel": func(s crstate.CRState) string { switch s { @@ -213,48 +301,86 @@ func reportFuncMap() template.FuncMap { sort.Strings(out) return out }, - // matrixByRail groups same-rail PingResults by rail name so - // the template can render one sub-table per rail. + // matrixByRail groups same-rail PingResults by (rail, kind + // family) so the template can render one sub-table per + // (rail, family) bucket. Families are RDMA-ping and + // RDMA-bandwidth — see PingTestKind.IsRDMAPing / + // IsRDMABw. "matrixByRail": func(results []PingResult) []railSection { - byRail := map[string][]PingResult{} - rails := []string{} + type key struct { + rail string + fam string + } + byKey := map[key][]PingResult{} + order := []key{} for _, r := range results { - if r.Test.Kind != PingSameRail { + if r.Test.Kind.IsCrossRail() { continue } - if _, seen := byRail[r.Test.Rail]; !seen { - rails = append(rails, r.Test.Rail) + k := key{r.Test.Rail, htmlFamilyOf(r.Test.Kind)} + if _, seen := byKey[k]; !seen { + order = append(order, k) } - byRail[r.Test.Rail] = append(byRail[r.Test.Rail], r) + byKey[k] = append(byKey[k], r) } - sort.Strings(rails) - out := make([]railSection, 0, len(rails)) - for _, rail := range rails { - rs := railSection{Rail: rail} - rs.Nodes, rs.Table = buildRailTable(byRail[rail]) + // Sort deterministically: rail name, then a fixed + // family order so rping → ib_write_bw appear in + // execution order in the rendered report. + sort.Slice(order, func(i, j int) bool { + if order[i].rail != order[j].rail { + return order[i].rail < order[j].rail + } + return htmlFamilyRank(order[i].fam) < htmlFamilyRank(order[j].fam) + }) + out := make([]railSection, 0, len(order)) + for _, k := range order { + rs := railSection{Rail: k.rail, Family: k.fam} + rs.Nodes, rs.Table = buildRailTable(byKey[k]) out = append(out, rs) } return out }, // matrixCrossRail filters down to the cross-rail canaries - // and sorts them deterministically. - "matrixCrossRail": func(results []PingResult) []PingResult { - cross := make([]PingResult, 0) + // and groups them by family. Sorted by family rank, then + // src/dst node within each family. + "matrixCrossRail": func(results []PingResult) []crossRailSection { + byFam := map[string][]PingResult{} + fams := []string{} for _, r := range results { - if r.Test.Kind == PingCrossRail { - cross = append(cross, r) + if !r.Test.Kind.IsCrossRail() { + continue } - } - sort.Slice(cross, func(i, j int) bool { - if cross[i].Test.SrcNode != cross[j].Test.SrcNode { - return cross[i].Test.SrcNode < cross[j].Test.SrcNode + fam := htmlFamilyOf(r.Test.Kind) + if _, seen := byFam[fam]; !seen { + fams = append(fams, fam) } - return cross[i].Test.DstNode < cross[j].Test.DstNode - }) - return cross + byFam[fam] = append(byFam[fam], r) + } + sort.Slice(fams, func(i, j int) bool { return htmlFamilyRank(fams[i]) < htmlFamilyRank(fams[j]) }) + out := make([]crossRailSection, 0, len(fams)) + for _, fam := range fams { + rows := byFam[fam] + sort.Slice(rows, func(i, j int) bool { + if rows[i].Test.SrcNode != rows[j].Test.SrcNode { + return rows[i].Test.SrcNode < rows[j].Test.SrcNode + } + return rows[i].Test.DstNode < rows[j].Test.DstNode + }) + out = append(out, crossRailSection{Family: fam, Results: rows}) + } + return out + }, + // familyTitle is the human-friendly section header for a + // given kind family. + "familyTitle": func(fam string) string { + if fam == "ibbw" { + return "RDMA bandwidth (ib_write_bw)" + } + return "RDMA ping (rping)" }, // cellClass returns the CSS class for a matrix cell based - // on the ping outcome. + // on the result outcome (family-agnostic — color depends + // only on pass/fail/missing). "cellClass": func(r *PingResult) string { if r == nil { return "cell-missing" @@ -264,21 +390,22 @@ func reportFuncMap() template.FuncMap { } return "cell-fail" }, - // cellText renders the cell's body (loss%/rtt or "ERR"). - "cellText": func(r *PingResult) string { + // cellText renders the cell's body. Family-specific + // formatting: rping shows ✓/✗, ib_write_bw shows ✓ N Gbps. + "cellText": func(r *PingResult, fam string) string { if r == nil { return "·" } - if r.OK { - if r.RTTAvgMs > 0 { - return fmt.Sprintf("✓ %d%% %.2fms", r.PacketLoss, r.RTTAvgMs) + if fam == "ibbw" { + if r.OK && r.BandwidthGbps > 0 { + return fmt.Sprintf("✓ %.1f Gbps", r.BandwidthGbps) } - return fmt.Sprintf("✓ %d%%", r.PacketLoss) + return "✗ ERR" } - if r.PacketLoss >= 0 { - return fmt.Sprintf("✗ %d%%", r.PacketLoss) + if r.OK { + return "✓" } - return "✗ ERR" + return "✗" }, // nodeLabel mirrors the text renderer's axisLabel: prefer // SrcNode/DstNode, fall back to the pod name when the @@ -298,13 +425,42 @@ func reportFuncMap() template.FuncMap { } } -// railSection feeds the per-rail sub-table in the report. Nodes is -// the deterministic axis ordering; Table is a flat node→node→*result -// map (the template indexes by string keys for cells). +// railSection feeds the per-(rail, kind family) sub-table in the +// report. Nodes is the deterministic axis ordering; Table is a flat +// node→node→*result map (the template indexes by string keys for +// cells). Family is one of "rping" / "ibbw" — drives the section +// header and the cell-formatter dispatch. type railSection struct { - Rail string - Nodes []string - Table map[string]map[string]*PingResult + Rail string + Family string + Nodes []string + Table map[string]map[string]*PingResult +} + +// crossRailSection groups cross-rail canary results by kind family +// for the cross-rail list at the bottom of the matrix section. +type crossRailSection struct { + Family string + Results []PingResult +} + +// htmlFamilyOf maps a PingTestKind to the family identifier used by +// the HTML template's funcs (kept as a separate string from +// text_report.go's kindFamily enum to keep package boundaries clean). +func htmlFamilyOf(k PingTestKind) string { + if k.IsRDMABw() { + return "ibbw" + } + return "rping" +} + +// htmlFamilyRank gives a stable ordering of families in the rendered +// report — rping (QP establishment) before ib_write_bw (bandwidth). +func htmlFamilyRank(fam string) int { + if fam == "ibbw" { + return 1 + } + return 0 } // buildRailTable indexes a slice of same-rail PingResults by source diff --git a/pkg/networkoperatorplugin/connectivity/report.html.tmpl b/pkg/networkoperatorplugin/connectivity/report.html.tmpl index fcf9368..cb8d958 100644 --- a/pkg/networkoperatorplugin/connectivity/report.html.tmpl +++ b/pkg/networkoperatorplugin/connectivity/report.html.tmpl @@ -123,6 +123,36 @@ main { padding: 32px 40px 48px; } +.verdict-banner { + border-radius: 8px; + padding: 20px 24px; + margin-bottom: 28px; + border: 1px solid; +} +.verdict-banner.verdict-pass { + background: var(--success-bg); + border-color: var(--success-bd); + color: var(--success); +} +.verdict-banner.verdict-fail { + background: var(--error-bg); + border-color: var(--error-bd); + color: var(--error); +} +.verdict-headline { + font-size: 18px; + font-weight: 700; + letter-spacing: 0.4px; +} +.verdict-list { + margin-top: 10px; + padding-left: 20px; + font-size: 13.5px; + color: var(--text); +} +.verdict-list li { margin-top: 2px; } +.verdict-notes { color: var(--text-muted); } + section { background: var(--bg); border: 1px solid var(--border); @@ -324,6 +354,26 @@ details table th, details table td { padding: 6px 10px; } } .matrix-rail code { font-size: 12px; color: var(--text); } +/* Colour-coded headers for the paired Actual / Expected PF + * sub-tables. Actual = red ("this is what the cluster has; + * the highlighted rows are wrong"). Expected = orange ("this + * is what the certified topology says; the highlighted rows + * are what's missing or mismatched"). */ +.matrix-rail.pf-section-actual { + color: #991B1B; /* red-800 */ + background: rgba(220, 38, 38, 0.10); + border-left: 3px solid #DC2626; /* red-600 */ + padding: 6px 10px; + border-radius: 4px; +} +.matrix-rail.pf-section-expected { + color: #9A3412; /* orange-800 */ + background: rgba(234, 88, 12, 0.10); + border-left: 3px solid #EA580C; /* orange-600 */ + padding: 6px 10px; + border-radius: 4px; +} + .summary-line { font-size: 14px; color: var(--text); @@ -372,6 +422,35 @@ details table th, details table td { padding: 6px 10px; } } .muted { color: var(--text-muted); } +/* Platform-topology diff styling. The PF section renders paired + * Actual / Expected sub-tables; rows that don't match their + * counterpart in the other table are tinted amber. The reader + * scans down both tables and the divergence is obvious without + * inline annotations. */ +.mismatch-inline { + color: #B45309; /* amber-700, contrast against the row */ + font-size: 11px; + margin-left: 6px; +} +/* Mismatched rows take their tint from the table they're in, + * matching the section header: red in Actual, orange in + * Expected. The adjacent-sibling combinator targets the + * that immediately follows each section header div. */ +.matrix-rail.pf-section-actual + table tr.pf-mismatched { + background: rgba(220, 38, 38, 0.12); +} +.matrix-rail.pf-section-actual + table tr.pf-mismatched td { + border-top: 1px solid rgba(220, 38, 38, 0.32); + border-bottom: 1px solid rgba(220, 38, 38, 0.32); +} +.matrix-rail.pf-section-expected + table tr.pf-mismatched { + background: rgba(234, 88, 12, 0.12); +} +.matrix-rail.pf-section-expected + table tr.pf-mismatched td { + border-top: 1px solid rgba(234, 88, 12, 0.32); + border-bottom: 1px solid rgba(234, 88, 12, 0.32); +} + .warning-list { list-style: none; padding-left: 0; @@ -419,6 +498,26 @@ footer strong { color: var(--text-muted); }
+
+
+ {{if .Verdict.Pass}}✓ VALIDATION PASSED{{else}}✗ VALIDATION FAILED{{end}} +
+ {{- if .Verdict.Reasons}} +
    + {{- range .Verdict.Reasons}} +
  • {{.}}
  • + {{- end}} +
+ {{- end}} + {{- if .Verdict.Notes}} +
    + {{- range .Verdict.Notes}} +
  • {{.}}
  • + {{- end}} +
+ {{- end}} +
+

Profile

@@ -436,6 +535,36 @@ footer strong { color: var(--text-muted); }

Node groups ({{len .NodeGroups}})

+ {{- if .PresetMatches}} +
Platform topology validation
+
+ + + + + {{- range .PresetMatches}} + + + + + + + + {{- end}} + +
GroupMachineGPUStatusDetail
{{or .Group "—"}}{{or .MachineType "—"}}{{or .GPUType "—"}}{{presetStatusLabel .Status}} + {{- if eq (printf "%s" .Status) "match"}} + matches certified topology for {{.PresetName}} server type + {{- else if eq (printf "%s" .Status) "deviation"}} + does not match certified topology for {{.PresetName}} server type · {{.Reason}} + {{- else if eq (printf "%s" .Status) "not-found"}} + no certified topology available for this server type · {{.Reason}} + {{- else}} + {{.Reason}} + {{- end}} +
+ {{- end}} + {{- if .NodeGroups}} {{- range .NodeGroups}}
@@ -466,17 +595,39 @@ footer strong { color: var(--text-muted); } {{- if and (not .SriovCapable) (not .RdmaCapable) (not .IbCapable)}}none{{end}} {{- if .PresetApplied}} -
Preset
applied
+
Certified topology
applied
{{- end}} {{- if .EastWestPFs}} -
East-west PFs ({{len .EastWestPFs}})
+
East-west PFs — Actual ({{if .PFCountMismatch}}{{.PFCountMismatch.Got}} certified topology expects {{.PFCountMismatch.Expected}}{{else}}{{len .EastWestPFs}}{{end}})
{{- range .EastWestPFs}} - + + + + + + + + + + + + {{- end}} + +
PCIDevice IDRailNetdevRDMAPSIDPart #NUMAGPU
{{.PciAddress}}{{or .DeviceID "—"}}{{.Rail}}{{or .NetworkInterface "—"}}{{or .RdmaDevice "—"}}{{or .PSID "—"}}{{or .PartNumber "—"}}{{or .NumaNode "—"}}{{or .ConnectedGPU "—"}}{{if .GPUProximity}} ({{.GPUProximity}}){{end}}
+ {{- end}} + + {{- if .ExpectedEastWestPFs}} +
East-west PFs — Expected by certified topology ({{len .ExpectedEastWestPFs}})
+ + + + {{- range .ExpectedEastWestPFs}} + @@ -493,12 +644,12 @@ footer strong { color: var(--text-muted); } {{- end}} {{- if .NorthSouthPFs}} -
North-south PFs ({{len .NorthSouthPFs}})
+
North-south PFs — Actual ({{len .NorthSouthPFs}})
PCIDevice IDRailNetdevRDMAPSIDPart #NUMAGPU
{{.PciAddress}} {{or .DeviceID "—"}} {{.Rail}}
{{- range .NorthSouthPFs}} - + @@ -511,13 +662,20 @@ footer strong { color: var(--text-muted); }
PCIDevice IDNetdevRDMAPSIDPart #
{{.PciAddress}} {{or .DeviceID "—"}} {{or .NetworkInterface "—"}}
{{- end}} - {{- if .PresetDeviations}} -
Preset deviations ({{len .PresetDeviations}})
+ {{- if .ExpectedNorthSouthPFs}} +
North-south PFs — Expected by certified topology ({{len .ExpectedNorthSouthPFs}})
- + - {{- range .PresetDeviations}} - + {{- range .ExpectedNorthSouthPFs}} + + + + + + + + {{- end}}
FieldExpectedGotDetail
PCIDevice IDNetdevRDMAPSIDPart #
{{.Field}}{{or .Expected "—"}}{{or .Got "—"}}{{.Detail}}
{{.PciAddress}}{{or .DeviceID "—"}}{{or .NetworkInterface "—"}}{{or .RdmaDevice "—"}}{{or .PSID "—"}}{{or .PartNumber "—"}}
@@ -681,12 +839,12 @@ footer strong { color: var(--text-muted); }

{{.Matrix.Summary.Passed}}/{{.Matrix.Summary.TotalTests}} passed {{- if gt .Matrix.Summary.Failed 0}} · {{.Matrix.Summary.Failed}} failed{{end}} - {{- if gt .Matrix.Summary.ExecErrors 0}} · {{.Matrix.Summary.ExecErrors}} exec error(s){{end}}

{{- $rails := matrixByRail .Matrix.PingResults}} {{- range $rails}} -
Rail {{.Rail}}
+ {{- $fam := .Family}} +
Rail {{.Rail}} — {{familyTitle $fam}}
@@ -706,7 +864,7 @@ footer strong { color: var(--text-muted); } {{- else}} {{- $cell := index (index $rs.Table $src) $dst}} - + {{- end}} {{- end}} @@ -715,21 +873,22 @@ footer strong { color: var(--text-muted); }
{{cellText $cell}}{{cellText $cell $fam}}
{{- end}} - {{- $cross := matrixCrossRail .Matrix.PingResults}} - {{- if $cross}} -
Cross-rail canary
+ {{- $crossSections := matrixCrossRail .Matrix.PingResults}} + {{- range $crossSections}} + {{- $fam := .Family}} +
Cross-rail canary — {{familyTitle $fam}}
- {{- range $cross}} + {{- range .Results}} - + {{- end}} diff --git a/pkg/networkoperatorplugin/connectivity/report_test.go b/pkg/networkoperatorplugin/connectivity/report_test.go index 55c5201..b2542bb 100644 --- a/pkg/networkoperatorplugin/connectivity/report_test.go +++ b/pkg/networkoperatorplugin/connectivity/report_test.go @@ -24,8 +24,10 @@ import ( "testing" "time" + "github.com/nvidia/k8s-launch-kit/pkg/config" "github.com/nvidia/k8s-launch-kit/pkg/networkoperatorplugin" "github.com/nvidia/k8s-launch-kit/pkg/networkoperatorplugin/crstate" + "github.com/nvidia/k8s-launch-kit/pkg/presetmatch" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -33,7 +35,7 @@ import ( // updateGolden re-writes the checked-in golden file when set. Run with // `go test ./pkg/networkoperatorplugin/connectivity/ -update-golden` // after intentional template changes. -var updateGolden = flag.Bool("update-golden", false, "rewrite testdata/verify-report.golden.html with the current renderer output") +var updateGolden = flag.Bool("update-golden", false, "rewrite testdata/k8s-launch-kit-validation-report.golden.html with the current renderer output") // fixtureData is the deterministic input the renderer is exercised // against. Includes one of every state badge, an IN-PROGRESS row with @@ -42,6 +44,13 @@ var updateGolden = flag.Bool("update-golden", false, "rewrite testdata/verify-re // surface. func fixtureData() ReportData { return ReportData{ + Verdict: OverallVerdict{ + Pass: false, + Reasons: []string{ + "1 RDMA test(s) failed in the connectivity matrix", + `The detected platform topology does not match the certified topology for ACME-vendor-a-h200 server type (see Node groups section for the per-device diff)`, + }, + }, Cluster: ClusterInfo{ L8kVersion: "v0.0.0-test", GeneratedAt: time.Date(2026, 5, 23, 11, 30, 0, 0, time.UTC), @@ -65,14 +74,43 @@ func fixtureData() ReportData { NodeSelector: map[string]string{"nvidia.kubernetes-launch-kit.machine": "vendor-a-h200"}, WorkerNodes: []string{"node-a", "node-b"}, SriovCapable: true, RdmaCapable: true, + // Actual = discovered hardware. EastWestPFs: []PFInfo{ + // deviceID drift from the certified topology + // at this PCI — row tinted. {PciAddress: "0000:08:00.0", DeviceID: "1021", Rail: "0", Traffic: "east-west", - NetworkInterface: "eth_r0", RdmaDevice: "rdma_r0", PartNumber: "MCX713106AE", NumaNode: "0", ConnectedGPU: "0"}, - {PciAddress: "0000:08:00.1", DeviceID: "1021", Rail: "1", Traffic: "east-west", + NetworkInterface: "eth_r0", RdmaDevice: "rdma_r0", PartNumber: "MCX713106AE", NumaNode: "0", ConnectedGPU: "0", + Mismatched: true}, + // Matches the certified topology — clean row. + {PciAddress: "0000:08:00.1", DeviceID: "1023", Rail: "1", Traffic: "east-west", + NetworkInterface: "eth_r1", RdmaDevice: "rdma_r1", PartNumber: "MCX713106AE", NumaNode: "0", ConnectedGPU: "1"}, + // Cluster has this PCI but the certified + // topology doesn't — row tinted. + {PciAddress: "0000:08:00.2", DeviceID: "1023", Rail: "2", Traffic: "east-west", + NetworkInterface: "eth_r2", RdmaDevice: "rdma_r2", PartNumber: "MCX713106AE", NumaNode: "0", ConnectedGPU: "2", + Mismatched: true}, + }, + // Expected = certified topology from the matched preset. + ExpectedEastWestPFs: []PFInfo{ + // Same PCI as Actual[0] but the expected + // deviceID — tinted in this table too because + // the actual deviceID drifts. + {PciAddress: "0000:08:00.0", DeviceID: "1023", Rail: "0", Traffic: "east-west", + NetworkInterface: "eth_r0", RdmaDevice: "rdma_r0", PartNumber: "MCX713106AE", NumaNode: "0", ConnectedGPU: "0", + Mismatched: true}, + // Matches the cluster — clean row. + {PciAddress: "0000:08:00.1", DeviceID: "1023", Rail: "1", Traffic: "east-west", NetworkInterface: "eth_r1", RdmaDevice: "rdma_r1", PartNumber: "MCX713106AE", NumaNode: "0", ConnectedGPU: "1"}, + // Certified topology expects this PCI but the + // cluster doesn't have it — tinted. + {PciAddress: "0000:08:00.3", DeviceID: "1023", Rail: "3", Traffic: "east-west", + NetworkInterface: "eth_r3", RdmaDevice: "rdma_r3", PartNumber: "MCX713106AE", NumaNode: "0", ConnectedGPU: "3", + Mismatched: true}, }, PresetDeviations: []PresetDeviation{ - {Field: "deviceID", Expected: "1023", Got: "1021", Detail: "expected ConnectX-8 (1023), found ConnectX-7 (1021)"}, + {Field: "deviceID", Expected: "1023@0000:08:00.0", Got: "1021@0000:08:00.0", Detail: "device ID at PCI address differs from preset"}, + {Field: "pciAddress", Got: "0000:08:00.2", Detail: "discovered PCI address not present in preset"}, + {Field: "pciAddress", Expected: "0000:08:00.3", Detail: "preset PCI address not present on discovered hardware"}, }, PresetApplied: true, }, @@ -142,47 +180,84 @@ func fixtureData() ReportData { }, Matrix: &MatrixResult{ PingResults: []PingResult{ + // rping: same-rail, both directions, rail-0 (both pass) { - Test: PingTest{Kind: PingSameRail, + Test: PingTest{Kind: RDMAPingSameRail, SrcNode: "node-a", DstNode: "node-b", Rail: "rail-0", SrcRail: "rail-0", DstRail: "rail-0"}, - OK: true, PacketLoss: 0, RTTAvgMs: 0.12, + OK: true, }, { - Test: PingTest{Kind: PingSameRail, + Test: PingTest{Kind: RDMAPingSameRail, SrcNode: "node-b", DstNode: "node-a", Rail: "rail-0", SrcRail: "rail-0", DstRail: "rail-0"}, - OK: true, PacketLoss: 0, RTTAvgMs: 0.15, + OK: true, }, + // rping: same-rail, rail-1 (one direction fails) { - Test: PingTest{Kind: PingSameRail, + Test: PingTest{Kind: RDMAPingSameRail, SrcNode: "node-a", DstNode: "node-b", Rail: "rail-1", SrcRail: "rail-1", DstRail: "rail-1"}, - OK: true, PacketLoss: 0, RTTAvgMs: 0.18, + OK: true, }, { - Test: PingTest{Kind: PingSameRail, + Test: PingTest{Kind: RDMAPingSameRail, SrcNode: "node-b", DstNode: "node-a", Rail: "rail-1", SrcRail: "rail-1", DstRail: "rail-1"}, - OK: false, PacketLoss: 100, + OK: false, + }, + // rping: cross-rail canary + { + Test: PingTest{Kind: RDMAPingCrossRail, + SrcNode: "node-a", DstNode: "node-b", + Rail: "rail-0→rail-1", SrcRail: "rail-0", DstRail: "rail-1"}, + OK: true, + }, + // ib_write_bw: same-rail rail-0 both directions pass with bandwidth + { + Test: PingTest{Kind: RDMABwSameRail, + SrcNode: "node-a", DstNode: "node-b", + Rail: "rail-0", SrcRail: "rail-0", DstRail: "rail-0"}, + OK: true, BandwidthGbps: 194.4, + }, + { + Test: PingTest{Kind: RDMABwSameRail, + SrcNode: "node-b", DstNode: "node-a", + Rail: "rail-0", SrcRail: "rail-0", DstRail: "rail-0"}, + OK: true, BandwidthGbps: 193.8, }, + // ib_write_bw: cross-rail canary passes with bandwidth { - Test: PingTest{Kind: PingCrossRail, + Test: PingTest{Kind: RDMABwCrossRail, SrcNode: "node-a", DstNode: "node-b", Rail: "rail-0→rail-1", SrcRail: "rail-0", DstRail: "rail-1"}, - OK: true, PacketLoss: 0, RTTAvgMs: 0.20, + OK: true, BandwidthGbps: 187.6, }, }, - Summary: MatrixSummary{TotalTests: 5, Passed: 4, Failed: 1}, + Summary: MatrixSummary{TotalTests: 8, Passed: 7, Failed: 1}, }, Warnings: []string{ "SriovNetworkNodePolicy/ethernet-sriov-rail-0 is in-progress on 1/2 nodes — re-run later.", "IPPool/rail-0-pool not found — l8k generate was not run before validate.", }, + PresetMatches: []presetmatch.Result{ + { + Group: "vendor-a-h200", + MachineType: "vendor-a", + GPUType: "h200", + Manufacturer: "ACME", + Status: presetmatch.StatusDeviation, + PresetName: "vendor-a/h200", + Reason: "1 deviation(s) from matched preset", + Deviations: []config.PresetDeviationEntry{ + {Field: "deviceID", Expected: "1023", Got: "1021", Detail: "expected ConnectX-8 (1023), found ConnectX-7 (1021)"}, + }, + }, + }, } } -const goldenPath = "testdata/verify-report.golden.html" +const goldenPath = "testdata/k8s-launch-kit-validation-report.golden.html" func TestRenderHTML_Golden(t *testing.T) { var buf bytes.Buffer diff --git a/pkg/networkoperatorplugin/connectivity/result.go b/pkg/networkoperatorplugin/connectivity/result.go new file mode 100644 index 0000000..5b5d757 --- /dev/null +++ b/pkg/networkoperatorplugin/connectivity/result.go @@ -0,0 +1,38 @@ +// Copyright 2026 NVIDIA CORPORATION & AFFILIATES +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package connectivity + +// PingResult carries the outcome of one src→dst matrix test. The +// matrix currently runs rping and ib_write_bw — fields specific to a +// kind stay zero for tests of the other kind: +// +// - rping: OK + Err + Stdout/Stderr; BandwidthGbps stays at 0. +// - ib_write_bw: OK + Err + Stdout/Stderr + BandwidthGbps + +// MsgRateMpps populated by parseIbWriteBwOutput. +// +// The struct name is kept for backwards compatibility with the +// historical ICMP-first pipeline; the matrix is RDMA-only now but +// callers and the JSON schema continue to reference PingResult. +type PingResult struct { + Test PingTest + OK bool + BandwidthGbps float64 // ib_write_bw only; 0 when n/a + MsgRateMpps float64 // ib_write_bw only; 0 when n/a + Stdout string + Stderr string + Err error +} diff --git a/pkg/networkoperatorplugin/connectivity/testdata/verify-report.golden.html b/pkg/networkoperatorplugin/connectivity/testdata/k8s-launch-kit-validation-report.golden.html similarity index 74% rename from pkg/networkoperatorplugin/connectivity/testdata/verify-report.golden.html rename to pkg/networkoperatorplugin/connectivity/testdata/k8s-launch-kit-validation-report.golden.html index a294c7f..2001f5d 100644 --- a/pkg/networkoperatorplugin/connectivity/testdata/verify-report.golden.html +++ b/pkg/networkoperatorplugin/connectivity/testdata/k8s-launch-kit-validation-report.golden.html @@ -104,6 +104,36 @@ padding: 32px 40px 48px; } +.verdict-banner { + border-radius: 8px; + padding: 20px 24px; + margin-bottom: 28px; + border: 1px solid; +} +.verdict-banner.verdict-pass { + background: var(--success-bg); + border-color: var(--success-bd); + color: var(--success); +} +.verdict-banner.verdict-fail { + background: var(--error-bg); + border-color: var(--error-bd); + color: var(--error); +} +.verdict-headline { + font-size: 18px; + font-weight: 700; + letter-spacing: 0.4px; +} +.verdict-list { + margin-top: 10px; + padding-left: 20px; + font-size: 13.5px; + color: var(--text); +} +.verdict-list li { margin-top: 2px; } +.verdict-notes { color: var(--text-muted); } + section { background: var(--bg); border: 1px solid var(--border); @@ -297,6 +327,22 @@ } .matrix-rail code { font-size: 12px; color: var(--text); } + +.matrix-rail.pf-section-actual { + color: #991B1B; + background: rgba(220, 38, 38, 0.10); + border-left: 3px solid #DC2626; + padding: 6px 10px; + border-radius: 4px; +} +.matrix-rail.pf-section-expected { + color: #9A3412; + background: rgba(234, 88, 12, 0.10); + border-left: 3px solid #EA580C; + padding: 6px 10px; + border-radius: 4px; +} + .summary-line { font-size: 14px; color: var(--text); @@ -345,6 +391,28 @@ } .muted { color: var(--text-muted); } + +.mismatch-inline { + color: #B45309; + font-size: 11px; + margin-left: 6px; +} + +.matrix-rail.pf-section-actual + table tr.pf-mismatched { + background: rgba(220, 38, 38, 0.12); +} +.matrix-rail.pf-section-actual + table tr.pf-mismatched td { + border-top: 1px solid rgba(220, 38, 38, 0.32); + border-bottom: 1px solid rgba(220, 38, 38, 0.32); +} +.matrix-rail.pf-section-expected + table tr.pf-mismatched { + background: rgba(234, 88, 12, 0.12); +} +.matrix-rail.pf-section-expected + table tr.pf-mismatched td { + border-top: 1px solid rgba(234, 88, 12, 0.32); + border-bottom: 1px solid rgba(234, 88, 12, 0.32); +} + .warning-list { list-style: none; padding-left: 0; @@ -390,6 +458,16 @@

Validation Report

+
+
+ ✗ VALIDATION FAILED +
+
    +
  • 1 RDMA test(s) failed in the connectivity matrix
  • +
  • The detected platform topology does not match the certified topology for ACME-vendor-a-h200 server type (see Node groups section for the per-device diff)
  • +
+
+

Profile

@@ -404,6 +482,23 @@

Profile

Node groups (1)

+
Platform topology validation
+
SourceSrc railDestinationDst railResult
{{srcLabel .Test}} {{.Test.SrcRail}} {{dstLabel .Test}} {{.Test.DstRail}}{{cellText .}}{{cellText . $fam}}
+ + + + + + + + + + + + +
GroupMachineGPUStatusDetail
vendor-a-h200vendor-ah200MISMATCH + does not match certified topology for vendor-a/h200 server type · 1 deviation(s) from matched preset +
vendor-a-h200 · vendor-a · h200 · Ethernet @@ -418,13 +513,13 @@

Node groups (1)

Capabilities
sriovrdma
-
Preset
applied
+
Certified topology
applied
-
East-west PFs (2)
+
East-west PFs — Actual (3)
- + @@ -437,7 +532,7 @@

Node groups (1)

- + @@ -446,13 +541,56 @@

Node groups (1)

+ + + + + + + + + + +
PCIDevice IDRailNetdevRDMAPSIDPart #NUMAGPU
0000:08:00.0 1021 0
0000:08:00.110211023 1 eth_r1 rdma_r1 0 1
0000:08:00.210232eth_r2rdma_r2MCX713106AE02
-
Preset deviations (1)
+
East-west PFs — Expected by certified topology (3)
- + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FieldExpectedGotDetail
PCIDevice IDRailNetdevRDMAPSIDPart #NUMAGPU
deviceID10231021expected ConnectX-8 (1023), found ConnectX-7 (1021)
0000:08:00.010230eth_r0rdma_r0MCX713106AE00
0000:08:00.110231eth_r1rdma_r1MCX713106AE01
0000:08:00.310233eth_r3rdma_r3MCX713106AE03
@@ -622,9 +760,9 @@

Manifest state (4)

Connectivity matrix

- 4/5 passed · 1 failed + 7/8 passed · 1 failed

-
Rail rail-0
+
Rail rail-0 — RDMA ping (rping)
@@ -637,16 +775,16 @@

Connectivity matrix

- + - +
node-a ✓ 0% 0.12ms
node-b✓ 0% 0.15ms
-
Rail rail-1
+
Rail rail-0 — RDMA bandwidth (ib_write_bw)
@@ -659,16 +797,53 @@

Connectivity matrix

- + - +
node-a ✓ 0% 0.18ms✓ 194.4 Gbps
node-b✗ 100%✓ 193.8 Gbps
-
Cross-rail canary
+
Rail rail-1 — RDMA ping (rping)
+ + + + + + + + + + + + + + + + + + + + +
src \ dstnode-anode-b
node-a
node-b
+
Cross-rail canary — RDMA ping (rping)
+ + + + + + + + + + + + + +
SourceSrc railDestinationDst railResult
node-arail-0node-brail-1
+
Cross-rail canary — RDMA bandwidth (ib_write_bw)
@@ -679,7 +854,7 @@

Connectivity matrix

- +
SourceSrc railDestinationDst railResult
rail-0 node-b rail-1✓ 0% 0.20ms✓ 187.6 Gbps
diff --git a/pkg/networkoperatorplugin/connectivity/text_report.go b/pkg/networkoperatorplugin/connectivity/text_report.go index c479047..84b9177 100644 --- a/pkg/networkoperatorplugin/connectivity/text_report.go +++ b/pkg/networkoperatorplugin/connectivity/text_report.go @@ -33,12 +33,11 @@ import ( // structured MatrixResult that the validate CLI marshals to stdout. // // Cells: -// - "—" src equals dst (self-ping, not run) +// - "—" src equals dst (self-test, not run) // - "·" no test for this (src,dst,rail) — shared-rail set // didn't include both endpoints -// - "✓ 0%" ping passed; packet loss percentage when non-zero, -// RTT when present (e.g. "✓ 0% 0.5ms") -// - "✗ 100%" ping failed; "✗ ERR" if the exec itself errored +// - rping — "✓" / "✗" / "✗ ERR" +// - ib_write_bw — "✓ 194.4 Gbps" / "✗ ERR" // // ANSI color is applied only when uiOutput.IsTTY() — keeps log files // and CI pipelines free of escape sequences. @@ -51,34 +50,75 @@ func RenderMatrixText(uiOutput ui.Output, result *MatrixResult) { } tty := uiOutput.IsTTY() - byRail, crossRail, nodesSorted, railsSorted := groupResults(result.PingResults) + // Group per (rail, kind family). One grid is rendered for + // every (rail, family) bucket that has at least one result, in + // the test-execution order so the reader sees rping + // (QP-establishment canary) before ib_write_bw (bandwidth). + byRail, byCross, nodes, rails := groupResultsByKind(result.PingResults) - for _, rail := range railsSorted { - uiOutput.Info("") - uiOutput.Info("Rail %s:", rail) - for _, line := range renderRailGrid(nodesSorted, byRail[rail], tty) { - uiOutput.Info("%s", line) + families := []kindFamily{familyRPing, familyIbBw} + for _, rail := range rails { + for _, fam := range families { + grid, ok := byRail[rail][fam] + if !ok { + continue + } + uiOutput.Info("") + uiOutput.Info("Rail %s — %s:", rail, familyTitle(fam)) + for _, line := range renderRailGrid(nodes, grid, fam, tty) { + uiOutput.Info("%s", line) + } } } - if len(crossRail) > 0 { + for _, fam := range families { + cross, ok := byCross[fam] + if !ok || len(cross) == 0 { + continue + } uiOutput.Info("") - uiOutput.Info("Cross-rail canary:") - for _, line := range renderCrossRailList(crossRail, tty) { + uiOutput.Info("Cross-rail canary — %s:", familyTitle(fam)) + for _, line := range renderCrossRailList(cross, fam, tty) { uiOutput.Info("%s", line) } } } -// groupResults indexes ping results by rail and source node for the -// per-rail grid, plus a flat slice for the cross-rail canary. Returns -// the deterministic node and rail orderings so callers don't have to -// sort again. Node names are used as the axis labels because they're -// what operators recognize ("worker-03" vs the DaemonSet-generated -// pod suffix like "sriov-test-7t8h9"); the underlying pod name is -// still carried on the PingTest for the SPDY exec path. -func groupResults(results []PingResult) (byRail map[string]map[string]map[string]*PingResult, crossRail []*PingResult, nodes []string, rails []string) { - byRail = map[string]map[string]map[string]*PingResult{} +// kindFamily collapses the four PingTestKind values down to the two +// families the renderer cares about: rping and ib_write_bw. The +// same-rail / cross-rail axis is handled separately (per-rail grid vs +// cross-rail list). +type kindFamily int + +const ( + familyRPing kindFamily = iota + familyIbBw +) + +func kindFamilyOf(k PingTestKind) kindFamily { + if k.IsRDMABw() { + return familyIbBw + } + return familyRPing +} + +func familyTitle(f kindFamily) string { + if f == familyIbBw { + return "RDMA bandwidth (ib_write_bw)" + } + return "RDMA ping (rping)" +} + +// groupResultsByKind indexes results by (rail, family, src) → dst → +// result for the per-rail grids, and by family → []result for the +// cross-rail canary lists. Returns the deterministic node and rail +// orderings so callers don't have to sort again. Node names are used +// as the axis labels because they're what operators recognize +// ("worker-03" vs the DaemonSet-generated pod suffix); the underlying +// pod name is still carried on the PingTest for the SPDY exec path. +func groupResultsByKind(results []PingResult) (byRail map[string]map[kindFamily]map[string]map[string]*PingResult, byCross map[kindFamily][]*PingResult, nodes []string, rails []string) { + byRail = map[string]map[kindFamily]map[string]map[string]*PingResult{} + byCross = map[kindFamily][]*PingResult{} nodeSet := map[string]struct{}{} railSet := map[string]struct{}{} @@ -88,18 +128,22 @@ func groupResults(results []PingResult) (byRail map[string]map[string]map[string dst := axisLabel(r.Test.DstNode, r.Test.DstPod) nodeSet[src] = struct{}{} nodeSet[dst] = struct{}{} - if r.Test.Kind == PingCrossRail { - crossRail = append(crossRail, r) + fam := kindFamilyOf(r.Test.Kind) + if r.Test.Kind.IsCrossRail() { + byCross[fam] = append(byCross[fam], r) continue } if _, ok := byRail[r.Test.Rail]; !ok { - byRail[r.Test.Rail] = map[string]map[string]*PingResult{} + byRail[r.Test.Rail] = map[kindFamily]map[string]map[string]*PingResult{} railSet[r.Test.Rail] = struct{}{} } - if _, ok := byRail[r.Test.Rail][src]; !ok { - byRail[r.Test.Rail][src] = map[string]*PingResult{} + if _, ok := byRail[r.Test.Rail][fam]; !ok { + byRail[r.Test.Rail][fam] = map[string]map[string]*PingResult{} + } + if _, ok := byRail[r.Test.Rail][fam][src]; !ok { + byRail[r.Test.Rail][fam][src] = map[string]*PingResult{} } - byRail[r.Test.Rail][src][dst] = r + byRail[r.Test.Rail][fam][src][dst] = r } nodes = make([]string, 0, len(nodeSet)) @@ -130,7 +174,9 @@ func axisLabel(node, pod string) string { // renderRailGrid produces the lines for one rail's src×dst grid, with // columns aligned via text/tabwriter. Axis labels are node names // (with pod-name fallback for endpoints whose NodeName wasn't set). -func renderRailGrid(nodes []string, table map[string]map[string]*PingResult, tty bool) []string { +// fam selects per-kind cell formatting (ICMP shows loss + RTT, +// rping shows ✓/✗, ib_write_bw shows ✓ N Gbps). +func renderRailGrid(nodes []string, table map[string]map[string]*PingResult, fam kindFamily, tty bool) []string { var buf bytes.Buffer tw := tabwriter.NewWriter(&buf, 0, 0, 2, ' ', 0) @@ -144,7 +190,7 @@ func renderRailGrid(nodes []string, table map[string]map[string]*PingResult, tty for _, src := range nodes { fmt.Fprintf(tw, " %s\t", shortPodName(src)) for _, dst := range nodes { - fmt.Fprintf(tw, "%s\t", cellFor(src, dst, table[src][dst], tty)) + fmt.Fprintf(tw, "%s\t", cellFor(src, dst, table[src][dst], fam, tty)) } fmt.Fprintln(tw) } @@ -158,7 +204,7 @@ func renderRailGrid(nodes []string, table map[string]map[string]*PingResult, tty // asymmetric (srcRail, dstRail) shape neatly. Same node-name fallback // as the per-rail grid: prefer Pod.Spec.NodeName, fall back to the // pod name. -func renderCrossRailList(results []*PingResult, tty bool) []string { +func renderCrossRailList(results []*PingResult, fam kindFamily, tty bool) []string { type row struct { src, dst string r *PingResult @@ -184,7 +230,7 @@ func renderCrossRailList(results []*PingResult, tty bool) []string { left := fmt.Sprintf(" %s [%s]\t→ %s [%s]\t", shortPodName(row.src), row.r.Test.SrcRail, shortPodName(row.dst), row.r.Test.DstRail) - fmt.Fprintf(tw, "%s%s\n", left, cellDetail(row.r, tty)) + fmt.Fprintf(tw, "%s%s\n", left, cellDetail(row.r, fam, tty)) } tw.Flush() return trailingTrim(strings.Split(buf.String(), "\n")) @@ -193,31 +239,24 @@ func renderCrossRailList(results []*PingResult, tty bool) []string { // cellFor renders one src×dst grid cell. self-pairs render as "—", // missing pairs as "·" (the rail set didn't pair these two pods — // rare; e.g. one pod's multus annotation was missing this rail). -func cellFor(src, dst string, r *PingResult, tty bool) string { +func cellFor(src, dst string, r *PingResult, fam kindFamily, tty bool) string { if src == dst { return "—" } if r == nil { return "·" } - return cellDetail(r, tty) + return cellDetail(r, fam, tty) } // cellDetail formats a single result into its terse cell representation -// + optional ANSI color when TTY. -func cellDetail(r *PingResult, tty bool) string { - var body string - switch { - case r.OK: - body = fmt.Sprintf("✓ %d%%", r.PacketLoss) - if r.RTTAvgMs > 0 { - body = fmt.Sprintf("%s %.1fms", body, r.RTTAvgMs) - } - case r.PacketLoss >= 0: - body = fmt.Sprintf("✗ %d%%", r.PacketLoss) - default: - body = "✗ ERR" - } +// + optional ANSI color when TTY. The body shape depends on the +// kind family: +// +// - rping: ✓ / ✗ +// - ib_write_bw: ✓ 194.4 Gbps / ✗ ERR +func cellDetail(r *PingResult, fam kindFamily, tty bool) string { + body := cellBody(r, fam) if tty { if r.OK { return "\033[32m" + body + "\033[0m" @@ -227,6 +266,20 @@ func cellDetail(r *PingResult, tty bool) string { return body } +func cellBody(r *PingResult, fam kindFamily) string { + if fam == familyIbBw { + if r.OK && r.BandwidthGbps > 0 { + return fmt.Sprintf("✓ %.1f Gbps", r.BandwidthGbps) + } + return "✗ ERR" + } + // rping + if r.OK { + return "✓" + } + return "✗" +} + // shortPodName trims a long pod name to keep grid columns from blowing // out — DaemonSet pods often have a 5-char random suffix after a long // app name. We keep the leading portion + the last 5 chars (the hash) diff --git a/pkg/networkoperatorplugin/connectivity/text_report_test.go b/pkg/networkoperatorplugin/connectivity/text_report_test.go index f3df1a3..7c34a92 100644 --- a/pkg/networkoperatorplugin/connectivity/text_report_test.go +++ b/pkg/networkoperatorplugin/connectivity/text_report_test.go @@ -30,58 +30,75 @@ import ( // the order it was emitted so tests can assert on exact lines. type captureOutput struct{ lines []string } -func (c *captureOutput) Info(format string, args ...interface{}) { c.lines = append(c.lines, fmt.Sprintf(format, args...)) } -func (c *captureOutput) Success(format string, args ...interface{}) { c.lines = append(c.lines, "SUCCESS: "+fmt.Sprintf(format, args...)) } -func (c *captureOutput) Warning(format string, args ...interface{}) { c.lines = append(c.lines, "WARNING: "+fmt.Sprintf(format, args...)) } -func (c *captureOutput) Error(format string, args ...interface{}) { c.lines = append(c.lines, "ERROR: "+fmt.Sprintf(format, args...)) } -func (c *captureOutput) StartProgress(message string) ui.Progress { return &captureProgress{out: c, msg: message} } -func (c *captureOutput) Header(text string) {} -func (c *captureOutput) Section(text string) { c.lines = append(c.lines, "SECTION: "+text) } -func (c *captureOutput) Confirm(string) (bool, error) { return true, nil } -func (c *captureOutput) IsTTY() bool { return false } +func (c *captureOutput) Info(format string, args ...interface{}) { + c.lines = append(c.lines, fmt.Sprintf(format, args...)) +} +func (c *captureOutput) Success(format string, args ...interface{}) { + c.lines = append(c.lines, "SUCCESS: "+fmt.Sprintf(format, args...)) +} +func (c *captureOutput) Warning(format string, args ...interface{}) { + c.lines = append(c.lines, "WARNING: "+fmt.Sprintf(format, args...)) +} +func (c *captureOutput) Error(format string, args ...interface{}) { + c.lines = append(c.lines, "ERROR: "+fmt.Sprintf(format, args...)) +} +func (c *captureOutput) StartProgress(message string) ui.Progress { + return &captureProgress{out: c, msg: message} +} +func (c *captureOutput) Header(text string) {} +func (c *captureOutput) Section(text string) { c.lines = append(c.lines, "SECTION: "+text) } +func (c *captureOutput) Confirm(string) (bool, error) { return true, nil } +func (c *captureOutput) IsTTY() bool { return false } type captureProgress struct { out *captureOutput msg string } -func (p *captureProgress) Update(string) {} -func (p *captureProgress) Success(m string) { p.out.lines = append(p.out.lines, "PROGRESS_OK: "+m) } -func (p *captureProgress) Fail(m string) { p.out.lines = append(p.out.lines, "PROGRESS_FAIL: "+m) } +func (p *captureProgress) Update(string) {} +func (p *captureProgress) Success(m string) { p.out.lines = append(p.out.lines, "PROGRESS_OK: "+m) } +func (p *captureProgress) Fail(m string) { p.out.lines = append(p.out.lines, "PROGRESS_FAIL: "+m) } -// Helpers synthesize results with both pod and node names so the -// renderer's node-label preference is exercised. +// Helpers synthesize RDMA results so the renderer's per-(rail, family) +// grouping and node-label fallback are exercised. -func successResult(src, dst, rail, srcIP, dstIP string, lossPercent int, rttMs float64) PingResult { +func rpingResult(src, dst, rail string, ok bool) PingResult { return PingResult{ Test: PingTest{ - Kind: PingSameRail, + Kind: RDMAPingSameRail, SrcPod: src + "-pod", DstPod: dst + "-pod", SrcNode: src, DstNode: dst, Rail: rail, - SrcIP: srcIP, - DstIP: dstIP, SrcRail: rail, DstRail: rail, }, - OK: lossPercent < 100, - PacketLoss: lossPercent, - RTTAvgMs: rttMs, + OK: ok, } } -func failResult(src, dst, rail string, lossPercent int) PingResult { - r := successResult(src, dst, rail, "", "", lossPercent, 0) - r.OK = false - return r +func ibBwResult(src, dst, rail string, ok bool, bwGbps float64) PingResult { + return PingResult{ + Test: PingTest{ + Kind: RDMABwSameRail, + SrcPod: src + "-pod", + DstPod: dst + "-pod", + SrcNode: src, + DstNode: dst, + Rail: rail, + SrcRail: rail, + DstRail: rail, + }, + OK: ok, + BandwidthGbps: bwGbps, + } } -func crossResult(src, dst, srcRail, dstRail string, ok bool) PingResult { +func crossRpingResult(src, dst, srcRail, dstRail string, ok bool) PingResult { return PingResult{ Test: PingTest{ - Kind: PingCrossRail, + Kind: RDMAPingCrossRail, SrcPod: src + "-pod", DstPod: dst + "-pod", SrcNode: src, @@ -90,23 +107,24 @@ func crossResult(src, dst, srcRail, dstRail string, ok bool) PingResult { SrcRail: srcRail, DstRail: dstRail, }, - OK: ok, - PacketLoss: 0, + OK: ok, } } func TestRenderMatrixText_RailGridAndCrossRail(t *testing.T) { result := &MatrixResult{ PingResults: []PingResult{ - // Rail rail-0: 2 nodes, all green - successResult("worker-1", "worker-2", "rail-0", "10.0.0.1", "10.0.0.2", 0, 0.5), - successResult("worker-2", "worker-1", "rail-0", "10.0.0.2", "10.0.0.1", 0, 0.6), - // Rail rail-1: one direction fails - successResult("worker-1", "worker-2", "rail-1", "10.0.1.1", "10.0.1.2", 0, 0.7), - failResult("worker-2", "worker-1", "rail-1", 100), + // Rail rail-0: 2 nodes, both rping pass + rpingResult("worker-1", "worker-2", "rail-0", true), + rpingResult("worker-2", "worker-1", "rail-0", true), + ibBwResult("worker-1", "worker-2", "rail-0", true, 194.4), + ibBwResult("worker-2", "worker-1", "rail-0", true, 193.8), + // Rail rail-1: one rping direction fails + rpingResult("worker-1", "worker-2", "rail-1", true), + rpingResult("worker-2", "worker-1", "rail-1", false), // Cross-rail canaries - crossResult("worker-1", "worker-2", "rail-0", "rail-1", true), - crossResult("worker-2", "worker-1", "rail-0", "rail-1", false), + crossRpingResult("worker-1", "worker-2", "rail-0", "rail-1", true), + crossRpingResult("worker-2", "worker-1", "rail-0", "rail-1", false), }, } @@ -115,21 +133,25 @@ func TestRenderMatrixText_RailGridAndCrossRail(t *testing.T) { joined := strings.Join(out.lines, "\n") - // Sanity: both rails rendered. - assert.Contains(t, joined, "Rail rail-0:") - assert.Contains(t, joined, "Rail rail-1:") + // Sanity: both rails rendered. The kind family appears in the + // header so we look for the "Rail — RDMA ping" prefix. + assert.Contains(t, joined, "Rail rail-0 — RDMA ping (rping):") + assert.Contains(t, joined, "Rail rail-1 — RDMA ping (rping):") + assert.Contains(t, joined, "Rail rail-0 — RDMA bandwidth (ib_write_bw):") // Cross-rail section rendered. - assert.Contains(t, joined, "Cross-rail canary:") + assert.Contains(t, joined, "Cross-rail canary — RDMA ping (rping):") // Header row of the grid. assert.Contains(t, joined, "src \\ dst") // Axis labels must be node names, NOT pod names. assert.Contains(t, joined, "worker-1") assert.Contains(t, joined, "worker-2") assert.NotContains(t, joined, "worker-1-pod") - // Sample cells: green ✓ in non-TTY mode is just plain text. - assert.Contains(t, joined, "✓ 0% 0.5ms") - // Failure cell shows ✗ with packet loss. - assert.Contains(t, joined, "✗ 100%") + // Sample cells: rping ✓ in non-TTY mode is just plain text. + assert.Contains(t, joined, "✓") + // ib_write_bw cell shows formatted Gbps. + assert.Contains(t, joined, "194.4 Gbps") + // Failure cell shows ✗ for rping. + assert.Contains(t, joined, "✗") // Self-pairs are dashes. selfDash := 0 for _, l := range out.lines { @@ -169,27 +191,46 @@ func TestShortPodName(t *testing.T) { func TestCellFor(t *testing.T) { t.Run("self pair is dash", func(t *testing.T) { - assert.Equal(t, "—", cellFor("pod-a", "pod-a", nil, false)) + assert.Equal(t, "—", cellFor("pod-a", "pod-a", nil, familyRPing, false)) }) t.Run("missing result is bullet", func(t *testing.T) { - assert.Equal(t, "·", cellFor("pod-a", "pod-b", nil, false)) + assert.Equal(t, "·", cellFor("pod-a", "pod-b", nil, familyRPing, false)) + }) + t.Run("rping OK is bare checkmark", func(t *testing.T) { + r := PingResult{OK: true} + assert.Equal(t, "✓", cellFor("a", "b", &r, familyRPing, false)) + }) + t.Run("rping fail is bare X", func(t *testing.T) { + r := PingResult{OK: false} + assert.Equal(t, "✗", cellFor("a", "b", &r, familyRPing, false)) }) - t.Run("OK result formats with loss + rtt", func(t *testing.T) { - r := PingResult{OK: true, PacketLoss: 0, RTTAvgMs: 1.23} - assert.Equal(t, "✓ 0% 1.2ms", cellFor("a", "b", &r, false)) + t.Run("ib_write_bw OK shows Gbps", func(t *testing.T) { + r := PingResult{OK: true, BandwidthGbps: 194.39} + assert.Equal(t, "✓ 194.4 Gbps", cellFor("a", "b", &r, familyIbBw, false)) }) - t.Run("partial loss is failure", func(t *testing.T) { - r := PingResult{OK: false, PacketLoss: 33} - assert.Equal(t, "✗ 33%", cellFor("a", "b", &r, false)) + t.Run("ib_write_bw fail with no bandwidth is ERR", func(t *testing.T) { + r := PingResult{OK: false} + assert.Equal(t, "✗ ERR", cellFor("a", "b", &r, familyIbBw, false)) }) - t.Run("exec error has no loss reading", func(t *testing.T) { - r := PingResult{OK: false, PacketLoss: -1} - assert.Equal(t, "✗ ERR", cellFor("a", "b", &r, false)) + t.Run("ib_write_bw OK but zero bandwidth is ERR", func(t *testing.T) { + // Defensive: OK=true with zero bandwidth shouldn't happen + // in practice, but if it does the cell renders as ERR + // since a "0 Gbps" link is broken even if perftest exited + // cleanly. + r := PingResult{OK: true, BandwidthGbps: 0} + assert.Equal(t, "✗ ERR", cellFor("a", "b", &r, familyIbBw, false)) }) - t.Run("TTY mode wraps in ANSI green", func(t *testing.T) { - r := PingResult{OK: true, PacketLoss: 0} - got := cellFor("a", "b", &r, true) + t.Run("TTY mode wraps in ANSI green for OK", func(t *testing.T) { + r := PingResult{OK: true} + got := cellFor("a", "b", &r, familyRPing, true) assert.Contains(t, got, "\033[32m") assert.Contains(t, got, "\033[0m") }) } + +func TestKindFamilyOf(t *testing.T) { + assert.Equal(t, familyRPing, kindFamilyOf(RDMAPingSameRail)) + assert.Equal(t, familyRPing, kindFamilyOf(RDMAPingCrossRail)) + assert.Equal(t, familyIbBw, kindFamilyOf(RDMABwSameRail)) + assert.Equal(t, familyIbBw, kindFamilyOf(RDMABwCrossRail)) +} diff --git a/pkg/networkoperatorplugin/crstate/registry.go b/pkg/networkoperatorplugin/crstate/registry.go index 027c5aa..7c157e0 100644 --- a/pkg/networkoperatorplugin/crstate/registry.go +++ b/pkg/networkoperatorplugin/crstate/registry.go @@ -66,3 +66,45 @@ func (r *Registry) Validate(ctx context.Context, c client.Client, obj *unstructu } return DefaultExistenceValidator(ctx, c, obj) } + +// NeedsObservationGate reports whether a CR's authoritative status +// lives on the object itself (vs on a companion CR). The deploy +// state machine uses this to decide whether the post-apply +// resourceVersion bump is a meaningful "controller has reacted" +// signal: +// +// - true — validator reads `obj.status.*` directly (NCP/NNP/the +// three Mellanox Network Kinds, SpectrumXRailPoolConfig +// v1alpha2). Until the controller writes status back the RV +// stays at the apply-time value, and the validator would return +// stale data from the previous reconcile. Gate the verdict +// until live RV moves past apply-time RV. +// +// - false — validator reads companion CRs whose lifecycle is +// independent of the apply (SriovNetworkNodePolicy → +// SriovNetworkNodeState per node; NicInterfaceNameTemplate / +// NicConfigurationTemplate → NicDevice per device). The +// companion's RV evolves on its own schedule and the +// SriovNetworkNodePolicy itself never gets a status write, so +// gating on its RV would block forever. Validators in this +// bucket get to give an immediate verdict — staleness is their +// own problem to detect (and the SR-IOV validator does, by +// bucketing the "Succeeded but numVfs not at target" case as +// in-progress per the SR-IOV soft-progress rule). +// +// IPPool / CIDRPool / SriovNetwork / SriovIBNetwork / OVSNetwork +// fall through to the default existence-only validator and also +// don't need the gate — there's no status to read at all. +func NeedsObservationGate(gvk schema.GroupVersionKind) bool { + if gvk.Group == "mellanox.com" && gvk.Version == "v1alpha1" { + switch gvk.Kind { + case "NicClusterPolicy", "NicNodePolicy", + "HostDeviceNetwork", "IPoIBNetwork", "MacvlanNetwork": + return true + } + } + if gvk.Group == spcxGroup && gvk.Version == spcxVersionAlpha2 && gvk.Kind == spcxKindRailPoolConfig { + return true + } + return false +} diff --git a/pkg/networkoperatorplugin/crstate/registry_test.go b/pkg/networkoperatorplugin/crstate/registry_test.go index 160931f..b8f4514 100644 --- a/pkg/networkoperatorplugin/crstate/registry_test.go +++ b/pkg/networkoperatorplugin/crstate/registry_test.go @@ -190,6 +190,42 @@ func TestStatusStringValidator_RegisteredKinds(t *testing.T) { } } +func TestNeedsObservationGate(t *testing.T) { + cases := []struct { + gvk schema.GroupVersionKind + want bool + name string + }{ + // Kinds whose validators read .status on the CR itself — + // gate is meaningful, controller's RV bump is the signal. + {schema.GroupVersionKind{Group: "mellanox.com", Version: "v1alpha1", Kind: "NicClusterPolicy"}, true, "NicClusterPolicy"}, + {schema.GroupVersionKind{Group: "mellanox.com", Version: "v1alpha1", Kind: "NicNodePolicy"}, true, "NicNodePolicy"}, + {schema.GroupVersionKind{Group: "mellanox.com", Version: "v1alpha1", Kind: "HostDeviceNetwork"}, true, "HostDeviceNetwork"}, + {schema.GroupVersionKind{Group: "mellanox.com", Version: "v1alpha1", Kind: "IPoIBNetwork"}, true, "IPoIBNetwork"}, + {schema.GroupVersionKind{Group: "mellanox.com", Version: "v1alpha1", Kind: "MacvlanNetwork"}, true, "MacvlanNetwork"}, + {schema.GroupVersionKind{Group: "spectrumx.nvidia.com", Version: "v1alpha2", Kind: "SpectrumXRailPoolConfig"}, true, "SpectrumXRailPoolConfig"}, + + // Kinds whose validators read companion CRs — gate would + // block forever waiting for an RV bump that never lands. + {schema.GroupVersionKind{Group: "sriovnetwork.openshift.io", Version: "v1", Kind: "SriovNetworkNodePolicy"}, false, "SriovNetworkNodePolicy (companion = SriovNetworkNodeState)"}, + {schema.GroupVersionKind{Group: "configuration.net.nvidia.com", Version: "v1alpha1", Kind: "NicInterfaceNameTemplate"}, false, "NicInterfaceNameTemplate (companion = NicDevice)"}, + {schema.GroupVersionKind{Group: "configuration.net.nvidia.com", Version: "v1alpha1", Kind: "NicConfigurationTemplate"}, false, "NicConfigurationTemplate (companion = NicDevice)"}, + + // Existence-only Kinds — no status at all, gate irrelevant. + {schema.GroupVersionKind{Group: "nv-ipam.nvidia.com", Version: "v1alpha1", Kind: "IPPool"}, false, "IPPool"}, + {schema.GroupVersionKind{Group: "sriovnetwork.openshift.io", Version: "v1", Kind: "SriovNetwork"}, false, "SriovNetwork"}, + + // Wrong group / version — never gate. + {schema.GroupVersionKind{Group: "mellanox.com", Version: "v1beta1", Kind: "NicClusterPolicy"}, false, "wrong version"}, + {schema.GroupVersionKind{Group: "other.example.com", Version: "v1", Kind: "NicClusterPolicy"}, false, "wrong group"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, NeedsObservationGate(tc.gvk)) + }) + } +} + // node returns a labelled *corev1.Node for fake-client seeding. func node(name string, labels map[string]string) *corev1.Node { n := &corev1.Node{} diff --git a/pkg/networkoperatorplugin/deploy.go b/pkg/networkoperatorplugin/deploy.go index 37a181a..3b6374e 100644 --- a/pkg/networkoperatorplugin/deploy.go +++ b/pkg/networkoperatorplugin/deploy.go @@ -191,7 +191,7 @@ func ApplyManifestsFromDir(ctx context.Context, kubeClient client.Client, manife } am := appliedManifest{obj: obj} - if obj.GetGeneration() > preApplyGen { + if obj.GetGeneration() > preApplyGen && crstate.NeedsObservationGate(obj.GroupVersionKind()) { am.awaitObservationAfterRV = obj.GetResourceVersion() } appliedOthers = append(appliedOthers, am) @@ -331,11 +331,12 @@ func applyAndWait(ctx context.Context, c client.Client, registry *crstate.Regist // applyUnstructured does an SSA Patch that returns the // server-decided object on `obj`. Its current resourceVersion - // is "the version right after our apply". If the spec changed, - // any subsequent live RV != this value means the controller - // has written status since. + // is "the version right after our apply". If the spec changed + // AND this Kind's validator reads .status from the CR itself + // (vs from companion CRs), any subsequent live RV != this + // value means the controller has written status since. awaitObservationAfterRV := "" - if obj.GetGeneration() > preApplyGen { + if obj.GetGeneration() > preApplyGen && crstate.NeedsObservationGate(obj.GroupVersionKind()) { awaitObservationAfterRV = obj.GetResourceVersion() log.Log.V(1).Info("Spec changed; gating poll on controller observation", "kind", obj.GetKind(), "name", obj.GetName(), diff --git a/pkg/networkoperatorplugin/discovery.go b/pkg/networkoperatorplugin/discovery.go index 7ce73c6..04591a4 100644 --- a/pkg/networkoperatorplugin/discovery.go +++ b/pkg/networkoperatorplugin/discovery.go @@ -22,7 +22,6 @@ import ( "fmt" "slices" "sort" - "strconv" "strings" "time" @@ -32,6 +31,7 @@ import ( "github.com/nvidia/k8s-launch-kit/pkg/config" "github.com/nvidia/k8s-launch-kit/pkg/kubeclient" "github.com/nvidia/k8s-launch-kit/pkg/networkoperatorplugin/internal/pciids" + "github.com/nvidia/k8s-launch-kit/pkg/presetmatch" "github.com/nvidia/k8s-launch-kit/pkg/presets" "github.com/nvidia/k8s-launch-kit/pkg/ui" corev1 "k8s.io/api/core/v1" @@ -295,55 +295,44 @@ func (p *NetworkOperatorPlugin) DiscoverClusterConfig(ctx context.Context, c cli discoverGroupFabric(ctx, p.RESTConfig, defaultConfig.NetworkOperator.Namespace, group, dsPods) - // Try to enrich with a predefined topology preset for this (machine, - // GPU) pair. Presets provide authoritative traffic classification, - // rail assignments, and NUMA/GPU topology metadata for known - // hardware configurations. Lookup is exact-match on (machineType, + // Try to enrich with a predefined topology preset for this + // (machine, GPU) pair. presetmatch.MatchGroup runs the + // shared lookup + deviation comparison (also used by + // `l8k validate`); the discovery path then additionally + // applies the preset onto the group so rail/NUMA topology + // fields populate. Lookup is exact-match on (machineType, // gpuType) — both must be known for a preset to apply. - if group.MachineType != "" && group.GPUType != "" { - log.Log.V(1).Info("Looking up preset by (machineType, gpuType)", - "group", group.Identifier, - "machineType", group.MachineType, - "gpuType", group.GPUType) - preset, presetErr := presets.LoadPreset(group.MachineType, group.GPUType) - if presetErr != nil { - log.Log.Error(presetErr, "failed to load preset", - "machineType", group.MachineType, "gpuType", group.GPUType) - uiOutput.Warning("Failed to load preset for %s/%s: %v", - group.MachineType, group.GPUType, presetErr) - } else if preset == nil { - log.Log.V(1).Info("No preset matched (machineType, gpuType)", - "group", group.Identifier, - "machineType", group.MachineType, - "gpuType", group.GPUType) - } else { - // Always apply the matched preset on a best-effort basis. - // Any discrepancies (PF count, PCI address drift, - // device-ID drift) are recorded as soft deviations and - // re-warned about on every subsequent config load. - deviations := presets.ValidatePreset(preset, group.PFs) + matchResult := presetmatch.MatchGroup(*group) + log.Log.V(1).Info("Preset match", + "group", group.Identifier, + "machineType", group.MachineType, + "gpuType", group.GPUType, + "status", string(matchResult.Status), + "presetName", matchResult.PresetName, + "deviationCount", len(matchResult.Deviations)) + switch matchResult.Status { + case presetmatch.StatusMatch, presetmatch.StatusDeviation: + // LoadPreset was successful — load it again to get + // the Topology so ApplyPreset can enrich the group. + // (MatchGroup intentionally doesn't mutate.) + if preset, err := presets.LoadPreset(group.MachineType, group.GPUType); err == nil && preset != nil { presets.ApplyPreset(preset, group) - log.Log.V(1).Info("Preset matched and applied", - "group", group.Identifier, - "machineType", group.MachineType, - "gpuType", group.GPUType, - "presetPFCount", len(preset.PFs), - "discoveredPFCount", len(group.PFs), - "deviationCount", len(deviations)) - if len(deviations) > 0 { - group.PresetDeviation = deviations - log.Log.Info("Preset applied with deviations from matched preset", - "group", group.Identifier, - "machineType", group.MachineType, - "gpuType", group.GPUType, - "deviationCount", len(deviations)) - uiOutput.Warning( - "Preset for %s/%s applied with %d deviation(s) from the matched preset. The deployment is not certified — see 'presetDeviation' in cluster-config.yaml.", - group.MachineType, group.GPUType, len(deviations)) - } else { - uiOutput.Info("Applied preset configuration for %s", group.MachineType) - } } + if matchResult.Status == presetmatch.StatusDeviation { + group.PresetDeviation = matchResult.Deviations + uiOutput.Warning( + "Preset for %s/%s applied with %d deviation(s) from the matched preset. The deployment is not certified — see 'presetDeviation' in cluster-config.yaml.", + group.MachineType, group.GPUType, len(matchResult.Deviations)) + } else { + uiOutput.Info("Applied preset configuration for %s", group.MachineType) + } + case presetmatch.StatusNotFound: + // No catalog entry — discovery continues without + // preset enrichment. Logged at V(1) only; not a + // user-actionable warning. + case presetmatch.StatusSkipped: + // machineType / gpuType wasn't discovered. Discovery + // already logs this via the hardware-type probes. } modules, err := discoverThirdPartyRDMAModules(ctx, p.RESTConfig, @@ -1360,16 +1349,21 @@ func truncateForLog(s string, maxLen int) string { // discoverGroupFabric probes the InfiniBand sysfs entries on a representative // daemon pod for every east-west PF in `group` that has an RdmaDevice and, -// when the per-port verdicts unanimously agree on a confirmed value, sets +// when the per-port verdicts unanimously agree on a value, sets // `group.LinkType`. Otherwise the field is left empty — discovery couldn't -// prove the cluster is using a specific fabric, and downstream code treats -// absence as "unknown". +// determine the fabric type, and downstream code treats absence as +// "unknown". // -// "Confirmed" means the port is ACTIVE and (for InfiniBand) a subnet -// manager is present (sm_lid != 0). Anything else — port down, IB without -// SM, malformed sysfs output — yields no contribution to the group's -// verdict. Reading link_layer alone would be unreliable: that file just -// reflects firmware config and may be a default the cluster doesn't use. +// The verdict is the port's configured `link_layer` (sysfs file) — what +// the firmware says the port is wired for, regardless of whether the +// netdev is currently up. We deliberately ignore `state` (ACTIVE vs +// DOWN) and `sm_lid` (IB subnet manager presence) here: requiring an +// active link broke discovery on freshly-provisioned clusters where +// the switch wasn't yet plugged in, and the configured link_layer is +// what every downstream template needs anyway. An operator who +// reflashes a card to a different fabric needs to re-run discover, but +// that's the only failure mode we accept in exchange for the +// "discover before the cluster is wired up" win. // // Multi-node groups whose RdmaDevice is empty (per the existing // per-node-vs-group safety rule) skip the probe — there's no ibdev name @@ -1391,7 +1385,7 @@ func discoverGroupFabric(ctx context.Context, restConfig *rest.Config, containerName = targetPod.Spec.Containers[0].Name } - verdicts := map[string]int{} // confirmed linkType -> count + verdicts := map[string]int{} // linkType -> count of contributing PFs probed := 0 for _, pf := range group.PFs { if pf.Traffic != "east-west" || pf.RdmaDevice == "" { @@ -1429,7 +1423,7 @@ func discoverGroupFabric(ctx context.Context, restConfig *rest.Config, "group", group.Identifier, "linkType", k, "probedPFs", probed, - "confirmedPFs", verdicts[k]) + "contributingPFs", verdicts[k]) } case len(verdicts) > 1: log.Log.V(1).Info("Group fabric ambiguous (probes disagree); leaving linkType unset", @@ -1437,126 +1431,65 @@ func discoverGroupFabric(ctx context.Context, restConfig *rest.Config, "probedPFs", probed, "verdicts", verdicts) default: - log.Log.V(1).Info("Group fabric unconfirmed (no port produced a confirmed verdict); leaving linkType unset", + log.Log.V(1).Info("Group fabric unresolved (no port reported a recognised link_layer); leaving linkType unset", "group", group.Identifier, "probedPFs", probed) } } // discoverPortFabric reads -// /sys/class/infiniband//ports//{state,phys_state,link_layer,sm_lid} -// inside the daemon pod via a single shell exec and returns the confirmed -// fabric for that port (empty when the port could not produce a confirmed -// verdict). rawSummary is a short human-readable joined version of the -// four sysfs values for debug logs. +// /sys/class/infiniband//ports//link_layer inside the +// daemon pod and returns the configured fabric for that port — +// "Ethernet", "InfiniBand", or "" when the file is empty / unreadable / +// unrecognised. The port's runtime state (ACTIVE / DOWN) is +// intentionally NOT consulted: discovery has to work on freshly +// provisioned clusters where the switch isn't yet plugged in. // // Tries `/sys/class/infiniband/...` first (works when the daemon pod // shares host pid+net namespace and exposes the host sysfs at /sys), // then falls back to `/host/sys/class/infiniband/...` for daemons that // mount the host filesystem under /host (matches consts.HostPath = -// "/host" used by the rest of nic-configuration-operator). Stderr is -// captured rather than swallowed so a failed read surfaces in the -// debug log instead of producing a silent empty verdict. +// "/host" used by the rest of nic-configuration-operator). The first +// path that yields a recognised link_layer wins. func discoverPortFabric(ctx context.Context, restConfig *rest.Config, namespace, podName, containerName, rdmaDevice string, port int) (string, string, error) { + var lastErr error for _, base := range []string{ - fmt.Sprintf("/sys/class/infiniband/%s/ports/%d", rdmaDevice, port), - fmt.Sprintf("/host/sys/class/infiniband/%s/ports/%d", rdmaDevice, port), + fmt.Sprintf("/sys/class/infiniband/%s/ports/%d/link_layer", rdmaDevice, port), + fmt.Sprintf("/host/sys/class/infiniband/%s/ports/%d/link_layer", rdmaDevice, port), } { - cmd := fmt.Sprintf( - "echo state=$(cat %s/state); "+ - "echo phys_state=$(cat %s/phys_state); "+ - "echo link_layer=$(cat %s/link_layer); "+ - "echo sm_lid=$(cat %s/sm_lid)", - base, base, base, base) + cmd := fmt.Sprintf("cat %s", base) output, err := execInPod(ctx, restConfig, namespace, podName, containerName, []string{"/bin/sh", "-c", cmd}) - // Even on exec error we attempt to parse — `cat` returns - // non-zero for missing files but the SPDY executor still - // captures the stdout from earlier successful echoes. + if err != nil { + lastErr = err + log.Log.V(1).Info("Fabric port probe: read failed at this base", + "rdmaDevice", rdmaDevice, "port", port, "base", base, + "execErr", err.Error()) + continue + } linkType, raw := parsePortFabricVerdict(output) if linkType != "" { return linkType, raw, nil } - // First-path miss: log the raw read so an operator can see - // what came back, then try the next base path. We only - // surface the final error (if any) to the caller. - log.Log.V(1).Info("Fabric port probe: empty/unconfirmed at this base", + log.Log.V(1).Info("Fabric port probe: link_layer at this base not recognised", "rdmaDevice", rdmaDevice, "port", port, "base", base, - "raw", raw, "execErr", errString(err)) - if err == nil && raw != "" { - // Got a clean read that simply didn't meet the - // confirmation criteria (port DOWN, IB without - // SM, etc.). No point trying the other base — - // the port really isn't in a usable state. - return "", raw, nil - } + "raw", raw) } - return "", "", fmt.Errorf("no readable sysfs at either /sys/class/infiniband/%s or /host/sys/class/infiniband/%s", - rdmaDevice, rdmaDevice) -} - -// errString returns err.Error() or "" when err is nil — used in the -// V(1) probe log so we don't get the literal "" sentinel. -func errString(err error) string { - if err == nil { - return "" + if lastErr != nil { + return "", "", lastErr } - return err.Error() + return "", "", nil } -// parsePortFabricVerdict converts the four-line "key=value" output of the -// sysfs probe into a confirmed fabric verdict (or empty when no -// confirmation is possible). -// -// Confirmation rule: -// - Active + InfiniBand + sm_lid != 0 → "InfiniBand". -// - Active + Ethernet → "Ethernet". -// - Anything else → "" (no confirmation; caller -// leaves group.LinkType unset). -// -// Active means the state file matches "ACTIVE" (case-insensitive); the -// kernel formats it as "4: ACTIVE", "1: DOWN", etc. +// parsePortFabricVerdict normalises a sysfs `link_layer` read into the +// l8k vocabulary ("Ethernet" / "InfiniBand"). The output may be the +// raw file content ("Ethernet\n"), a `cat`'s output with possible +// trailing newline, or empty when the file didn't exist. raw is the +// trimmed input echoed back for debug-log breadcrumbs. func parsePortFabricVerdict(output string) (linkType, raw string) { - fields := map[string]string{} - for _, line := range strings.Split(output, "\n") { - eq := strings.IndexByte(line, '=') - if eq < 0 { - continue - } - fields[strings.TrimSpace(line[:eq])] = strings.TrimSpace(line[eq+1:]) - } - state := fields["state"] - linkLayer := normalizeLinkLayer(fields["link_layer"]) - smLid := fields["sm_lid"] - - raw = fmt.Sprintf("state=%q phys_state=%q link_layer=%q sm_lid=%q", - state, fields["phys_state"], fields["link_layer"], smLid) - - active := strings.Contains(strings.ToUpper(state), "ACTIVE") - hasSM := smLidIsNonZero(smLid) - - switch { - case active && linkLayer == "InfiniBand" && hasSM: - return "InfiniBand", raw - case active && linkLayer == "Ethernet": - return "Ethernet", raw - default: - return "", raw - } -} - -// smLidIsNonZero parses a sysfs `sm_lid` value (e.g. "0", "0x0", "0x0000", -// "0x0001") as an unsigned integer and returns true when the value is -// strictly greater than zero. Kernel versions disagree on the format — -// some emit decimal, some emit hex — so we accept both via auto-base -// (base=0 in strconv.ParseUint). -func smLidIsNonZero(s string) bool { - v, err := strconv.ParseUint(strings.TrimSpace(s), 0, 32) - if err != nil { - return false - } - return v != 0 + raw = strings.TrimSpace(output) + return normalizeLinkLayer(raw), raw } // normalizeLinkLayer canonicalises sysfs link_layer strings to the YAML diff --git a/pkg/networkoperatorplugin/discovery_test.go b/pkg/networkoperatorplugin/discovery_test.go index 305c678..449c162 100644 --- a/pkg/networkoperatorplugin/discovery_test.go +++ b/pkg/networkoperatorplugin/discovery_test.go @@ -432,46 +432,43 @@ func TestKnownStorageModules_MatchesMofedmodules(t *testing.T) { // --- parsePortFabricVerdict tests --- -func TestParsePortFabricVerdict_ConfirmedInfiniBand(t *testing.T) { - out := "state=4: ACTIVE\nphys_state=5: LinkUp\nlink_layer=InfiniBand\nsm_lid=0x0001\n" - linkType, raw := parsePortFabricVerdict(out) +func TestParsePortFabricVerdict_InfiniBand(t *testing.T) { + linkType, raw := parsePortFabricVerdict("InfiniBand\n") assert.Equal(t, "InfiniBand", linkType) - assert.Contains(t, raw, "sm_lid=\"0x0001\"") + assert.Equal(t, "InfiniBand", raw) } -func TestParsePortFabricVerdict_ConfirmedEthernet(t *testing.T) { - // Ethernet ports don't need an SM; ACTIVE + Ethernet alone is enough. - out := "state=4: ACTIVE\nphys_state=5: LinkUp\nlink_layer=Ethernet\nsm_lid=0x0000\n" - linkType, _ := parsePortFabricVerdict(out) +func TestParsePortFabricVerdict_Ethernet(t *testing.T) { + linkType, raw := parsePortFabricVerdict("Ethernet\n") assert.Equal(t, "Ethernet", linkType) + assert.Equal(t, "Ethernet", raw) } -func TestParsePortFabricVerdict_UnverifiedIB_NoSM_ReturnsEmpty(t *testing.T) { - // Active IB port without a subnet manager — we can't confirm the - // cluster is using IB. Verdict is empty (caller leaves group.LinkType - // unset). - out := "state=4: ACTIVE\nphys_state=5: LinkUp\nlink_layer=InfiniBand\nsm_lid=0x0000\n" - linkType, _ := parsePortFabricVerdict(out) - assert.Equal(t, "", linkType) +func TestParsePortFabricVerdict_DownPortStillResolvesByLinkLayer(t *testing.T) { + // Old behaviour required ACTIVE state; the new probe reads + // only the configured link_layer file, which gives a verdict + // regardless of runtime state. This is what unblocks + // discovery on freshly provisioned clusters where the switch + // isn't plugged in yet. + linkType, _ := parsePortFabricVerdict("Ethernet\n") + assert.Equal(t, "Ethernet", linkType) } -func TestParsePortFabricVerdict_DownPort_ReturnsEmpty(t *testing.T) { - // Port not ACTIVE — no confirmation possible. - out := "state=1: DOWN\nphys_state=3: Disabled\nlink_layer=Ethernet\nsm_lid=0x0000\n" - linkType, _ := parsePortFabricVerdict(out) +func TestParsePortFabricVerdict_EmptyOutput(t *testing.T) { + linkType, raw := parsePortFabricVerdict("") assert.Equal(t, "", linkType) + assert.Equal(t, "", raw) } -func TestParsePortFabricVerdict_EmptyOutput(t *testing.T) { - linkType, _ := parsePortFabricVerdict("") +func TestParsePortFabricVerdict_UnrecognisedValue(t *testing.T) { + linkType, raw := parsePortFabricVerdict("Foo\n") assert.Equal(t, "", linkType) + assert.Equal(t, "Foo", raw) } -func TestParsePortFabricVerdict_PartialOutput(t *testing.T) { - // link_layer line missing — no verdict. - out := "state=4: ACTIVE\nphys_state=5: LinkUp\nsm_lid=0x0001\n" - linkType, _ := parsePortFabricVerdict(out) - assert.Equal(t, "", linkType) +func TestParsePortFabricVerdict_TrimsWhitespace(t *testing.T) { + linkType, _ := parsePortFabricVerdict(" Ethernet \n") + assert.Equal(t, "Ethernet", linkType) } func TestNormalizeLinkLayer(t *testing.T) { diff --git a/pkg/networkoperatorplugin/validate.go b/pkg/networkoperatorplugin/validate.go index d13f26f..ad2d604 100644 --- a/pkg/networkoperatorplugin/validate.go +++ b/pkg/networkoperatorplugin/validate.go @@ -62,7 +62,7 @@ type ValidationResult struct { Details map[string]string // LiveYAML is the cluster's view of the object, marshalled - // back to YAML for the verify-report's expandable "Live YAML" + // back to YAML for the validation report's expandable "Live YAML" // dropdown. Empty when the object isn't present // (StateNotDeployed) or when the post-validate fetch failed. // Managed-fields / status are kept; we want the operator to diff --git a/pkg/presetmatch/presetmatch.go b/pkg/presetmatch/presetmatch.go new file mode 100644 index 0000000..3f204f5 --- /dev/null +++ b/pkg/presetmatch/presetmatch.go @@ -0,0 +1,176 @@ +// Copyright 2026 NVIDIA CORPORATION & AFFILIATES +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +// Package presetmatch is the single entry point for comparing a +// discovered cluster group against the topology-preset catalog. +// +// Both `l8k discover` (at discovery time, when preset data is also +// applied into the group via presets.ApplyPreset) and `l8k validate` +// (at validation time, to confirm that the cluster's recorded hardware +// still matches the certified preset) call MatchGroup / MatchAll +// here. Keeping the lookup logic in one place means new lookup +// behaviour (fuzzy matching, preset-version awareness, …) lands in +// one file and both call sites pick it up. +// +// MatchGroup does not mutate the input group. Callers that want to +// also enrich the group's transient topology fields call +// presets.ApplyPreset themselves (typically only the discovery path +// does that). +package presetmatch + +import ( + "fmt" + + "github.com/nvidia/k8s-launch-kit/pkg/config" + "github.com/nvidia/k8s-launch-kit/pkg/presets" +) + +// Status enumerates the outcomes of a per-group preset lookup. +type Status string + +const ( + // StatusMatch — preset found and every comparison field + // matches the discovered hardware exactly. + StatusMatch Status = "match" + // StatusDeviation — preset found but the hardware drifts on at + // least one field (PF count, PCI addresses, device IDs). + // Deviations are informational: the deployment can still run + // correctly against drifted hardware, just not against the + // certified preset. + StatusDeviation Status = "deviation" + // StatusNotFound — no preset matches the (machineType, gpuType) + // pair. Most common reason: the pair hasn't been catalogued + // yet, or discovery didn't populate machineType / gpuType + // (e.g. running on hardware the GPU operator labels don't + // cover). + StatusNotFound Status = "not-found" + // StatusSkipped — the lookup couldn't even be attempted (e.g. + // machineType or gpuType empty on the group). Distinct from + // StatusNotFound so callers can render "discovery is + // incomplete" rather than "no preset for this hardware." + StatusSkipped Status = "skipped" +) + +// Result describes one group's preset-lookup outcome. Surfaced +// verbatim by validate's text/JSON output and the HTML report. The +// Deviations slice mirrors config.PresetDeviationEntry so callers +// that already render the existing presetDeviation field keep +// working without translation. +type Result struct { + Group string + MachineType string + GPUType string + // Manufacturer is propagated from the matched preset's + // topology.yaml when a preset was found (StatusMatch / + // StatusDeviation). Empty otherwise. Surfaced in the user- + // facing "server type" label as the leading segment of + // --. + Manufacturer string + Status Status + // PresetName is the catalog directory name (what `l8k preset + // list` prints) when a preset was found. Empty otherwise. + PresetName string + // Reason carries a short human-readable explanation when + // Status is StatusNotFound or StatusSkipped, and a one-line + // summary when StatusDeviation ("3 deviation(s) — pfCount, + // pciAddress, deviceID"). Empty when StatusMatch. + Reason string + Deviations []config.PresetDeviationEntry + // Preset is the loaded topology that produced the match. Used + // downstream to enrich "missing PCI" rows in the validation + // report with the expected deviceID / rail / netdev when the + // cluster doesn't have a device the certified topology + // expects. nil when Status is NotFound or Skipped. + Preset *presets.Topology +} + +// MatchGroup runs the preset lookup + comparison for one group. +// Does not mutate the group; callers that also want to enrich +// rail/NUMA topology fields invoke presets.ApplyPreset separately. +// +// Lookup is exact-match on (machineType, gpuType). Empty fields +// short-circuit to StatusSkipped — without those values the preset +// catalog can't be queried, and we don't fall back to fuzzy matching +// because picking the wrong preset would silently rewrite the +// deployment to target the wrong hardware shape. +func MatchGroup(group config.ClusterConfig) Result { + res := Result{ + Group: group.Identifier, + MachineType: group.MachineType, + GPUType: group.GPUType, + } + if group.MachineType == "" || group.GPUType == "" { + res.Status = StatusSkipped + switch { + case group.MachineType == "" && group.GPUType == "": + res.Reason = "machineType and gpuType not discovered on group — `l8k discover` did not populate them" + case group.MachineType == "": + res.Reason = "machineType not discovered on group" + default: + res.Reason = "gpuType not discovered on group" + } + return res + } + preset, err := presets.LoadPreset(group.MachineType, group.GPUType) + if err != nil { + res.Status = StatusNotFound + res.Reason = fmt.Sprintf("preset lookup failed: %v", err) + return res + } + if preset == nil { + res.Status = StatusNotFound + res.Reason = fmt.Sprintf("no preset matches (%s, %s) in the local presets directory", group.MachineType, group.GPUType) + return res + } + deviations := presets.ValidatePreset(preset, group.PFs) + res.PresetName = preset.MachineType + "/" + preset.GPUType + res.Manufacturer = preset.Manufacturer + res.Deviations = deviations + res.Preset = preset + if len(deviations) == 0 { + res.Status = StatusMatch + return res + } + res.Status = StatusDeviation + res.Reason = fmt.Sprintf("%d deviation(s) from matched preset", len(deviations)) + return res +} + +// MatchAll runs MatchGroup over every entry in cfg.ClusterConfig and +// returns the results in the same order. cfg is never mutated. +func MatchAll(cfg *config.LaunchKubernetesConfig) []Result { + if cfg == nil { + return nil + } + out := make([]Result, 0, len(cfg.ClusterConfig)) + for _, g := range cfg.ClusterConfig { + out = append(out, MatchGroup(g)) + } + return out +} + +// AnyMatched reports whether any of the results found AND fully +// matched a preset. Used by the validate CLI to decide whether the +// "preset" check is worth surfacing at all (vs. hidden when no +// presets exist for the cluster's hardware). +func AnyMatched(results []Result) bool { + for _, r := range results { + if r.Status == StatusMatch || r.Status == StatusDeviation { + return true + } + } + return false +} diff --git a/pkg/presetmatch/presetmatch_test.go b/pkg/presetmatch/presetmatch_test.go new file mode 100644 index 0000000..0e31559 --- /dev/null +++ b/pkg/presetmatch/presetmatch_test.go @@ -0,0 +1,85 @@ +// Copyright 2026 NVIDIA CORPORATION & AFFILIATES +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 + +package presetmatch + +import ( + "testing" + + "github.com/nvidia/k8s-launch-kit/pkg/config" + "github.com/stretchr/testify/assert" +) + +func TestMatchGroup_SkippedWhenMachineOrGPUMissing(t *testing.T) { + t.Run("both empty", func(t *testing.T) { + r := MatchGroup(config.ClusterConfig{Identifier: "g"}) + assert.Equal(t, StatusSkipped, r.Status) + assert.Contains(t, r.Reason, "machineType and gpuType not discovered") + }) + t.Run("machineType only", func(t *testing.T) { + r := MatchGroup(config.ClusterConfig{Identifier: "g", MachineType: "vendor-a-h200"}) + assert.Equal(t, StatusSkipped, r.Status) + assert.Contains(t, r.Reason, "gpuType not discovered") + }) + t.Run("gpuType only", func(t *testing.T) { + r := MatchGroup(config.ClusterConfig{Identifier: "g", GPUType: "h200"}) + assert.Equal(t, StatusSkipped, r.Status) + assert.Contains(t, r.Reason, "machineType not discovered") + }) +} + +// The preset catalog can be absent in the test environment (it's +// installed under /usr/local/share/l8k or similar). When that's the +// case LoadPreset returns "not found" for any pair — exercising the +// StatusNotFound path. The other Status outcomes need real preset +// fixtures and are covered by the higher-level integration tests in +// pkg/presets and pkg/networkoperatorplugin. +func TestMatchGroup_NotFoundWhenNoCatalog(t *testing.T) { + r := MatchGroup(config.ClusterConfig{ + Identifier: "g", + MachineType: "fictional-machine-no-such-preset", + GPUType: "h200", + }) + // Either StatusNotFound (catalog exists but no match) or + // StatusNotFound (no catalog at all) — both produce the same + // status code, so a single assertion is enough. + assert.Equal(t, StatusNotFound, r.Status) + assert.NotEmpty(t, r.Reason) +} + +func TestMatchAll(t *testing.T) { + cfg := &config.LaunchKubernetesConfig{ + ClusterConfig: []config.ClusterConfig{ + {Identifier: "a"}, + {Identifier: "b", MachineType: "vendor-x", GPUType: "h200"}, + }, + } + results := MatchAll(cfg) + assert.Len(t, results, 2) + assert.Equal(t, "a", results[0].Group) + assert.Equal(t, StatusSkipped, results[0].Status) + assert.Equal(t, "b", results[1].Group) + // StatusNotFound when no preset catalog; never panics either way. + assert.NotEqual(t, StatusSkipped, results[1].Status) +} + +func TestAnyMatched(t *testing.T) { + assert.False(t, AnyMatched(nil)) + assert.False(t, AnyMatched([]Result{{Status: StatusSkipped}, {Status: StatusNotFound}})) + assert.True(t, AnyMatched([]Result{{Status: StatusMatch}})) + assert.True(t, AnyMatched([]Result{{Status: StatusDeviation}})) + assert.True(t, AnyMatched([]Result{{Status: StatusSkipped}, {Status: StatusMatch}})) +} diff --git a/profiles/host-device-rdma/40-example-daemonset.yaml b/profiles/host-device-rdma/40-example-daemonset.yaml index 904a658..ad13a26 100644 --- a/profiles/host-device-rdma/40-example-daemonset.yaml +++ b/profiles/host-device-rdma/40-example-daemonset.yaml @@ -44,7 +44,7 @@ spec: {{- end }} containers: - name: test-container - image: mellanox/rping-test + image: nvcr.io/nvidia/doca/doca:3.3.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: @@ -107,7 +107,7 @@ spec: {{- end }} containers: - name: test-container - image: mellanox/rping-test + image: nvcr.io/nvidia/doca/doca:3.3.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: diff --git a/profiles/ipoib-rdma-shared/40-example-daemonset.yaml b/profiles/ipoib-rdma-shared/40-example-daemonset.yaml index 29738ee..43448fe 100644 --- a/profiles/ipoib-rdma-shared/40-example-daemonset.yaml +++ b/profiles/ipoib-rdma-shared/40-example-daemonset.yaml @@ -44,7 +44,7 @@ spec: {{- end }} containers: - name: test-container - image: mellanox/rping-test + image: nvcr.io/nvidia/doca/doca:3.3.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: @@ -108,7 +108,7 @@ spec: {{- end }} containers: - name: test-container - image: mellanox/rping-test + image: nvcr.io/nvidia/doca/doca:3.3.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: diff --git a/profiles/macvlan-rdma-shared/40-example-daemonset.yaml b/profiles/macvlan-rdma-shared/40-example-daemonset.yaml index 3e1aeea..b016505 100644 --- a/profiles/macvlan-rdma-shared/40-example-daemonset.yaml +++ b/profiles/macvlan-rdma-shared/40-example-daemonset.yaml @@ -44,7 +44,7 @@ spec: {{- end }} containers: - name: test-container - image: mellanox/rping-test + image: nvcr.io/nvidia/doca/doca:3.3.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: @@ -108,7 +108,7 @@ spec: {{- end }} containers: - name: test-container - image: mellanox/rping-test + image: nvcr.io/nvidia/doca/doca:3.3.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: diff --git a/profiles/spectrum-x-ra2.1/90-example-daemonset.yaml b/profiles/spectrum-x-ra2.1/90-example-daemonset.yaml index 5cf601a..9afcaff 100644 --- a/profiles/spectrum-x-ra2.1/90-example-daemonset.yaml +++ b/profiles/spectrum-x-ra2.1/90-example-daemonset.yaml @@ -46,7 +46,8 @@ spec: {{- end }} containers: - name: spectrum-x-test - image: nvcr.io/nvidia/mellanox/rping-test:latest + image: nvcr.io/nvidia/doca/doca:3.3.0-full-rt-host + command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: add: ["IPC_LOCK", "NET_RAW"] diff --git a/profiles/spectrum-x/90-example-daemonset.yaml b/profiles/spectrum-x/90-example-daemonset.yaml index 5cf601a..9afcaff 100644 --- a/profiles/spectrum-x/90-example-daemonset.yaml +++ b/profiles/spectrum-x/90-example-daemonset.yaml @@ -46,7 +46,8 @@ spec: {{- end }} containers: - name: spectrum-x-test - image: nvcr.io/nvidia/mellanox/rping-test:latest + image: nvcr.io/nvidia/doca/doca:3.3.0-full-rt-host + command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: add: ["IPC_LOCK", "NET_RAW"] diff --git a/profiles/sriov-ethernet-rdma/60-example-daemonset.yaml b/profiles/sriov-ethernet-rdma/60-example-daemonset.yaml index abffc47..8a60e87 100644 --- a/profiles/sriov-ethernet-rdma/60-example-daemonset.yaml +++ b/profiles/sriov-ethernet-rdma/60-example-daemonset.yaml @@ -44,7 +44,7 @@ spec: {{- end }} containers: - name: test-container - image: mellanox/rping-test + image: nvcr.io/nvidia/doca/doca:3.3.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: @@ -107,7 +107,7 @@ spec: {{- end }} containers: - name: test-container - image: mellanox/rping-test + image: nvcr.io/nvidia/doca/doca:3.3.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: diff --git a/profiles/sriov-ib-rdma/60-example-daemonset.yaml b/profiles/sriov-ib-rdma/60-example-daemonset.yaml index 7d56b5c..b14bfd7 100644 --- a/profiles/sriov-ib-rdma/60-example-daemonset.yaml +++ b/profiles/sriov-ib-rdma/60-example-daemonset.yaml @@ -44,7 +44,7 @@ spec: {{- end }} containers: - name: test-container - image: mellanox/rping-test + image: nvcr.io/nvidia/doca/doca:3.3.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: @@ -107,7 +107,7 @@ spec: {{- end }} containers: - name: test-container - image: mellanox/rping-test + image: nvcr.io/nvidia/doca/doca:3.3.0-full-rt-host command: ["/bin/bash", "-c", "sleep infinity"] securityContext: capabilities: