From 3bb6515650e4c8fd6d926fe915537f24fccb607c Mon Sep 17 00:00:00 2001 From: be0x74a Date: Fri, 8 May 2026 01:50:32 +0200 Subject: [PATCH] feat(bench): add self-heal and ns-flip events, label source-update fields The harness already measured one event (timestamp annotation propagation) but the report fields - e2e_np_p50_ns, e2e_cp_sel_earliest_p50_ns, etc. - were unlabeled, conflating the metric with its semantics. Adding more events forces the rename: source-update is now explicit in field names, self-heal lives next to it for every topology, and ns-flip (cleanup + add) covers the CP-selector watcher path. Renames (no compat aliases - pre-v1.0, no users): - e2e_np_*_ns -> e2e_np_source_update_*_ns - e2e_cp_sel_*_ns -> e2e_cp_sel_source_update_*_ns - e2e_cp_list_*_ns -> e2e_cp_list_source_update_*_ns New events: - self-heal (NP, CP-sel, CP-list): delete K destinations, time recreation per-destination via UID change. K = min(100, NPRefs) for NP, min(20, fanout) for CP. Per-destination latency, no fan-out earliest/slowest - each recreation is independent. - ns-flip (CP-selector only): K = min(20, dstSet/2) namespaces. Cleanup phase removes the matching label and times destination delete; add phase re-adds the label and times destination create. Capping K at dstSet/2 leaves at least half of the fanout untouched as a steady backdrop. Each event ends with a 5s settle before the next, so the previous event's tail (controller queue drain, cache settling) doesn't leak into the next distribution. The smoke comment script keeps the table source-update-only - self-heal and ns-flip are noise for a per-PR shape-break check. The full report JSON carries every distribution. 
--- .github/scripts/bench-comment.sh | 48 +++--- test/bench/measure.go | 242 +++++++++++++++++++++++++++++++ test/bench/measure_test.go | 29 ++++ test/bench/report.go | 149 ++++++++++++++----- test/bench/report_test.go | 206 ++++++++++++++++++++------ test/bench/runner.go | 134 ++++++++++++++--- 6 files changed, 683 insertions(+), 125 deletions(-) diff --git a/.github/scripts/bench-comment.sh b/.github/scripts/bench-comment.sh index ca4accb..6c3d1bb 100755 --- a/.github/scripts/bench-comment.sh +++ b/.github/scripts/bench-comment.sh @@ -4,6 +4,10 @@ # (workflow-link callout). The marker line is required at the top of both — # the workflow uses it to find an existing comment to update. # +# The smoke comment intentionally only surfaces source-update numbers — the +# headline event for a per-PR check. Self-heal and ns-flip distributions are +# tracked in bench.json but kept out of this comment to avoid noise. +# # Args: # $1: path to bench.json (may be missing or empty on failure) # $2: short commit SHA @@ -90,38 +94,38 @@ if [ "$CPLIST_COUNT" -gt 0 ]; then fi [ -z "$PROFILE_DESC" ] && PROFILE_DESC="(no shapes set — empty profile?)" -NP_SAMPLES=$(n measurements.e2e_np_samples) -NP_P50=$(ms measurements.e2e_np_p50_ns) -NP_P95=$(ms measurements.e2e_np_p95_ns) -NP_P99=$(ms measurements.e2e_np_p99_ns) - -CPSEL_SAMPLES=$(n measurements.e2e_cp_sel_samples) -CPSEL_E_P50=$(ms measurements.e2e_cp_sel_earliest_p50_ns) -CPSEL_E_P95=$(ms measurements.e2e_cp_sel_earliest_p95_ns) -CPSEL_E_P99=$(ms measurements.e2e_cp_sel_earliest_p99_ns) -CPSEL_S_P50=$(ms measurements.e2e_cp_sel_slowest_p50_ns) -CPSEL_S_P95=$(ms measurements.e2e_cp_sel_slowest_p95_ns) -CPSEL_S_P99=$(ms measurements.e2e_cp_sel_slowest_p99_ns) - -CPLIST_SAMPLES=$(n measurements.e2e_cp_list_samples) -CPLIST_E_P50=$(ms measurements.e2e_cp_list_earliest_p50_ns) -CPLIST_E_P95=$(ms measurements.e2e_cp_list_earliest_p95_ns) -CPLIST_E_P99=$(ms measurements.e2e_cp_list_earliest_p99_ns) -CPLIST_S_P50=$(ms 
measurements.e2e_cp_list_slowest_p50_ns) -CPLIST_S_P95=$(ms measurements.e2e_cp_list_slowest_p95_ns) -CPLIST_S_P99=$(ms measurements.e2e_cp_list_slowest_p99_ns) +NP_SAMPLES=$(n measurements.e2e_np_source_update_samples) +NP_P50=$(ms measurements.e2e_np_source_update_p50_ns) +NP_P95=$(ms measurements.e2e_np_source_update_p95_ns) +NP_P99=$(ms measurements.e2e_np_source_update_p99_ns) + +CPSEL_SAMPLES=$(n measurements.e2e_cp_sel_source_update_samples) +CPSEL_E_P50=$(ms measurements.e2e_cp_sel_source_update_earliest_p50_ns) +CPSEL_E_P95=$(ms measurements.e2e_cp_sel_source_update_earliest_p95_ns) +CPSEL_E_P99=$(ms measurements.e2e_cp_sel_source_update_earliest_p99_ns) +CPSEL_S_P50=$(ms measurements.e2e_cp_sel_source_update_slowest_p50_ns) +CPSEL_S_P95=$(ms measurements.e2e_cp_sel_source_update_slowest_p95_ns) +CPSEL_S_P99=$(ms measurements.e2e_cp_sel_source_update_slowest_p99_ns) + +CPLIST_SAMPLES=$(n measurements.e2e_cp_list_source_update_samples) +CPLIST_E_P50=$(ms measurements.e2e_cp_list_source_update_earliest_p50_ns) +CPLIST_E_P95=$(ms measurements.e2e_cp_list_source_update_earliest_p95_ns) +CPLIST_E_P99=$(ms measurements.e2e_cp_list_source_update_earliest_p99_ns) +CPLIST_S_P50=$(ms measurements.e2e_cp_list_source_update_slowest_p50_ns) +CPLIST_S_P95=$(ms measurements.e2e_cp_list_source_update_slowest_p95_ns) +CPLIST_S_P99=$(ms measurements.e2e_cp_list_source_update_slowest_p99_ns) cat < ## Bench smoke — \`$PROFILE_NAME\` -End-to-end latency from a 2-vCPU GHA runner. Treat absolute numbers as a sanity check, not a perf claim — runner noise is high. The point of this check is to catch shape-break regressions on \`api/v1\` / controller / bench changes before merge. +End-to-end source-update latency from a 2-vCPU GHA runner. Treat absolute numbers as a sanity check, not a perf claim — runner noise is high. The point of this check is to catch shape-break regressions on \`api/v1\` / controller / bench changes before merge. 
(Self-heal and ns-flip distributions are recorded in \`bench.json\` but omitted here for signal-to-noise.) ### Profile $PROFILE_DESC, layered in one bootstrap. -### Results +### Results — source-update latency | Path | Samples | p50 | p95 | p99 | |---|---|---|---|---| diff --git a/test/bench/measure.go b/test/bench/measure.go index e426308..7e4cd76 100644 --- a/test/bench/measure.go +++ b/test/bench/measure.go @@ -356,3 +356,245 @@ func measureE2EClusterFanout( Slowest: LatencyResult{Samples: stamps, P50: s50, P95: s95, P99: s99}, }, nil } + +// capSample returns sample[:min(len(sample), n)] without allocating. The +// caller is responsible for passing a sample they already shuffled or +// otherwise selected; this helper just enforces the cap. +func capSample[T any](sample []T, n int) []T { + if n <= 0 || len(sample) <= n { + return sample + } + return sample[:n] +} + +// waitForRecreate polls one destination object until it is observed with a +// UID different from `oldUID`. NotFound is treated as "still recreating" and +// the loop continues; any other error aborts. Returns the elapsed duration +// from `t0` when the new UID is observed, or an error on 30s timeout. +// +// Self-heal latency end is "destination CR present with new UID", not "spec +// matches source". The controller's create call is the user-visible event; +// follow-up reconciles to align spec are measured by source-update. +func waitForRecreate( + ctx context.Context, + c *clients, + gvkIdx int, + dstNs, name string, + oldUID k8stypes.UID, + t0 time.Time, +) (time.Duration, error) { + deadline := time.Now().Add(30 * time.Second) + for { + if time.Now().After(deadline) { + return 0, fmt.Errorf("timeout waiting for recreation of %s/%s", dstNs, name) + } + dst, err := c.dynamic.Resource(gvr(gvkIdx)).Namespace(dstNs). 
+ Get(ctx, name, metav1.GetOptions{}) + if err == nil && dst.GetUID() != oldUID && dst.GetUID() != "" { + return time.Since(t0), nil + } + if err != nil && !apierrors.IsNotFound(err) { + return 0, err + } + time.Sleep(10 * time.Millisecond) + } +} + +// waitForDeletion polls one destination object until it returns NotFound. +// Used by ns-flip cleanup: when a namespace's matching label is removed, the +// destination CR in that namespace should be deleted by the controller. +// Returns the elapsed duration from `t0`, or an error on 30s timeout. +func waitForDeletion( + ctx context.Context, + c *clients, + gvkIdx int, + dstNs, name string, + t0 time.Time, +) (time.Duration, error) { + deadline := time.Now().Add(30 * time.Second) + for { + if time.Now().After(deadline) { + return 0, fmt.Errorf("timeout waiting for deletion of %s/%s", dstNs, name) + } + _, err := c.dynamic.Resource(gvr(gvkIdx)).Namespace(dstNs). + Get(ctx, name, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + return time.Since(t0), nil + } + if err != nil { + return 0, err + } + time.Sleep(10 * time.Millisecond) + } +} + +// waitForCreation polls one destination object until it returns successfully +// (i.e. the destination CR has been created in `dstNs`). Used by ns-flip add: +// when a namespace's matching label is re-added, the controller should +// re-create the destination CR. Returns the elapsed duration from `t0`, or +// an error on 30s timeout. +func waitForCreation( + ctx context.Context, + c *clients, + gvkIdx int, + dstNs, name string, + t0 time.Time, +) (time.Duration, error) { + deadline := time.Now().Add(30 * time.Second) + for { + if time.Now().After(deadline) { + return 0, fmt.Errorf("timeout waiting for creation of %s/%s", dstNs, name) + } + _, err := c.dynamic.Resource(gvr(gvkIdx)).Namespace(dstNs). 
+ Get(ctx, name, metav1.GetOptions{}) + if err == nil { + return time.Since(t0), nil + } + if !apierrors.IsNotFound(err) { + return 0, err + } + time.Sleep(10 * time.Millisecond) + } +} + +// measureSelfHealNP deletes each sample destination CR and times the +// controller's recreation. Per-destination latency, no fan-out (each NP +// destination's recreation is independent of the others). Returns one +// LatencyResult. +func measureSelfHealNP(ctx context.Context, c *clients, sample []projectionRef) (LatencyResult, error) { + durations := make([]time.Duration, 0, len(sample)) + for _, ref := range sample { + // Capture the original UID so we can distinguish "recreated" from + // "still being recreated" (NotFound) and from "delete didn't take". + orig, err := c.dynamic.Resource(gvr(ref.GVKIdx)).Namespace(ref.DstNs). + Get(ctx, ref.SrcName, metav1.GetOptions{}) + if err != nil { + return LatencyResult{}, fmt.Errorf("reading destination %s/%s: %w", ref.DstNs, ref.SrcName, err) + } + oldUID := orig.GetUID() + t0 := time.Now() + if err := c.dynamic.Resource(gvr(ref.GVKIdx)).Namespace(ref.DstNs). + Delete(ctx, ref.SrcName, metav1.DeleteOptions{}); err != nil { + return LatencyResult{}, fmt.Errorf("deleting destination %s/%s: %w", ref.DstNs, ref.SrcName, err) + } + elapsed, err := waitForRecreate(ctx, c, ref.GVKIdx, ref.DstNs, ref.SrcName, oldUID, t0) + if err != nil { + return LatencyResult{}, err + } + durations = append(durations, elapsed) + } + sort.Slice(durations, func(i, j int) bool { return durations[i] < durations[j] }) + p50, p95, p99 := quantiles(durations) + return LatencyResult{Samples: len(durations), P50: p50, P95: p95, P99: p99}, nil +} + +// measureSelfHealClusterFanout deletes K destinations from a CP fan-out set +// (one at a time, sequentially) and times each recreation. Sample is the +// pre-selected subset of `dstNs` namespaces — caller chooses K. 
Each +// deletion's latency is independent of the others (the controller's recreate +// path is the user-visible event), so this returns a single LatencyResult, +// not a fan-out result. +// +// Works for both CP-selector and CP-list shapes — both write the same +// destination object name to a set of namespaces. +func measureSelfHealClusterFanout( + ctx context.Context, + c *clients, + gvkIdx int, + dstName string, + sampleDstNs []string, +) (LatencyResult, error) { + durations := make([]time.Duration, 0, len(sampleDstNs)) + for _, dstNs := range sampleDstNs { + orig, err := c.dynamic.Resource(gvr(gvkIdx)).Namespace(dstNs). + Get(ctx, dstName, metav1.GetOptions{}) + if err != nil { + return LatencyResult{}, fmt.Errorf("reading destination %s/%s: %w", dstNs, dstName, err) + } + oldUID := orig.GetUID() + t0 := time.Now() + if err := c.dynamic.Resource(gvr(gvkIdx)).Namespace(dstNs). + Delete(ctx, dstName, metav1.DeleteOptions{}); err != nil { + return LatencyResult{}, fmt.Errorf("deleting destination %s/%s: %w", dstNs, dstName, err) + } + elapsed, err := waitForRecreate(ctx, c, gvkIdx, dstNs, dstName, oldUID, t0) + if err != nil { + return LatencyResult{}, err + } + durations = append(durations, elapsed) + } + sort.Slice(durations, func(i, j int) bool { return durations[i] < durations[j] }) + p50, p95, p99 := quantiles(durations) + return LatencyResult{Samples: len(durations), P50: p50, P95: p95, P99: p99}, nil +} + +// patchNamespaceLabel sets a namespace label to `value`, or removes the key +// entirely when `remove` is true. Idempotent. +func patchNamespaceLabel(ctx context.Context, c *clients, ns, key, value string, remove bool) error { + var patchBody string + if remove { + // JSON-merge-patch: setting a key to null removes it. 
+ patchBody = fmt.Sprintf(`{"metadata":{"labels":{%q:null}}}`, key) + } else { + patchBody = fmt.Sprintf(`{"metadata":{"labels":{%q:%q}}}`, key, value) + } + _, err := c.dynamic.Resource(nsGVR).Patch(ctx, ns, k8stypes.MergePatchType, + []byte(patchBody), metav1.PatchOptions{}) + return err +} + +// measureNSFlip exercises the CP-selector ns-flip event for a sampled subset +// of destination namespaces. For each namespace in `sampleDstNs`, in +// sequence: +// +// 1. Cleanup phase: remove the matching label, time until the destination +// CR in that namespace returns NotFound. +// 2. Add phase: re-add the matching label, time until the destination +// CR is observed again. +// +// Returns two LatencyResults (cleanup, add). One namespace at a time keeps +// the measurement independent of fan-out scheduling — every flip starts from +// a "rest of the world is steady" baseline. +func measureNSFlip( + ctx context.Context, + c *clients, + gvkIdx int, + dstName string, + sampleDstNs []string, + labelKey, labelValue string, +) (cleanup, add LatencyResult, err error) { + cleanupDur := make([]time.Duration, 0, len(sampleDstNs)) + addDur := make([]time.Duration, 0, len(sampleDstNs)) + for _, dstNs := range sampleDstNs { + // Cleanup phase: drop the label, wait for destination delete. + t0 := time.Now() + if perr := patchNamespaceLabel(ctx, c, dstNs, labelKey, "", true); perr != nil { + return LatencyResult{}, LatencyResult{}, + fmt.Errorf("removing label from %s: %w", dstNs, perr) + } + elapsed, werr := waitForDeletion(ctx, c, gvkIdx, dstNs, dstName, t0) + if werr != nil { + return LatencyResult{}, LatencyResult{}, werr + } + cleanupDur = append(cleanupDur, elapsed) + + // Add phase: re-add the label, wait for destination create. 
+ t1 := time.Now() + if perr := patchNamespaceLabel(ctx, c, dstNs, labelKey, labelValue, false); perr != nil { + return LatencyResult{}, LatencyResult{}, + fmt.Errorf("re-adding label to %s: %w", dstNs, perr) + } + elapsed, werr = waitForCreation(ctx, c, gvkIdx, dstNs, dstName, t1) + if werr != nil { + return LatencyResult{}, LatencyResult{}, werr + } + addDur = append(addDur, elapsed) + } + sort.Slice(cleanupDur, func(i, j int) bool { return cleanupDur[i] < cleanupDur[j] }) + sort.Slice(addDur, func(i, j int) bool { return addDur[i] < addDur[j] }) + c50, c95, c99 := quantiles(cleanupDur) + a50, a95, a99 := quantiles(addDur) + return LatencyResult{Samples: len(cleanupDur), P50: c50, P95: c95, P99: c99}, + LatencyResult{Samples: len(addDur), P50: a50, P95: a95, P99: a99}, + nil +} diff --git a/test/bench/measure_test.go b/test/bench/measure_test.go index c8bca38..4832daf 100644 --- a/test/bench/measure_test.go +++ b/test/bench/measure_test.go @@ -51,3 +51,32 @@ func TestParseMetrics(t *testing.T) { t.Errorf("ReconcileP99: got %v, want in [0.025, 0.05]", m.ReconcileP99) } } + +func TestCapSample(t *testing.T) { + cases := []struct { + name string + in []int + n int + want []int + }{ + {"empty stays empty", nil, 5, nil}, + {"under cap returns input", []int{1, 2, 3}, 5, []int{1, 2, 3}}, + {"at cap returns input", []int{1, 2, 3, 4, 5}, 5, []int{1, 2, 3, 4, 5}}, + {"over cap truncates from front", []int{1, 2, 3, 4, 5, 6, 7}, 4, []int{1, 2, 3, 4}}, + {"zero cap returns input unchanged", []int{1, 2, 3}, 0, []int{1, 2, 3}}, + {"negative cap returns input unchanged", []int{1, 2, 3}, -1, []int{1, 2, 3}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := capSample(tc.in, tc.n) + if len(got) != len(tc.want) { + t.Fatalf("len mismatch: got %d, want %d", len(got), len(tc.want)) + } + for i := range got { + if got[i] != tc.want[i] { + t.Errorf("element %d: got %d, want %d", i, got[i], tc.want[i]) + } + } + }) + } +} diff --git a/test/bench/report.go 
b/test/bench/report.go index 038c668..028fe1a 100644 --- a/test/bench/report.go +++ b/test/bench/report.go @@ -23,10 +23,20 @@ type Environment struct { OSArch string `json:"os_arch,omitempty"` } -// Measurements is the per-profile result. Up to three e2e distributions -// coexist for mixed profiles: NP single-target latency, CP-selector fan-out -// (earliest/slowest), and CP-list fan-out (earliest/slowest). Zero fields -// indicate the corresponding shape was not exercised. +// Measurements is the per-profile result. Up to three e2e topologies coexist +// (NP single-target, CP-selector fan-out, CP-list fan-out) and each topology +// can record up to three event distributions: +// +// - source-update: timestamp annotation patched on the source propagates to +// every destination. +// - self-heal: a destination CR is deleted and the controller recreates +// it (per-destination latency, no fan-out earliest/slowest). +// - ns-flip: (CP-selector only) a destination namespace's matching +// label is removed and re-added; cleanup and add latencies are tracked +// separately. +// +// Zero / missing fields indicate the corresponding shape did not exercise +// that event. type Measurements struct { WatchedGVKs float64 `json:"watched_gvks"` ControllerHeapMB float64 `json:"controller_heap_mb"` @@ -36,29 +46,63 @@ type Measurements struct { ReconcileP95Ms float64 `json:"reconcile_p95_ms"` ReconcileP99Ms float64 `json:"reconcile_p99_ms"` - // NP latency (single-target). Zero when no NP shape. - E2ENPSamples int `json:"e2e_np_samples,omitempty"` - E2ENPP50 time.Duration `json:"e2e_np_p50_ns,omitempty"` - E2ENPP95 time.Duration `json:"e2e_np_p95_ns,omitempty"` - E2ENPP99 time.Duration `json:"e2e_np_p99_ns,omitempty"` + // NP source-update latency (single-target). Zero when no NP shape. 
+ E2ENPSourceUpdateSamples int `json:"e2e_np_source_update_samples,omitempty"` + E2ENPSourceUpdateP50 time.Duration `json:"e2e_np_source_update_p50_ns,omitempty"` + E2ENPSourceUpdateP95 time.Duration `json:"e2e_np_source_update_p95_ns,omitempty"` + E2ENPSourceUpdateP99 time.Duration `json:"e2e_np_source_update_p99_ns,omitempty"` - // CP-selector fan-out (earliest + slowest). Zero when no CP-selector shape. - E2ECPSelSamples int `json:"e2e_cp_sel_samples,omitempty"` - E2ECPSelEarliestP50 time.Duration `json:"e2e_cp_sel_earliest_p50_ns,omitempty"` - E2ECPSelEarliestP95 time.Duration `json:"e2e_cp_sel_earliest_p95_ns,omitempty"` - E2ECPSelEarliestP99 time.Duration `json:"e2e_cp_sel_earliest_p99_ns,omitempty"` - E2ECPSelSlowestP50 time.Duration `json:"e2e_cp_sel_slowest_p50_ns,omitempty"` - E2ECPSelSlowestP95 time.Duration `json:"e2e_cp_sel_slowest_p95_ns,omitempty"` - E2ECPSelSlowestP99 time.Duration `json:"e2e_cp_sel_slowest_p99_ns,omitempty"` + // NP self-heal latency. Per-destination (no fan-out). + E2ENPSelfHealSamples int `json:"e2e_np_self_heal_samples,omitempty"` + E2ENPSelfHealP50 time.Duration `json:"e2e_np_self_heal_p50_ns,omitempty"` + E2ENPSelfHealP95 time.Duration `json:"e2e_np_self_heal_p95_ns,omitempty"` + E2ENPSelfHealP99 time.Duration `json:"e2e_np_self_heal_p99_ns,omitempty"` - // CP-list fan-out (earliest + slowest). Zero when no CP-list shape. 
- E2ECPListSamples int `json:"e2e_cp_list_samples,omitempty"` - E2ECPListEarliestP50 time.Duration `json:"e2e_cp_list_earliest_p50_ns,omitempty"` - E2ECPListEarliestP95 time.Duration `json:"e2e_cp_list_earliest_p95_ns,omitempty"` - E2ECPListEarliestP99 time.Duration `json:"e2e_cp_list_earliest_p99_ns,omitempty"` - E2ECPListSlowestP50 time.Duration `json:"e2e_cp_list_slowest_p50_ns,omitempty"` - E2ECPListSlowestP95 time.Duration `json:"e2e_cp_list_slowest_p95_ns,omitempty"` - E2ECPListSlowestP99 time.Duration `json:"e2e_cp_list_slowest_p99_ns,omitempty"` + // CP-selector source-update fan-out (earliest + slowest). Zero when no + // CP-selector shape. + E2ECPSelSourceUpdateSamples int `json:"e2e_cp_sel_source_update_samples,omitempty"` + E2ECPSelSourceUpdateEarliestP50 time.Duration `json:"e2e_cp_sel_source_update_earliest_p50_ns,omitempty"` + E2ECPSelSourceUpdateEarliestP95 time.Duration `json:"e2e_cp_sel_source_update_earliest_p95_ns,omitempty"` + E2ECPSelSourceUpdateEarliestP99 time.Duration `json:"e2e_cp_sel_source_update_earliest_p99_ns,omitempty"` + E2ECPSelSourceUpdateSlowestP50 time.Duration `json:"e2e_cp_sel_source_update_slowest_p50_ns,omitempty"` + E2ECPSelSourceUpdateSlowestP95 time.Duration `json:"e2e_cp_sel_source_update_slowest_p95_ns,omitempty"` + E2ECPSelSourceUpdateSlowestP99 time.Duration `json:"e2e_cp_sel_source_update_slowest_p99_ns,omitempty"` + + // CP-selector self-heal latency (per-destination). + E2ECPSelSelfHealSamples int `json:"e2e_cp_sel_self_heal_samples,omitempty"` + E2ECPSelSelfHealP50 time.Duration `json:"e2e_cp_sel_self_heal_p50_ns,omitempty"` + E2ECPSelSelfHealP95 time.Duration `json:"e2e_cp_sel_self_heal_p95_ns,omitempty"` + E2ECPSelSelfHealP99 time.Duration `json:"e2e_cp_sel_self_heal_p99_ns,omitempty"` + + // CP-selector ns-flip cleanup latency: namespace label removed → matching + // destination CR deleted by the controller. 
+ E2ECPSelNSFlipCleanupSamples int `json:"e2e_cp_sel_ns_flip_cleanup_samples,omitempty"` + E2ECPSelNSFlipCleanupP50 time.Duration `json:"e2e_cp_sel_ns_flip_cleanup_p50_ns,omitempty"` + E2ECPSelNSFlipCleanupP95 time.Duration `json:"e2e_cp_sel_ns_flip_cleanup_p95_ns,omitempty"` + E2ECPSelNSFlipCleanupP99 time.Duration `json:"e2e_cp_sel_ns_flip_cleanup_p99_ns,omitempty"` + + // CP-selector ns-flip add latency: namespace label re-added → destination + // CR recreated. + E2ECPSelNSFlipAddSamples int `json:"e2e_cp_sel_ns_flip_add_samples,omitempty"` + E2ECPSelNSFlipAddP50 time.Duration `json:"e2e_cp_sel_ns_flip_add_p50_ns,omitempty"` + E2ECPSelNSFlipAddP95 time.Duration `json:"e2e_cp_sel_ns_flip_add_p95_ns,omitempty"` + E2ECPSelNSFlipAddP99 time.Duration `json:"e2e_cp_sel_ns_flip_add_p99_ns,omitempty"` + + // CP-list source-update fan-out (earliest + slowest). Zero when no + // CP-list shape. + E2ECPListSourceUpdateSamples int `json:"e2e_cp_list_source_update_samples,omitempty"` + E2ECPListSourceUpdateEarliestP50 time.Duration `json:"e2e_cp_list_source_update_earliest_p50_ns,omitempty"` + E2ECPListSourceUpdateEarliestP95 time.Duration `json:"e2e_cp_list_source_update_earliest_p95_ns,omitempty"` + E2ECPListSourceUpdateEarliestP99 time.Duration `json:"e2e_cp_list_source_update_earliest_p99_ns,omitempty"` + E2ECPListSourceUpdateSlowestP50 time.Duration `json:"e2e_cp_list_source_update_slowest_p50_ns,omitempty"` + E2ECPListSourceUpdateSlowestP95 time.Duration `json:"e2e_cp_list_source_update_slowest_p95_ns,omitempty"` + E2ECPListSourceUpdateSlowestP99 time.Duration `json:"e2e_cp_list_source_update_slowest_p99_ns,omitempty"` + + // CP-list self-heal latency (per-destination). 
+ E2ECPListSelfHealSamples int `json:"e2e_cp_list_self_heal_samples,omitempty"` + E2ECPListSelfHealP50 time.Duration `json:"e2e_cp_list_self_heal_p50_ns,omitempty"` + E2ECPListSelfHealP95 time.Duration `json:"e2e_cp_list_self_heal_p95_ns,omitempty"` + E2ECPListSelfHealP99 time.Duration `json:"e2e_cp_list_self_heal_p99_ns,omitempty"` } func (r *Report) WriteJSON(w io.Writer) error { @@ -89,25 +133,50 @@ func (r *Report) WriteText(w io.Writer) error { row("reconcile_p95_ms\t%.2f\n", r.Measurements.ReconcileP95Ms) row("reconcile_p99_ms\t%.2f\n", r.Measurements.ReconcileP99Ms) if r.Profile.NamespacedProjections > 0 { - row("e2e_np_p50\t%s\n", r.Measurements.E2ENPP50) - row("e2e_np_p95\t%s\n", r.Measurements.E2ENPP95) - row("e2e_np_p99\t%s\n", r.Measurements.E2ENPP99) + row("e2e_np_source_update_p50\t%s\n", r.Measurements.E2ENPSourceUpdateP50) + row("e2e_np_source_update_p95\t%s\n", r.Measurements.E2ENPSourceUpdateP95) + row("e2e_np_source_update_p99\t%s\n", r.Measurements.E2ENPSourceUpdateP99) + if r.Measurements.E2ENPSelfHealSamples > 0 { + row("e2e_np_self_heal_p50\t%s\n", r.Measurements.E2ENPSelfHealP50) + row("e2e_np_self_heal_p95\t%s\n", r.Measurements.E2ENPSelfHealP95) + row("e2e_np_self_heal_p99\t%s\n", r.Measurements.E2ENPSelfHealP99) + } } if r.Profile.SelectorNamespaces > 0 { - row("e2e_cp_sel_earliest_p50\t%s\n", r.Measurements.E2ECPSelEarliestP50) - row("e2e_cp_sel_earliest_p95\t%s\n", r.Measurements.E2ECPSelEarliestP95) - row("e2e_cp_sel_earliest_p99\t%s\n", r.Measurements.E2ECPSelEarliestP99) - row("e2e_cp_sel_slowest_p50\t%s\n", r.Measurements.E2ECPSelSlowestP50) - row("e2e_cp_sel_slowest_p95\t%s\n", r.Measurements.E2ECPSelSlowestP95) - row("e2e_cp_sel_slowest_p99\t%s\n", r.Measurements.E2ECPSelSlowestP99) + row("e2e_cp_sel_source_update_earliest_p50\t%s\n", r.Measurements.E2ECPSelSourceUpdateEarliestP50) + row("e2e_cp_sel_source_update_earliest_p95\t%s\n", r.Measurements.E2ECPSelSourceUpdateEarliestP95) + 
row("e2e_cp_sel_source_update_earliest_p99\t%s\n", r.Measurements.E2ECPSelSourceUpdateEarliestP99) + row("e2e_cp_sel_source_update_slowest_p50\t%s\n", r.Measurements.E2ECPSelSourceUpdateSlowestP50) + row("e2e_cp_sel_source_update_slowest_p95\t%s\n", r.Measurements.E2ECPSelSourceUpdateSlowestP95) + row("e2e_cp_sel_source_update_slowest_p99\t%s\n", r.Measurements.E2ECPSelSourceUpdateSlowestP99) + if r.Measurements.E2ECPSelSelfHealSamples > 0 { + row("e2e_cp_sel_self_heal_p50\t%s\n", r.Measurements.E2ECPSelSelfHealP50) + row("e2e_cp_sel_self_heal_p95\t%s\n", r.Measurements.E2ECPSelSelfHealP95) + row("e2e_cp_sel_self_heal_p99\t%s\n", r.Measurements.E2ECPSelSelfHealP99) + } + if r.Measurements.E2ECPSelNSFlipCleanupSamples > 0 { + row("e2e_cp_sel_ns_flip_cleanup_p50\t%s\n", r.Measurements.E2ECPSelNSFlipCleanupP50) + row("e2e_cp_sel_ns_flip_cleanup_p95\t%s\n", r.Measurements.E2ECPSelNSFlipCleanupP95) + row("e2e_cp_sel_ns_flip_cleanup_p99\t%s\n", r.Measurements.E2ECPSelNSFlipCleanupP99) + } + if r.Measurements.E2ECPSelNSFlipAddSamples > 0 { + row("e2e_cp_sel_ns_flip_add_p50\t%s\n", r.Measurements.E2ECPSelNSFlipAddP50) + row("e2e_cp_sel_ns_flip_add_p95\t%s\n", r.Measurements.E2ECPSelNSFlipAddP95) + row("e2e_cp_sel_ns_flip_add_p99\t%s\n", r.Measurements.E2ECPSelNSFlipAddP99) + } } if r.Profile.ListNamespaces > 0 { - row("e2e_cp_list_earliest_p50\t%s\n", r.Measurements.E2ECPListEarliestP50) - row("e2e_cp_list_earliest_p95\t%s\n", r.Measurements.E2ECPListEarliestP95) - row("e2e_cp_list_earliest_p99\t%s\n", r.Measurements.E2ECPListEarliestP99) - row("e2e_cp_list_slowest_p50\t%s\n", r.Measurements.E2ECPListSlowestP50) - row("e2e_cp_list_slowest_p95\t%s\n", r.Measurements.E2ECPListSlowestP95) - row("e2e_cp_list_slowest_p99\t%s\n", r.Measurements.E2ECPListSlowestP99) + row("e2e_cp_list_source_update_earliest_p50\t%s\n", r.Measurements.E2ECPListSourceUpdateEarliestP50) + row("e2e_cp_list_source_update_earliest_p95\t%s\n", r.Measurements.E2ECPListSourceUpdateEarliestP95) + 
row("e2e_cp_list_source_update_earliest_p99\t%s\n", r.Measurements.E2ECPListSourceUpdateEarliestP99) + row("e2e_cp_list_source_update_slowest_p50\t%s\n", r.Measurements.E2ECPListSourceUpdateSlowestP50) + row("e2e_cp_list_source_update_slowest_p95\t%s\n", r.Measurements.E2ECPListSourceUpdateSlowestP95) + row("e2e_cp_list_source_update_slowest_p99\t%s\n", r.Measurements.E2ECPListSourceUpdateSlowestP99) + if r.Measurements.E2ECPListSelfHealSamples > 0 { + row("e2e_cp_list_self_heal_p50\t%s\n", r.Measurements.E2ECPListSelfHealP50) + row("e2e_cp_list_self_heal_p95\t%s\n", r.Measurements.E2ECPListSelfHealP95) + row("e2e_cp_list_self_heal_p99\t%s\n", r.Measurements.E2ECPListSelfHealP99) + } } row("duration_seconds\t%.1f\n", r.DurationSeconds) return tw.Flush() diff --git a/test/bench/report_test.go b/test/bench/report_test.go index 528cce0..6f65b6c 100644 --- a/test/bench/report_test.go +++ b/test/bench/report_test.go @@ -16,16 +16,19 @@ func TestReportJSON(t *testing.T) { Timestamp: "2026-04-18T12:00:00Z", }, Measurements: Measurements{ - WatchedGVKs: 10, - ControllerHeapMB: 42.0, - ControllerCPUDelta: 5.4, - ReconcileP50Ms: 6.3, - ReconcileP95Ms: 18.1, - ReconcileP99Ms: 27.0, - E2ENPSamples: 100, - E2ENPP50: 50 * time.Millisecond, - E2ENPP95: 180 * time.Millisecond, - E2ENPP99: 420 * time.Millisecond, + WatchedGVKs: 10, + ControllerHeapMB: 42.0, + ControllerCPUDelta: 5.4, + ReconcileP50Ms: 6.3, + ReconcileP95Ms: 18.1, + ReconcileP99Ms: 27.0, + E2ENPSourceUpdateSamples: 100, + E2ENPSourceUpdateP50: 50 * time.Millisecond, + E2ENPSourceUpdateP95: 180 * time.Millisecond, + E2ENPSourceUpdateP99: 420 * time.Millisecond, + E2ENPSelfHealSamples: 100, + E2ENPSelfHealP50: 70 * time.Millisecond, + E2ENPSelfHealP95: 220 * time.Millisecond, }, DurationSeconds: 123.0, } @@ -41,8 +44,31 @@ func TestReportJSON(t *testing.T) { if back.Profile.Name != "np-typical" { t.Errorf("profile name lost: %+v", back) } - if back.Measurements.E2ENPSamples != 100 { - t.Errorf("E2ENPSamples lost: 
%+v", back.Measurements) + if back.Measurements.E2ENPSourceUpdateSamples != 100 { + t.Errorf("E2ENPSourceUpdateSamples lost: %+v", back.Measurements) + } + if back.Measurements.E2ENPSelfHealP50 != 70*time.Millisecond { + t.Errorf("E2ENPSelfHealP50 lost: %+v", back.Measurements) + } + // Verify the renamed JSON tags actually flowed through (catches a typo + // where the field renames but the json tag stays old). + raw := buf.String() + for _, want := range []string{ + `"e2e_np_source_update_p50_ns"`, + `"e2e_np_self_heal_p50_ns"`, + } { + if !strings.Contains(raw, want) { + t.Errorf("JSON missing renamed tag %q:\n%s", want, raw) + } + } + for _, banned := range []string{ + `"e2e_np_p50_ns"`, + `"e2e_cp_sel_earliest_p50_ns"`, + `"e2e_cp_list_slowest_p99_ns"`, + } { + if strings.Contains(raw, banned) { + t.Errorf("JSON unexpectedly includes legacy tag %q:\n%s", banned, raw) + } } } @@ -50,9 +76,12 @@ func TestReportText_NPOnly(t *testing.T) { r := Report{ Profile: Profile{Name: "np-typical", NamespacedProjections: 100, GVKs: 10, Namespaces: 10}, Measurements: Measurements{ - ReconcileP50Ms: 6.3, - E2ENPP50: 50 * time.Millisecond, - E2ENPP95: 180 * time.Millisecond, + ReconcileP50Ms: 6.3, + E2ENPSourceUpdateP50: 50 * time.Millisecond, + E2ENPSourceUpdateP95: 180 * time.Millisecond, + E2ENPSelfHealSamples: 100, + E2ENPSelfHealP50: 70 * time.Millisecond, + E2ENPSelfHealP95: 200 * time.Millisecond, }, } var buf bytes.Buffer @@ -62,7 +91,8 @@ func TestReportText_NPOnly(t *testing.T) { out := buf.String() for _, want := range []string{ "np-typical", "namespaced_projections", "100", - "e2e_np_p50", "e2e_np_p95", "e2e_np_p99", + "e2e_np_source_update_p50", "e2e_np_source_update_p95", "e2e_np_source_update_p99", + "e2e_np_self_heal_p50", "e2e_np_self_heal_p95", "e2e_np_self_heal_p99", "controller_rss_mb", "controller_cpu_seconds_delta", } { if !strings.Contains(out, want) { @@ -71,24 +101,61 @@ func TestReportText_NPOnly(t *testing.T) { } // NP-only profile must NOT print CP-* 
rows. for _, banned := range []string{ - "e2e_cp_sel_earliest", "e2e_cp_sel_slowest", - "e2e_cp_list_earliest", "e2e_cp_list_slowest", + "e2e_cp_sel_source_update_earliest", "e2e_cp_sel_source_update_slowest", + "e2e_cp_list_source_update_earliest", "e2e_cp_list_source_update_slowest", + "e2e_cp_sel_self_heal", "e2e_cp_sel_ns_flip", + "e2e_cp_list_self_heal", } { if strings.Contains(out, banned) { t.Errorf("NP-only text output unexpectedly includes %q:\n%s", banned, out) } } + // Legacy unlabeled-event names must be gone. + for _, banned := range []string{"e2e_np_p50\t", "e2e_np_p95\t", "e2e_np_p99\t"} { + if strings.Contains(out, banned) { + t.Errorf("NP-only text output unexpectedly includes legacy field %q:\n%s", banned, out) + } + } +} + +func TestReportText_NPOnly_NoSelfHealRowsWhenSamplesZero(t *testing.T) { + // When NamespacedProjections is set but the harness skipped self-heal + // (e.g. measurement aborted), the source-update rows still print but the + // self-heal rows must not. 
+ r := Report{ + Profile: Profile{Name: "np-typical", NamespacedProjections: 100, GVKs: 10, Namespaces: 10}, + Measurements: Measurements{ + E2ENPSourceUpdateP50: 50 * time.Millisecond, + }, + } + var buf bytes.Buffer + if err := r.WriteText(&buf); err != nil { + t.Fatalf("WriteText: %v", err) + } + out := buf.String() + if !strings.Contains(out, "e2e_np_source_update_p50") { + t.Errorf("text output missing source-update row:\n%s", out) + } + if strings.Contains(out, "e2e_np_self_heal") { + t.Errorf("text output unexpectedly includes self-heal row when samples=0:\n%s", out) + } } func TestReportText_CPSelector(t *testing.T) { r := Report{ Profile: Profile{Name: "cp-selector-typical", SelectorNamespaces: 50, GVKs: 1, Namespaces: 1}, Measurements: Measurements{ - ReconcileP50Ms: 4.0, - E2ECPSelEarliestP50: 40 * time.Millisecond, - E2ECPSelEarliestP99: 120 * time.Millisecond, - E2ECPSelSlowestP50: 400 * time.Millisecond, - E2ECPSelSlowestP99: 950 * time.Millisecond, + ReconcileP50Ms: 4.0, + E2ECPSelSourceUpdateEarliestP50: 40 * time.Millisecond, + E2ECPSelSourceUpdateEarliestP99: 120 * time.Millisecond, + E2ECPSelSourceUpdateSlowestP50: 400 * time.Millisecond, + E2ECPSelSourceUpdateSlowestP99: 950 * time.Millisecond, + E2ECPSelSelfHealSamples: 20, + E2ECPSelSelfHealP50: 80 * time.Millisecond, + E2ECPSelNSFlipCleanupSamples: 20, + E2ECPSelNSFlipCleanupP50: 90 * time.Millisecond, + E2ECPSelNSFlipAddSamples: 20, + E2ECPSelNSFlipAddP50: 120 * time.Millisecond, }, } var buf bytes.Buffer @@ -98,7 +165,10 @@ func TestReportText_CPSelector(t *testing.T) { out := buf.String() for _, want := range []string{ "selector_namespaces", "50", - "e2e_cp_sel_earliest_p50", "e2e_cp_sel_slowest_p99", + "e2e_cp_sel_source_update_earliest_p50", "e2e_cp_sel_source_update_slowest_p99", + "e2e_cp_sel_self_heal_p50", + "e2e_cp_sel_ns_flip_cleanup_p50", + "e2e_cp_sel_ns_flip_add_p50", } { if !strings.Contains(out, want) { t.Errorf("cp-selector text output missing %q:\n%s", want, out) @@ -106,28 
+176,61 @@ func TestReportText_CPSelector(t *testing.T) { } // CP-selector-only must NOT print NP or CP-list rows. for _, banned := range []string{ - "e2e_np_p50", "e2e_np_p95", - "e2e_cp_list_earliest", "e2e_cp_list_slowest", + "e2e_np_source_update_p50", "e2e_np_source_update_p95", + "e2e_np_self_heal", + "e2e_cp_list_source_update_earliest", "e2e_cp_list_source_update_slowest", + "e2e_cp_list_self_heal", } { if strings.Contains(out, banned) { t.Errorf("cp-selector text output unexpectedly includes %q:\n%s", banned, out) } } - // Legacy field names must be gone. - for _, banned := range []string{"e2e_first_ns", "e2e_last_ns", "e2e_earliest_p50", "e2e_slowest_p50"} { + // Legacy unlabeled field names must be gone. + for _, banned := range []string{ + "e2e_first_ns", "e2e_last_ns", + "e2e_earliest_p50", "e2e_slowest_p50", + "e2e_cp_sel_earliest_p50\t", "e2e_cp_sel_slowest_p50\t", + } { if strings.Contains(out, banned) { t.Errorf("cp-selector text output unexpectedly includes legacy field %q:\n%s", banned, out) } } } +func TestReportText_CPSelector_SkipNSFlipWhenSamplesZero(t *testing.T) { + // SelectorNamespaces=1 means K=0 for ns-flip (K is len(dsts)/2, which + // rounds down to 0). The harness then leaves the ns-flip fields zero, and + // WriteText must not emit ns-flip rows. 
+ r := Report{ + Profile: Profile{Name: "cp-selector-tiny", SelectorNamespaces: 1, GVKs: 1, Namespaces: 1}, + Measurements: Measurements{ + E2ECPSelSourceUpdateEarliestP50: 40 * time.Millisecond, + E2ECPSelSelfHealSamples: 1, + E2ECPSelSelfHealP50: 80 * time.Millisecond, + }, + } + var buf bytes.Buffer + if err := r.WriteText(&buf); err != nil { + t.Fatalf("WriteText: %v", err) + } + out := buf.String() + if !strings.Contains(out, "e2e_cp_sel_self_heal_p50") { + t.Errorf("text output missing self-heal row:\n%s", out) + } + if strings.Contains(out, "e2e_cp_sel_ns_flip") { + t.Errorf("text output unexpectedly includes ns-flip row when samples=0:\n%s", out) + } +} + func TestReportText_CPList(t *testing.T) { r := Report{ Profile: Profile{Name: "cp-list-typical", ListNamespaces: 10, GVKs: 1, Namespaces: 1}, Measurements: Measurements{ - ReconcileP50Ms: 3.0, - E2ECPListEarliestP50: 25 * time.Millisecond, - E2ECPListSlowestP99: 300 * time.Millisecond, + ReconcileP50Ms: 3.0, + E2ECPListSourceUpdateEarliestP50: 25 * time.Millisecond, + E2ECPListSourceUpdateSlowestP99: 300 * time.Millisecond, + E2ECPListSelfHealSamples: 10, + E2ECPListSelfHealP50: 60 * time.Millisecond, }, } var buf bytes.Buffer @@ -137,15 +240,17 @@ func TestReportText_CPList(t *testing.T) { out := buf.String() for _, want := range []string{ "list_namespaces", "10", - "e2e_cp_list_earliest_p50", "e2e_cp_list_slowest_p99", + "e2e_cp_list_source_update_earliest_p50", "e2e_cp_list_source_update_slowest_p99", + "e2e_cp_list_self_heal_p50", } { if !strings.Contains(out, want) { t.Errorf("cp-list text output missing %q:\n%s", want, out) } } for _, banned := range []string{ - "e2e_np_p50", - "e2e_cp_sel_earliest", "e2e_cp_sel_slowest", + "e2e_np_source_update_p50", + "e2e_cp_sel_source_update_earliest", "e2e_cp_sel_source_update_slowest", + "e2e_cp_list_ns_flip", "e2e_cp_sel_ns_flip", } { if strings.Contains(out, banned) { t.Errorf("cp-list text output unexpectedly includes %q:\n%s", banned, out) @@ -160,12 
+265,22 @@ func TestReportText_Mixed(t *testing.T) { SelectorNamespaces: 50, ListNamespaces: 10, GVKs: 10, Namespaces: 10, }, Measurements: Measurements{ - ReconcileP50Ms: 5.0, - E2ENPP50: 40 * time.Millisecond, - E2ECPSelEarliestP50: 35 * time.Millisecond, - E2ECPSelSlowestP50: 250 * time.Millisecond, - E2ECPListEarliestP50: 20 * time.Millisecond, - E2ECPListSlowestP50: 120 * time.Millisecond, + ReconcileP50Ms: 5.0, + E2ENPSourceUpdateP50: 40 * time.Millisecond, + E2ENPSelfHealSamples: 100, + E2ENPSelfHealP50: 60 * time.Millisecond, + E2ECPSelSourceUpdateEarliestP50: 35 * time.Millisecond, + E2ECPSelSourceUpdateSlowestP50: 250 * time.Millisecond, + E2ECPSelSelfHealSamples: 20, + E2ECPSelSelfHealP50: 80 * time.Millisecond, + E2ECPSelNSFlipCleanupSamples: 20, + E2ECPSelNSFlipCleanupP50: 90 * time.Millisecond, + E2ECPSelNSFlipAddSamples: 20, + E2ECPSelNSFlipAddP50: 100 * time.Millisecond, + E2ECPListSourceUpdateEarliestP50: 20 * time.Millisecond, + E2ECPListSourceUpdateSlowestP50: 120 * time.Millisecond, + E2ECPListSelfHealSamples: 10, + E2ECPListSelfHealP50: 60 * time.Millisecond, }, } var buf bytes.Buffer @@ -175,9 +290,14 @@ func TestReportText_Mixed(t *testing.T) { out := buf.String() for _, want := range []string{ "mixed-typical", - "e2e_np_p50", - "e2e_cp_sel_earliest_p50", "e2e_cp_sel_slowest_p50", - "e2e_cp_list_earliest_p50", "e2e_cp_list_slowest_p50", + "e2e_np_source_update_p50", + "e2e_np_self_heal_p50", + "e2e_cp_sel_source_update_earliest_p50", "e2e_cp_sel_source_update_slowest_p50", + "e2e_cp_sel_self_heal_p50", + "e2e_cp_sel_ns_flip_cleanup_p50", + "e2e_cp_sel_ns_flip_add_p50", + "e2e_cp_list_source_update_earliest_p50", "e2e_cp_list_source_update_slowest_p50", + "e2e_cp_list_self_heal_p50", } { if !strings.Contains(out, want) { t.Errorf("mixed text output missing %q:\n%s", want, out) diff --git a/test/bench/runner.go b/test/bench/runner.go index 29e53f5..1a27007 100644 --- a/test/bench/runner.go +++ b/test/bench/runner.go @@ -9,11 +9,37 @@ import ( 
"time" ) -// fanoutStamps is the number of stamps applied per CP-* fan-out measurement -// pass. 30 lands the p99 estimator on the 30th sample (index 29); see -// quantiles(). +// fanoutStamps is the number of stamps applied per CP-* fan-out source-update +// measurement pass. 30 lands the p99 estimator on the 30th sample (index 29); +// see quantiles(). const fanoutStamps = 30 +// npSourceUpdateMaxSamples / npSelfHealMaxSamples cap the NP-shape per-event +// measurement work at K = min(this, len(NPRefs)). 100 keeps measurement wall +// time bounded for stress profiles without losing distribution shape. +const ( + npSourceUpdateMaxSamples = 100 + npSelfHealMaxSamples = 100 +) + +// cpFanoutSelfHealMaxSamples / cpFanoutNSFlipMaxSamples cap the CP fan-out +// self-heal and ns-flip sample sizes: self-heal uses K = min(cap, len(dsts)), +// ns-flip uses K = min(cap, len(dsts)/2). Halving for ns-flip avoids flipping +// every destination off and on in one pass — leaving at least half the fan-out +// alone keeps the controller's other reconcile work as a backdrop, which mimics +// real-world steady state. +const ( + cpFanoutSelfHealMaxSamples = 20 + cpFanoutNSFlipMaxSamples = 20 +) + +// betweenEventsSettle is the pause inserted between event measurements within +// one profile, so the tail of the previous event (controller queue drain, +// cache settling) doesn't leak into the next event's distribution. 5s is +// loose enough for any of the events on a small profile, tight enough to +// keep total wall time reasonable for stress runs. +const betweenEventsSettle = 5 * time.Second + // runProfile runs bootstrap → measure → teardown for one profile and returns // the Report. Mixed profiles run all applicable measurement paths and emit // every distribution that was exercised. @@ -46,38 +72,106 @@ func runProfile(ctx context.Context, c *clients, p Profile, metricsURL string) ( baseline = MetricsSnapshot{} } - // Measurement window. 
Run every applicable shape; mixed profiles get - // all three populated. + // Measurement window. Run every applicable shape and event; mixed + // profiles get all three topologies populated. Each event is followed by + // a betweenEventsSettle so the next event starts from a quiet + // controller. var m Measurements if p.NamespacedProjections > 0 { - sample := res.NPRefs - if len(sample) > 100 { - sample = sample[:100] - } + // NP source-update. + sample := capSample(res.NPRefs, npSourceUpdateMaxSamples) latency, mErr := measureE2ESingle(ctx, c, sample) if mErr != nil { - return nil, fmt.Errorf("measure NP e2e: %w", mErr) + return nil, fmt.Errorf("measure NP source-update: %w", mErr) } - m.E2ENPSamples = latency.Samples - m.E2ENPP50, m.E2ENPP95, m.E2ENPP99 = latency.P50, latency.P95, latency.P99 + m.E2ENPSourceUpdateSamples = latency.Samples + m.E2ENPSourceUpdateP50, m.E2ENPSourceUpdateP95, m.E2ENPSourceUpdateP99 = latency.P50, latency.P95, latency.P99 + + // NP self-heal. + time.Sleep(betweenEventsSettle) + sh, mErr := measureSelfHealNP(ctx, c, capSample(res.NPRefs, npSelfHealMaxSamples)) + if mErr != nil { + return nil, fmt.Errorf("measure NP self-heal: %w", mErr) + } + m.E2ENPSelfHealSamples = sh.Samples + m.E2ENPSelfHealP50, m.E2ENPSelfHealP95, m.E2ENPSelfHealP99 = sh.P50, sh.P95, sh.P99 } if p.SelectorNamespaces > 0 { + // CP-selector source-update. 
+ if m.E2ENPSourceUpdateSamples > 0 || m.E2ENPSelfHealSamples > 0 { + time.Sleep(betweenEventsSettle) + } fan, mErr := measureE2EClusterFanout(ctx, c, *res.CPSelectorRef, res.CPSelectorDsts, fanoutStamps) if mErr != nil { - return nil, fmt.Errorf("measure cp-selector e2e: %w", mErr) + return nil, fmt.Errorf("measure cp-selector source-update: %w", mErr) + } + m.E2ECPSelSourceUpdateSamples = fan.Samples + m.E2ECPSelSourceUpdateEarliestP50, m.E2ECPSelSourceUpdateEarliestP95, m.E2ECPSelSourceUpdateEarliestP99 = + fan.Earliest.P50, fan.Earliest.P95, fan.Earliest.P99 + m.E2ECPSelSourceUpdateSlowestP50, m.E2ECPSelSourceUpdateSlowestP95, m.E2ECPSelSourceUpdateSlowestP99 = + fan.Slowest.P50, fan.Slowest.P95, fan.Slowest.P99 + + // CP-selector self-heal: K destinations from the front of the set. + // For SelectorNamespaces=1 this is K=1; otherwise capped at 20. + time.Sleep(betweenEventsSettle) + sh, mErr := measureSelfHealClusterFanout(ctx, c, res.CPSelectorRef.GVKIdx, + res.CPSelectorRef.SrcName, capSample(res.CPSelectorDsts, cpFanoutSelfHealMaxSamples)) + if mErr != nil { + return nil, fmt.Errorf("measure cp-selector self-heal: %w", mErr) + } + m.E2ECPSelSelfHealSamples = sh.Samples + m.E2ECPSelSelfHealP50, m.E2ECPSelSelfHealP95, m.E2ECPSelSelfHealP99 = sh.P50, sh.P95, sh.P99 + + // CP-selector ns-flip: K = min(20, len(dstSet)/2). Halving keeps at + // least half of the fanout as a steady backdrop. For tiny fanouts + // (e.g. 1) K is 0 and we skip the event entirely — there's no + // independent backdrop to keep stable. 
+ nsFlipK := len(res.CPSelectorDsts) / 2 + if nsFlipK > cpFanoutNSFlipMaxSamples { + nsFlipK = cpFanoutNSFlipMaxSamples + } + if nsFlipK > 0 { + time.Sleep(betweenEventsSettle) + cleanup, add, mErr := measureNSFlip(ctx, c, + res.CPSelectorRef.GVKIdx, res.CPSelectorRef.SrcName, + capSample(res.CPSelectorDsts, nsFlipK), + cpSelectorLabelKey, cpSelectorLabelValue) + if mErr != nil { + return nil, fmt.Errorf("measure cp-selector ns-flip: %w", mErr) + } + m.E2ECPSelNSFlipCleanupSamples = cleanup.Samples + m.E2ECPSelNSFlipCleanupP50, m.E2ECPSelNSFlipCleanupP95, m.E2ECPSelNSFlipCleanupP99 = + cleanup.P50, cleanup.P95, cleanup.P99 + m.E2ECPSelNSFlipAddSamples = add.Samples + m.E2ECPSelNSFlipAddP50, m.E2ECPSelNSFlipAddP95, m.E2ECPSelNSFlipAddP99 = + add.P50, add.P95, add.P99 } - m.E2ECPSelSamples = fan.Samples - m.E2ECPSelEarliestP50, m.E2ECPSelEarliestP95, m.E2ECPSelEarliestP99 = fan.Earliest.P50, fan.Earliest.P95, fan.Earliest.P99 - m.E2ECPSelSlowestP50, m.E2ECPSelSlowestP95, m.E2ECPSelSlowestP99 = fan.Slowest.P50, fan.Slowest.P95, fan.Slowest.P99 } if p.ListNamespaces > 0 { + // CP-list source-update. + if m.E2ENPSourceUpdateSamples > 0 || m.E2ECPSelSourceUpdateSamples > 0 { + time.Sleep(betweenEventsSettle) + } fan, mErr := measureE2EClusterFanout(ctx, c, *res.CPListRef, res.CPListDsts, fanoutStamps) if mErr != nil { - return nil, fmt.Errorf("measure cp-list e2e: %w", mErr) + return nil, fmt.Errorf("measure cp-list source-update: %w", mErr) + } + m.E2ECPListSourceUpdateSamples = fan.Samples + m.E2ECPListSourceUpdateEarliestP50, m.E2ECPListSourceUpdateEarliestP95, m.E2ECPListSourceUpdateEarliestP99 = + fan.Earliest.P50, fan.Earliest.P95, fan.Earliest.P99 + m.E2ECPListSourceUpdateSlowestP50, m.E2ECPListSourceUpdateSlowestP95, m.E2ECPListSourceUpdateSlowestP99 = + fan.Slowest.P50, fan.Slowest.P95, fan.Slowest.P99 + + // CP-list self-heal. List destinations don't react to namespace + // label changes, so ns-flip is not exercised on this path. 
+ time.Sleep(betweenEventsSettle) + sh, mErr := measureSelfHealClusterFanout(ctx, c, res.CPListRef.GVKIdx, + res.CPListRef.SrcName, capSample(res.CPListDsts, cpFanoutSelfHealMaxSamples)) + if mErr != nil { + return nil, fmt.Errorf("measure cp-list self-heal: %w", mErr) } - m.E2ECPListSamples = fan.Samples - m.E2ECPListEarliestP50, m.E2ECPListEarliestP95, m.E2ECPListEarliestP99 = fan.Earliest.P50, fan.Earliest.P95, fan.Earliest.P99 - m.E2ECPListSlowestP50, m.E2ECPListSlowestP95, m.E2ECPListSlowestP99 = fan.Slowest.P50, fan.Slowest.P95, fan.Slowest.P99 + m.E2ECPListSelfHealSamples = sh.Samples + m.E2ECPListSelfHealP50, m.E2ECPListSelfHealP95, m.E2ECPListSelfHealP99 = sh.P50, sh.P95, sh.P99 } // Final scrape. Same tolerance rationale as the baseline above.