From 29785f8086c9cc4b40fecf116633c82ea3e3d11e Mon Sep 17 00:00:00 2001 From: be0x74a Date: Fri, 8 May 2026 16:50:01 +0200 Subject: [PATCH] fix(bench): synchronously wait for teardown deletes to complete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #95 fixed the namespace teardown→bootstrap race by waiting for any Terminating namespace inside ensureNamespace. The next end-to-end run hit the same race class on a different resource: profile np-stress's installCRDs got Create→IsAlreadyExists on a still-Terminating CRD from np-typical's teardown, skipped the Create, slept 3s, and during that sleep the CRD finalizer completed. The follow-up createSource then saw "the server could not find the requested resource". Rather than playing whack-a-mole with one Terminating-aware Ensure per resource type (namespaces today, CRDs tomorrow, ClusterProjections next week), centralize the cleanup-completion wait in teardown itself. After issuing every Delete, teardown now polls until every namespace, CRD, and ClusterProjection it deleted is observed NotFound. Bounded at 120s; on timeout the function returns silently (next bootstrap will surface genuinely stuck state). The PR #95 ensureNamespace wait stays in place as defense-in-depth — it covers external-actor deletes that happen during a run, not just the inter-profile teardown race this commit closes. --- test/bench/teardown.go | 64 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/test/bench/teardown.go b/test/bench/teardown.go index 78fa506..57e3186 100644 --- a/test/bench/teardown.go +++ b/test/bench/teardown.go @@ -10,8 +10,13 @@ import ( // teardown reverses bootstrap. Deletes Projection / ClusterProjection CRs // first (so the controller stops reconciling), then namespaces (which -// cascade-delete contents), then CRDs. Best-effort throughout: logs failures -// but continues. +// cascade-delete contents), then CRDs. 
Best-effort on every individual +// Delete (logs failures but continues), but synchronously waits for the +// resources to be fully NotFound before returning so the next profile +// in --profile=full sequences starts from a clean slate. Without that +// final wait, async finalizers leave Terminating shells around and the +// next bootstrap races them: namespace Get-succeeds-then-disappears, +// CRD Create-AlreadyExists-then-disappears, etc. func teardown(ctx context.Context, c *clients, res *bootstrapResult) { // Delete namespaced Projections. for _, p := range res.NPRefs { @@ -19,11 +24,14 @@ func teardown(ctx context.Context, c *clients, res *bootstrapResult) { Delete(ctx, p.ProjName, metav1.DeleteOptions{}) } // Delete the ClusterProjection CRs (cluster-scoped, no namespace arg). + cpNames := make([]string, 0, 2) if res.CPSelectorRef != nil { _ = c.dynamic.Resource(cprojGVR).Delete(ctx, res.CPSelectorRef.CPName, metav1.DeleteOptions{}) + cpNames = append(cpNames, res.CPSelectorRef.CPName) } if res.CPListRef != nil { _ = c.dynamic.Resource(cprojGVR).Delete(ctx, res.CPListRef.CPName, metav1.DeleteOptions{}) + cpNames = append(cpNames, res.CPListRef.CPName) } // Let finalizers run. @@ -45,9 +53,61 @@ func teardown(ctx context.Context, c *clients, res *bootstrapResult) { // Delete bench CRDs. The cluster admin can run again and the CRDs get // recreated idempotently via installCRDs. + crdNames := make([]string, 0, res.Profile.GVKs) for i := 0; i < res.Profile.GVKs; i++ { crdName := benchPlural(i) + "." + benchGroup _ = c.apiext.ApiextensionsV1().CustomResourceDefinitions(). Delete(ctx, crdName, metav1.DeleteOptions{}) + crdNames = append(crdNames, crdName) + } + + // Synchronous wait: poll until every namespace, CRD, and CP we deleted + // is fully NotFound. Bounded at 120s; on timeout we proceed silently + // (the next bootstrap will surface the issue if state is genuinely + // stuck). 
The wait is cheap when there's nothing to wait for — first
+	// poll iteration sees everything NotFound and returns.
+	waitDeleted(ctx, c, allNs, crdNames, cpNames)
+}
+
+// waitDeleted polls once per second until every named namespace, CRD, and
+// ClusterProjection is observed NotFound, the 120s bound expires, or ctx is
+// cancelled. Returns silently in every case — teardown is best-effort.
+func waitDeleted(ctx context.Context, c *clients, namespaces, crdNames, cpNames []string) {
+	// Fold the 120s bound into ctx: one Done channel then covers both the
+	// timeout and caller cancellation, and a hung Get cannot outlive it.
+	ctx, cancel := context.WithTimeout(ctx, 120*time.Second)
+	defer cancel()
+	// anyLeft reports whether some name is not yet confirmed NotFound. A nil
+	// error and a non-NotFound error both count as "still there".
+	anyLeft := func(names []string, get func(string) error) bool {
+		for _, name := range names {
+			if !apierrors.IsNotFound(get(name)) {
+				return true
+			}
+		}
+		return false
+	}
+	getNs := func(name string) error {
+		_, err := c.kube.CoreV1().Namespaces().Get(ctx, name, metav1.GetOptions{})
+		return err
+	}
+	getCRD := func(name string) error {
+		_, err := c.apiext.ApiextensionsV1().CustomResourceDefinitions().
+			Get(ctx, name, metav1.GetOptions{})
+		return err
+	}
+	getCP := func(name string) error {
+		_, err := c.dynamic.Resource(cprojGVR).Get(ctx, name, metav1.GetOptions{})
+		return err
+	}
+	for {
+		if !anyLeft(namespaces, getNs) && !anyLeft(crdNames, getCRD) && !anyLeft(cpNames, getCP) {
+			return
+		}
+		select {
+		case <-ctx.Done(): // timed out or cancelled; further polling is pointless
+			return
+		case <-time.After(1 * time.Second):
+		}
 	}
 }