From 64b66a341e5f90e1e36926899a5d7326cedfcdc3 Mon Sep 17 00:00:00 2001
From: Albert Mavashev <amavashev@k2n.io>
Date: Thu, 18 Jun 2026 13:02:57 -0400
Subject: [PATCH] ci(bench): make p99 latency metrics non-gating in the release
 gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The release gate failed v0.1.25.34 on commit_p99 (+94% vs baseline) while
every p50 and throughput metric was within tolerance. p99 tail latency on a
200-iteration micro-benchmark over shared GitHub runners swings ~2x run-to-run
(commit_p99 was 6.5 -> 8.2 -> 12.6 across three runs) from GC pauses / runner
contention, far beyond the 25% threshold. A same-machine comparison of .21
vs .34 showed only +8% on commit_p99, so the breach is noise, not a
regression. No single-sample baseline can stabilize a metric that varies 2x.

HEADLINE_METRICS gains a third element, `gating`. reserve_p99_ms and
commit_p99_ms are now non-gating: still measured and shown in the summary
table (labelled "noisy (non-gating)" when they exceed the threshold) but no
longer failing the build. p50 latency (reserve/commit/release/event) and
concurrent_throughput_32t stay hard gates. Applies to the release gate and the
nightly trend check.

Verified: p99-only breach now passes (exit 0); a real +100% commit_p50
regression still fails (exit 1); bootstrap and trend modes unaffected.
CI-tooling only — no production/spec/wire change, no version bump. AUDIT updated.
---
 AUDIT.md                    | 10 +++++++
 scripts/check-regression.py | 55 ++++++++++++++++++++++++++-----------
 2 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/AUDIT.md b/AUDIT.md
index be23468..f33f158 100644
--- a/AUDIT.md
+++ b/AUDIT.md
@@ -5,6 +5,16 @@
 
 ---
 
+### 2026-06-18 — Benchmark release gate: p99 metrics non-gating (no version bump)
+
+The release gate (`scripts/check-regression.py`) failed the v0.1.25.34 release on `commit_p99` (+94% vs baseline) while every p50 and throughput metric was within tolerance.
+
+**Why.** p99 tail latency on a 200-iteration micro-benchmark over shared GitHub runners swings ~2× run-to-run (`commit_p99` measured 6.5 → 8.2 → 12.6 across three runs) — GC pauses and runner contention dominate the tail, far beyond the 25% threshold. No single-sample baseline can stabilize that, and same-machine `.21`→`.34` showed only +8% on `commit_p99`, so it's noise, not a code regression.
+
+**Change.** `HEADLINE_METRICS` gains a third element, `gating`. The p99 metrics (`reserve_p99_ms`, `commit_p99_ms`) are now **non-gating**: still measured and shown in the summary table (labelled `noisy (non-gating)` when they exceed the threshold) but no longer failing the build. The stable signals — p50 latency (reserve/commit/release/event) and `concurrent_throughput_32t` — remain hard gates. Applies to both the release gate and the nightly trend check.
+
+**Verified.** A p99-only breach now passes (exit 0); a real p50 regression (+100% `commit_p50`) still fails (exit 1); bootstrap and trend modes unaffected. CI-tooling only — no production/spec/wire change, no version bump.
+
 ### 2026-06-18 — v0.1.25.34: surface committed metadata on `getReservation`
 
 Commit-time metadata was write-only on the server — accepted, persisted, never returned. Fixes runcycles/cycles-server#197.
diff --git a/scripts/check-regression.py b/scripts/check-regression.py
index d353a3f..79c5f64 100644
--- a/scripts/check-regression.py
+++ b/scripts/check-regression.py
@@ -32,16 +32,20 @@
 from typing import Dict, List, Optional
 
 
-# Headline metrics the gate cares about. Direction: True = lower-is-better
-# (latency). False = higher-is-better (throughput).
+# Headline metrics. 2nd element = direction (True = lower-is-better/latency,
+# False = higher-is-better/throughput). 3rd element = GATING: whether a breach
+# fails the build. p99 tail latency on a 200-iteration micro-benchmark over
+# shared CI runners swings ~2x run-to-run (GC pauses / runner contention), far
+# beyond any sane threshold, so p99 is tracked + reported but NON-GATING; the
+# stable signals — p50 latency and concurrent throughput — are the hard gates.
 HEADLINE_METRICS = [
-    ("reserve_p50_ms",           True),
-    ("reserve_p99_ms",           True),
-    ("commit_p50_ms",            True),
-    ("commit_p99_ms",            True),
-    ("release_p50_ms",           True),
-    ("event_p50_ms",             True),
-    ("concurrent_throughput_32t", False),
+    ("reserve_p50_ms",            True,  True),
+    ("reserve_p99_ms",            True,  False),
+    ("commit_p50_ms",             True,  True),
+    ("commit_p99_ms",             True,  False),
+    ("release_p50_ms",            True,  True),
+    ("event_p50_ms",              True,  True),
+    ("concurrent_throughput_32t", False, True),
 ]
 
 
@@ -68,7 +72,7 @@ def rolling_median(history: List[dict], window: int) -> Dict[str, float]:
     """Take the last `window` history entries, median each metric."""
     recent = history[-window:] if window > 0 else history
     result: Dict[str, float] = {}
-    for metric, _ in HEADLINE_METRICS:
+    for metric, *_ in HEADLINE_METRICS:
         values = [
             e[metric] for e in recent
             if metric in e and isinstance(e[metric], (int, float))
@@ -94,7 +98,7 @@ def compare(
 ) -> List[dict]:
     """Return a list of {metric, current, baseline, change, regressed}."""
     results = []
-    for metric, lower_is_better in HEADLINE_METRICS:
+    for metric, lower_is_better, gating in HEADLINE_METRICS:
         c = current.get(metric)
         b = baseline.get(metric)
         if c is None or b is None:
@@ -104,16 +108,23 @@ def compare(
                 "baseline": b,
                 "change_pct": None,
                 "regressed": False,
+                "breached": False,
+                "gating": gating,
                 "note": "missing",
             })
             continue
         change = pct_change(c, b, lower_is_better)
+        breached = change > threshold
         results.append({
             "metric": metric,
             "current": c,
             "baseline": b,
             "change_pct": round(change * 100, 1),
-            "regressed": change > threshold,
+            # Only a GATING metric's breach fails the build; a non-gating
+            # (p99) breach is reported but does not regress the gate.
+            "regressed": breached and gating,
+            "breached": breached,
+            "gating": gating,
             "note": None,
         })
     return results
@@ -149,9 +160,21 @@ def format_summary(
         else:
             sign = "+" if r["change_pct"] >= 0 else ""
             delta = f"{sign}{r['change_pct']}%"
-            status = "REGRESSED" if r["regressed"] else "OK"
+            if r["regressed"]:
+                status = "REGRESSED"
+            elif r.get("breached"):
+                # exceeded threshold but the metric is non-gating (p99 noise)
+                status = "noisy (non-gating)"
+            else:
+                status = "OK"
         lines.append(f"| `{r['metric']}` | {b} | {c} | {delta} | {status} |")
     lines.append("")
+    lines.append(
+        "_p99 latency is non-gating (informational): tail latency on a "
+        "micro-benchmark over shared CI runners is too noisy to gate; p50 "
+        "and throughput are the gating signals._"
+    )
+    lines.append("")
     lines.append(
         f"**Overall: {'REGRESSION DETECTED' if any_regressed else 'OK'}**"
     )
@@ -169,7 +192,7 @@ def run_release(args) -> int:
     # the gate. Accept and let the caller overwrite baseline.json with
     # `current`.
     if not baseline_raw or not any(
-        m in baseline_raw for m, _ in HEADLINE_METRICS
+        m in baseline_raw for m, *_ in HEADLINE_METRICS
     ):
         results = [
             {
@@ -180,7 +203,7 @@ def run_release(args) -> int:
                 "regressed": False,
                 "note": "bootstrap",
             }
-            for m, _ in HEADLINE_METRICS
+            for m, *_ in HEADLINE_METRICS
         ]
         print(format_summary(
             "release-gate", results, args.threshold,
@@ -190,7 +213,7 @@ def run_release(args) -> int:
 
     baseline = {
         m: baseline_raw[m]
-        for m, _ in HEADLINE_METRICS
+        for m, *_ in HEADLINE_METRICS
         if m in baseline_raw
     }
     results = compare(current, baseline, args.threshold)