From 6e2a6b87dd41885bc7bd98fd18f54b1ab52bcc5a Mon Sep 17 00:00:00 2001 From: Brian Love Date: Mon, 11 May 2026 14:10:27 -0700 Subject: [PATCH 1/3] docs(specs, plans): interaction borderline perf diagnostic Tightens PR #131's two borderline numbers (pretable filter-text 17.7 ms, tanstack vs pretable filter-metadata 15.7/16.0 ms) via n=20 re-run. Mirrors PR #124 / PR #133's pattern. Co-Authored-By: Claude Opus 4.7 --- ...-05-11-interaction-borderline-perf-diag.md | 314 ++++++++++++++++++ ...interaction-borderline-perf-diag-design.md | 107 ++++++ 2 files changed, 421 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-11-interaction-borderline-perf-diag.md create mode 100644 docs/superpowers/specs/2026-05-11-interaction-borderline-perf-diag-design.md diff --git a/docs/superpowers/plans/2026-05-11-interaction-borderline-perf-diag.md b/docs/superpowers/plans/2026-05-11-interaction-borderline-perf-diag.md new file mode 100644 index 0000000..16c1c32 --- /dev/null +++ b/docs/superpowers/plans/2026-05-11-interaction-borderline-perf-diag.md @@ -0,0 +1,314 @@ +# Interaction Borderline Perf Diagnostic Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Tighten the n=3 verdicts on pretable `filter-text` 17.7 ms (1.7 ms over budget) and tanstack vs pretable `filter-metadata` 15.7 vs 16.0 ms (the only comparator tie). Pattern: PR #124 / PR #133. No code changes. + +**Architecture:** Per the spec at `docs/superpowers/specs/2026-05-11-interaction-borderline-perf-diag-design.md`. Single PR; auto-merge if all verdicts are noise/within-budget; hold for review if any verdict is real-over-budget or real-tanstack-faster. + +**Tech Stack:** Existing matrix runner. No new dependencies. 
+ +**Spec:** [`docs/superpowers/specs/2026-05-11-interaction-borderline-perf-diag-design.md`](../specs/2026-05-11-interaction-borderline-perf-diag-design.md) + +**Working directory:** `/Users/blove/repos/pretable/.worktrees/interaction-borderline-perf-diag`. + +--- + +## File Structure + +``` +status/milestones/ +└── 2026-05-11-interaction-borderline-high-repeat.json (NEW Phase A) + +docs/research/ +└── 2026-05-11-interaction-borderline-perf-diagnostic.md (NEW Phase C — the memo) +``` + +No source code, package, or test files modified. + +--- + +## Pre-flight + +- [ ] **0.1** Build the harness: + ``` + pnpm --filter @pretable/app-bench build + ``` +- [ ] **0.2** Confirm machine idle. + +--- + +## Phase A — High-repeat re-run + +### Task 1 — Run the matrix + +- [ ] **1.1** Run: + ``` + pnpm bench:matrix \ + --project=chromium \ + --adapters=pretable,tanstack \ + --scenarios=S2 \ + --scripts=filter-metadata,filter-text \ + --scale=hypothesis \ + --repeats=20 + ``` + + Use `Bash` with `run_in_background: true` if the foreground would block too long; poll sparingly. Expected wall-clock 12–18 min. + +- [ ] **1.2** Locate the per-run summary files: + ``` + ls status/chromium-{pretable,tanstack}-default-s2-hypothesis-{filter-metadata,filter-text}-2026-05-11*.summary.json | wc -l + ``` + + Expected: up to 80 files. If matrix exited early, document the actual count in the memo. 
+ +### Task 2 — Aggregate + verdicts + +- [ ] **2.1** Compute the stats + verdicts inline via Node script: + + ```bash + node --input-type=module <<'EOF' + import { readdir, readFile, writeFile } from "node:fs/promises"; + import { join } from "node:path"; + + const FRAME_BUDGET_MS = 16; + const ADAPTERS = ["pretable", "tanstack"]; + const SCRIPTS = ["filter-metadata", "filter-text"]; + const STATUS_DIR = "status"; + const OUT_PATH = "status/milestones/2026-05-11-interaction-borderline-high-repeat.json"; + + function stats(xs) { + const n = xs.length; + if (n === 0) return { n: 0 }; + const mean = xs.reduce((a, b) => a + b, 0) / n; + const variance = xs.reduce((a, b) => a + (b - mean) ** 2, 0) / n; + const sd = Math.sqrt(variance); + const sorted = [...xs].sort((a, b) => a - b); + const median = n % 2 ? sorted[(n - 1) / 2] : (sorted[n / 2 - 1] + sorted[n / 2]) / 2; + return { + n, + mean: +mean.toFixed(3), + sd: +sd.toFixed(3), + min: Math.min(...xs), + median, + max: Math.max(...xs), + }; + } + + const files = await readdir(STATUS_DIR); + const grid = {}; + for (const a of ADAPTERS) { + grid[a] = {}; + for (const s of SCRIPTS) { + const matching = files.filter( + (f) => + f.startsWith(`chromium-${a}-default-s2-hypothesis-${s}-2026-05-11`) && + f.endsWith(".summary.json"), + ); + const samples = []; + for (const f of matching) { + const data = JSON.parse(await readFile(join(STATUS_DIR, f), "utf8")); + const v = data.metrics?.interaction_latency_ms; + if (typeof v === "number" && Number.isFinite(v)) samples.push(v); + } + grid[a][s] = stats(samples); + } + } + + // Per-slice verdicts. + const pretableFilterText = grid.pretable["filter-text"]; + const meanPlus2 = pretableFilterText.mean + 2 * pretableFilterText.sd; + const meanMinus2 = pretableFilterText.mean - 2 * pretableFilterText.sd; + const filterTextVerdict = + meanPlus2 <= FRAME_BUDGET_MS + ? "noise-within-budget" + : meanMinus2 > FRAME_BUDGET_MS + ? 
"real-over-budget" + : "borderline-confirmed"; + + const tanstackFM = grid.tanstack["filter-metadata"]; + const pretableFM = grid.pretable["filter-metadata"]; + const meanDiff = +(tanstackFM.mean - pretableFM.mean).toFixed(3); + const noiseFloor = +(2 * Math.max(tanstackFM.sd, pretableFM.sd)).toFixed(3); + const real = Math.abs(meanDiff) > noiseFloor; + const filterMetadataVerdict = real + ? meanDiff < 0 + ? "real-tanstack-faster" + : "real-tanstack-slower" + : "noise-tied"; + + const out = { + generatedAt: new Date().toISOString(), + scenarioId: "S2", + scale: "hypothesis", + browserName: "chromium", + plannedRepeats: 20, + grid, + slices: { + pretableFilterTextOverBudget: { + rule: `noise-within-budget if mean+2σ ≤ ${FRAME_BUDGET_MS}; real-over-budget if mean−2σ > ${FRAME_BUDGET_MS}; else borderline-confirmed`, + mean: pretableFilterText.mean, + sd: pretableFilterText.sd, + meanPlus2: +meanPlus2.toFixed(3), + meanMinus2: +meanMinus2.toFixed(3), + budget: FRAME_BUDGET_MS, + verdict: filterTextVerdict, + }, + tanstackVsPretableFilterMetadata: { + rule: "real if |mean_tanstack − mean_pretable| > 2 × max(σ_tanstack, σ_pretable)", + meanDiff, + noiseFloor, + tanstackFasterIfReal: meanDiff < 0, + verdict: filterMetadataVerdict, + }, + }, + }; + + await writeFile(OUT_PATH, JSON.stringify(out, null, 2) + "\n"); + console.log(JSON.stringify(out, null, 2)); + EOF + ``` + +- [ ] **2.2** Verify the output. Both verdicts populated; mean/σ finite per (adapter, script). 
+ +- [ ] **2.3** Commit: + ``` + git add status/milestones/2026-05-11-interaction-borderline-high-repeat.json + git commit -m "chore(bench): high-repeat milestone for interaction-borderline perf diag" + ``` + +### Task 3 — Read verdicts; decide auto-merge + +- [ ] **3.1** Read the two slice verdicts: + ``` + jq '.slices | {ft: .pretableFilterTextOverBudget.verdict, fm: .tanstackVsPretableFilterMetadata.verdict}' status/milestones/2026-05-11-interaction-borderline-high-repeat.json + ``` + +- [ ] **3.2** Decide auto-merge gate: + - Auto-merge if BOTH verdicts are `noise-*` (filter-text noise-within-budget; filter-metadata noise-tied). + - Auto-merge if filter-text is `noise-within-budget` AND filter-metadata is `real-tanstack-slower` (negative finding for tanstack; no homepage update needed). + - **HOLD for user review** if filter-text is `real-over-budget` OR `borderline-confirmed`, OR if filter-metadata is `real-tanstack-faster`. Both imply potential homepage prose changes. + + Note the decision for Task 5's PR-open step. + +--- + +## Phase C — Memo + +### Task 4 — Write the memo + +- [ ] **4.1** Draft the memo at `docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md`: + + ```markdown + # Interaction borderline perf diagnostic — 2026-05-11 + + ## Summary + + - **pretable filter-text:** . . + - **tanstack vs pretable filter-metadata:** . . + + ## Context + + PR #131's n=3 interaction matrix produced two borderline numbers: + + - pretable `filter-text` at 17.7 ms (1.7 ms over the 16 ms single-frame budget). + - tanstack `filter-metadata` at 15.7 ms vs pretable 16.0 ms — the only place a comparator edged pretable. + + Both within ±2 ms of budget at n=3; p95 of 3 samples is essentially max-of-3. PR #124 and PR #133 set the precedent that small p95 gaps in this harness are almost always noise. This memo tightens the signal. 
+ + ## Method + + - Matrix: `pnpm bench:matrix --project=chromium --adapters=pretable,tanstack --scenarios=S2 --scripts=filter-metadata,filter-text --scale=hypothesis --repeats=20`. + - Hardware: . + - Background load: . + - Two tests: + 1. **pretable filter-text over-budget check.** `noise-within-budget` if `mean + 2σ ≤ 16 ms`; `real-over-budget` if `mean − 2σ > 16 ms`; `borderline-confirmed` if neither. + 2. **tanstack vs pretable filter-metadata parity check.** Standard 2σ test on mean difference. + + ## High-repeat data + + | (adapter, script) | n | mean (ms) | σ (ms) | min | median | max | + | --- | --- | --- | --- | --- | --- | --- | + | pretable, filter-text | | | | | | | + | pretable, filter-metadata | | | | | | | + | tanstack, filter-text | | | | | | | + | tanstack, filter-metadata | | | | | | | + + Source: `status/milestones/2026-05-11-interaction-borderline-high-repeat.json`. + + ## Per-slice verdicts + + ### pretable filter-text over-budget + + - mean = ms, σ = ms, mean+2σ = , mean−2σ = , budget = 16 ms. + - Verdict: ****. + - + + ### tanstack vs pretable filter-metadata parity + + - tanstack mean = ± ms; pretable mean = ± ms. + - mean diff (tanstack − pretable) = ms; 2σ noise floor = ms. + - Verdict: ****. + - + - Settle-time confound note: PR #131 measured tanstack settle at 26.5 ms vs pretable 16.7 ms (1.6× slower). Even if tanstack edges pretable on latency alone, total time-to-stable is longer. + + ## Interpretation + + + + ## Recommendations + + - **If filter-text is `noise-within-budget`:** update the `/bench` page Interactions section prose from "fractionally over on filter-text" to a more honest "within the frame budget at n=20." + - **If filter-text is `real-over-budget`:** scope a perf-fix PR investigating pretable's `filter-text` path; likely candidates are the wrapped-text filter row-model recomputation or post-filter scroll-anchor work. 
+ - **If filter-text is `borderline-confirmed`:** schedule a profiling pass; the number is right at the edge. + - **If filter-metadata is `noise-tied`:** no narrative change needed; PR #131's "filter-metadata ties pretable" framing is accurate. + - **If filter-metadata is `real-tanstack-faster`:** consider updating the homepage trail-marker label for tanstack to mention the filter-metadata lead explicitly; also note tanstack's slower settle as the offsetting cost. + - **If filter-metadata is `real-tanstack-slower`:** drop the "filter-metadata ties pretable" note from the homepage; pretable is unambiguously faster. + + ## Verdict + + + ``` + +- [ ] **4.2** Replace all `` strings with real numbers from the milestone JSON. + +- [ ] **4.3** Commit: + ``` + git add docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md + git commit -m "docs(research): interaction borderline perf diagnostic memo" + ``` + +--- + +## Task 5 — Gates + PR + +- [ ] **5.1** Repo-wide gates: + ``` + pnpm -w typecheck && pnpm -w test && pnpm -w lint && pnpm format + ``` + Expected: all pass (no source changes). + +- [ ] **5.2** Push + open PR: + ``` + git push -u origin interaction-borderline-perf-diag + gh pr create --title "docs(research): interaction borderline perf diagnostic" --body "..." + ``` + +- [ ] **5.3** Auto-merge per Task 3.2's decision: + - Both verdicts noise/tanstack-slower → `gh pr merge --auto --squash`. + - Anything else → HOLD; surface the verdict to the user in the end-of-task report. + +--- + +## Self-review + +| Spec section | Plan task | +| --- | --- | +| Phase A matrix | Task 1 | +| Per-slice verdicts | Task 2.1 + Task 3 | +| Memo | Task 4 | +| Auto-merge gate | Task 3.2 + Task 5.3 | + +No placeholders outside the memo template (those are intentional). Type/value consistency: paths consistent; verdict enum values consistent between Task 2.1 compute + Task 3.2 branch + Task 4 memo + Task 5.3 gate. 
diff --git a/docs/superpowers/specs/2026-05-11-interaction-borderline-perf-diag-design.md b/docs/superpowers/specs/2026-05-11-interaction-borderline-perf-diag-design.md new file mode 100644 index 0000000..7b424c4 --- /dev/null +++ b/docs/superpowers/specs/2026-05-11-interaction-borderline-perf-diag-design.md @@ -0,0 +1,107 @@ +# Interaction Borderline Perf Diagnostic Design + +**Date:** 2026-05-11 +**Status:** Draft (awaiting user review before plan) +**Predecessors:** [PR #131 sort+filter comparators](../../research/repo-memory.md); [PR #124 perf-diag](./2026-05-09-b2-followup-perf-diagnostic-design.md); [PR #133 scroll-with-render perf-diag](./2026-05-11-pretable-scroll-with-render-perf-diagnostic-design.md) — same pattern reapplied. + +--- + +## Goal + +Tighten the n=3 verdicts on two borderline numbers from PR #131's interaction matrix: + +1. **pretable `filter-text` at 17.7 ms** — 1.7 ms over the 16 ms single-frame budget. Real over-budget or sample artifact? +2. **tanstack `filter-metadata` at 15.7 ms** vs **pretable 16.0 ms** — the only place a comparator edged pretable (0.3 ms diff). Tied or marginal tanstack lead? + +Output is a research memo + raw evidence — no code changes. If verdicts are "noise" or "within-budget," PR auto-merges; if either is "real-over-budget" or "real-tanstack-faster," hold for user review. + +## Why + +PR #131 captured (n=3 medians, Chromium S2/hypothesis): + +| Script | pretable `interaction_latency_ms` | tanstack `interaction_latency_ms` | +| --- | --- | --- | +| `filter-text` | **17.7 ms** | 40.2 ms | +| `filter-metadata` | 16.0 ms | **15.7 ms** | + +`filter-text` 17.7 ms is the only pretable script over the 16 ms single-frame budget in the entire PR #131 runset; the page prose acknowledges this as "fractionally over." `filter-metadata` 15.7 ms is the only place tanstack edges pretable. + +Both are within ±2 ms of budget at n=3, where p95 is essentially max-of-3 and a single bad frame dominates. 
PR #124's perf-diag dissolved a 1 ms gap at n=20; PR #133 dissolved a 6 ms cell-renderer gap at n=6–8. Both patterns suggest these borderlines could also be noise. This memo settles it. + +## Non-goals + +- Fixing pretable's `filter-text` path if it's over budget. Any optimization is a follow-up PR informed by this memo's verdict. +- Adding more adapter coverage. ag-grid + mui run filter-text at 50+ ms and filter-metadata at 33–49 ms — they're nowhere near the borderline; no n=20 needed for them. +- Trace capture / profiling. The borderlines are about confirming numbers, not finding a perf cliff. If a verdict comes back "real-over-budget" with no obvious cause, the memo recommends a separate trace-driven follow-up. +- Cross-browser / other scenarios. +- Updating the `/bench` page or homepage. Memo informs decisions; surface changes are separate. + +## Architecture + +One PR off latest `main`. Three sequential phases (mirrors PR #124 + PR #133): + +| Phase | Action | Output | +|---|---|---| +| A | n=20 matrix re-run for `pretable + tanstack` × S2/hypothesis × {`filter-metadata`, `filter-text`}. | `status/milestones/2026-05-11-interaction-borderline-high-repeat.json` with mean / σ / min / median / max per (adapter, script). | +| B | (Skipped — no traces. Borderlines don't have a perf cliff to find.) | n/a | +| C | Memo with per-slice verdicts. | `docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md` | + +## Method details + +### Phase A + +``` +pnpm bench:matrix \ + --project=chromium \ + --adapters=pretable,tanstack \ + --scenarios=S2 \ + --scripts=filter-metadata,filter-text \ + --scale=hypothesis \ + --repeats=20 +``` + +2 adapters × 2 scripts × 20 repeats = 80 runs. Wall-clock ≈ 12–18 min. + +### Per-slice verdicts + +For **pretable filter-text** (over-budget check): + +- Compute mean ± σ of `interaction_latency_ms` across 20 samples. +- If `mean + 2σ ≤ 16 ms` → `noise-within-budget`. 
The 17.7 ms n=3 reading was a bad-frame artifact; pretable comfortably clears the frame budget at higher repeats. +- If `mean − 2σ > 16 ms` → `real-over-budget`. Pretable's filter-text path is reliably over the single-frame budget; logged as a fix candidate. +- Otherwise (mean straddles 16 ms within 2σ) → `borderline-confirmed`. Within ±2σ of budget; not clearly over or under. Recommend a separate profiling pass. + +For **tanstack vs pretable filter-metadata** (parity check): + +- Compute means ± σ for both adapters. +- Run the standard 2σ test: gap is "real" iff `|mean_tanstack − mean_pretable| > 2 × max(σ_tanstack, σ_pretable)`. +- If real AND mean_tanstack < mean_pretable → `real-tanstack-faster` (a finding worth a homepage prose note). +- If real AND mean_tanstack > mean_pretable → `real-tanstack-slower` (the n=3 result was a tanstack outlier). +- Otherwise → `noise-tied`. + +### Memo structure + +`docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md`: + +- Summary (1–2 sentences per slice; verdicts up top). +- Context (the n=3 numbers from PR #131). +- Method (matrix command + statistical tests). +- High-repeat data (table per slice). +- Per-slice verdict (over-budget check + parity check). +- Interpretation (what each verdict implies for homepage prose). +- Recommendations (concrete next steps if any verdict is "real" or "borderline-confirmed"). + +Length target: 600–1200 words. + +## Risks + +- **Matrix runner flake.** PR #133 had the matrix exit at ~36% completion. If that recurs, the memo reports actual n per slice and applies the same tests on the smaller sample; with low σ (which is typical for `interaction_latency_ms` on this harness), even n=6-8 is enough for unambiguous verdicts. +- **Statistical edge case.** If pretable's filter-text mean lands at exactly 16 ms ± 1 ms σ, neither the under-budget nor the over-budget condition fires cleanly — that's the `borderline-confirmed` outcome. 
Recommend deeper investigation rather than overclaiming either way. +- **Tanstack settle-time confound.** The latency metric measures trigger-to-first-changed-frame, not full settle. PR #131 noted tanstack's filter-metadata settle (26.5 ms) is 1.6× slower than pretable's (16.7 ms). If a tanstack-faster verdict surfaces on latency alone, the memo should call out the settle gap as the offsetting cost so the homepage prose stays honest. + +## Out of scope + +- Trace capture (no perf cliff to chase; per-slice numbers and σ are the deliverable). +- Code changes. +- Bench-matrix sample-protocol updates. +- Homepage updates (separate editorial follow-up if a verdict warrants one). From c9653ceff587d635c76b5abc847b525f4d9e124a Mon Sep 17 00:00:00 2001 From: Brian Love Date: Mon, 11 May 2026 14:14:08 -0700 Subject: [PATCH 2/3] docs(research): interaction borderline perf diagnostic memo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verdict: pretable filter-text is real-over-budget (16.79 ± 0.31 ms at n=20); tanstack vs pretable filter-metadata is noise-tied (1.6 ms mean diff vs 23 ms 2σ noise floor — tanstack's σ at n=8 is 11.6 ms). Incidental finding: pretable filter-metadata is also over budget at the mean (17.51 ± 2.44 ms). PR #131's 16.0 ms n=3 reading was a low-end sample. Homepage prose claims filter-metadata is "clear of" the single-frame budget; that's no longer accurate. Three recommendations queued (editorial cleanup + perf-fix investigation); see memo for details. 
Co-Authored-By: Claude Opus 4.7 --- ...-interaction-borderline-perf-diagnostic.md | 80 +++++++++++++++++++ ...11-interaction-borderline-high-repeat.json | 63 +++++++++++++++ 2 files changed, 143 insertions(+) create mode 100644 docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md create mode 100644 status/milestones/2026-05-11-interaction-borderline-high-repeat.json diff --git a/docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md b/docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md new file mode 100644 index 0000000..f33f581 --- /dev/null +++ b/docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md @@ -0,0 +1,80 @@ +# Interaction borderline perf diagnostic — 2026-05-11 + +## Summary + +- **pretable filter-text: `real-over-budget`.** At n=20, mean p95 is 16.79 ms ± 0.31 — `mean − 2σ` = 16.17 ms > 16 ms budget. Pretable's `filter-text` path is reliably ~0.8 ms over the single-frame budget; not a sampling artifact. +- **tanstack vs pretable filter-metadata: `noise-tied`.** TanStack's n=8 sample has σ = 11.6 ms (wide); the 1.6 ms mean diff is well inside the 23.2 ms 2σ noise floor. PR #131's 15.7 vs 16.0 ms readings were both within run noise. +- **Incidental finding:** pretable `filter-metadata` mean is also over budget (17.51 ms ± 2.44 at n=20). PR #131's 16.0 ms n=3 reading was a low-end sample. The homepage's "clear of the single 60Hz frame budget on filter-metadata and sort" framing is no longer accurate for filter-metadata. + +## Context + +PR #131's n=3 interaction matrix produced two borderline numbers: + +- pretable `filter-text` at 17.7 ms (1.7 ms over the 16 ms single-frame budget). +- tanstack `filter-metadata` at 15.7 ms vs pretable 16.0 ms — the only place a comparator edged pretable. + +Both within ±2 ms of budget at n=3, where p95 is essentially max-of-3. PR #124 and PR #133 set the precedent that small p95 gaps in this harness are often noise; this memo tightens both slices. 
+ +## Method + +- Matrix: two runs combined. + - Original n=20 attempt (interrupted after pretable's portion): produced 20 pretable samples per script. + - Tanstack-only re-run: produced 8 filter-metadata + 6 filter-text samples before the matrix exited early (the same Playwright flake pattern observed in PR #133). +- Hardware: MacBook Pro, Apple M-series, local laptop environment. +- Background load: typical local desktop conditions; no priority pinning. +- Two tests: + 1. **pretable filter-text over-budget check.** `noise-within-budget` if `mean + 2σ ≤ 16 ms`; `real-over-budget` if `mean − 2σ > 16 ms`; `borderline-confirmed` if neither. + 2. **tanstack vs pretable filter-metadata parity check.** Standard 2σ test on mean difference. + +## High-repeat data + +| (adapter, script) | n | mean (ms) | σ (ms) | min | median | max | +| --- | --- | --- | --- | --- | --- | --- | +| pretable, filter-text | 20 | **16.79** | 0.31 | 16.10 | 16.70 | 17.50 | +| pretable, filter-metadata | 20 | **17.51** | 2.44 | 15.80 | 16.75 | 24.80 | +| tanstack, filter-metadata | 8 | 19.11 | 11.60 | 8.20 | 17.25 | 41.70 | +| tanstack, filter-text | 6 | 27.97 | 17.23 | 8.30 | 29.45 | 49.60 | + +Source: `status/milestones/2026-05-11-interaction-borderline-high-repeat.json`. + +**Sample size note.** Pretable's σ is tight (0.31 ms on filter-text); tanstack's is wide (11–17 ms) — the tanstack samples bounce between 8 ms and 50 ms, likely because tanstack v8's filter rebuild path interacts unpredictably with `useReactTable`'s render cycle. Even with the wide tanstack distribution, the verdicts are unambiguous on both slices (pretable's filter-text is clearly over budget; the tanstack-vs-pretable filter-metadata gap is well within noise). + +## Per-slice verdicts + +### pretable filter-text over-budget + +- mean = 16.79 ms, σ = 0.31 ms, mean+2σ = 17.41, mean−2σ = 16.17, budget = 16 ms. +- Verdict: **`real-over-budget`**. +- Even the lower 2σ bound (16.17 ms) is above the single-frame budget. 
Pretable's `filter-text` path is reliably ~0.8 ms over budget on Chromium S2/hypothesis at this dataset size. The PR #131 n=3 reading of 17.7 ms was on the high side but directionally accurate; the high-repeat number is 16.79 ms — about 1 ms tighter than PR #131 suggested, but still over the budget. + +### tanstack vs pretable filter-metadata parity + +- pretable mean = 17.51 ± 2.44 ms (n=20); tanstack mean = 19.11 ± 11.60 ms (n=8). +- mean diff (tanstack − pretable) = +1.60 ms; 2σ noise floor = 23.19 ms. +- Verdict: **`noise-tied`**. +- The PR #131 n=3 numbers (tanstack 15.7, pretable 16.0) were both individual draws from distributions that span 8–25 ms (pretable) and 8–42 ms (tanstack). The 0.3 ms gap at n=3 was a low-end sampling artifact on both sides; at higher repeats neither edges the other. +- **Settle-time confound note:** PR #131 measured tanstack settle at 26.5 ms vs pretable 16.7 ms (1.6× slower). Even if a tanstack-faster latency verdict had surfaced, total time-to-stable would still favor pretable. With the latency itself now confirmed as tied, the settle gap remains pretable's wedge. + +## Interpretation + +The pretable filter-text over-budget verdict is a real finding that conflicts with the current `/bench` page prose ("fractionally over on filter-text"). At n=20 it's not "fractional" — it's 5 % over budget at the mean, and the lower 2σ bound is still over budget. The page should either: + +- Acknowledge filter-text is reliably over budget (≈ 17 ms) and re-frame the wedge as "2–3.5× faster than every measured comparator on every script; clears the single-frame budget on sort but lands a frame late on filter-text and filter-metadata"; or +- Investigate the filter-text path for an optimization opportunity (likely candidates: the wrapped-text filter row-model recomputation, post-filter scroll-anchor work, or the cell-render pipeline triggered by the visible-rows churn). 
+ +The incidental finding — pretable filter-metadata also over budget at the mean — is more striking. PR #131 reported it at 16.0 ms (right at the edge); the n=20 mean is 17.5 ms with σ = 2.44 ms. That's a much wider distribution than filter-text's 0.31 ms σ, suggesting filter-metadata's perf is sensitive to something stochastic (initial scroll position? telemetry timing? row-selection state?). Worth investigating alongside filter-text. + +The tanstack-tie verdict is unambiguous and the recommended cleanup is small: drop the "filter-metadata ties pretable" annotation from the homepage trail-marker label since at higher repeats neither adapter clearly leads on this slice — the comparison is noise. The `/bench` page already notes "within run noise" in prose; no change there. + +## Recommendations + +1. **Update `/bench` page prose.** Move filter-text out of the "clears the budget" set; acknowledge it lands one frame late at n=20. Same for filter-metadata. Frame the comparative wedge as "2–3.5× faster than comparators" without claiming sub-frame budget compliance on those two scripts. Editorial-only PR. +2. **Update ComparisonTable.tsx interaction rows.** Numbers are correct (16.0 / 17.7 ms from PR #131's n=3) but the `budget` column shows "≤ 16" which now reads as a fail badge for pretable on filter-metadata + filter-text. Two options: (a) keep the column and accept pretable shows over-budget on those rows; (b) drop the budget column for interaction rows since the comparative-wedge story doesn't depend on absolute budget compliance for those scripts. Editorial decision. +3. **Update TanStack trail-marker label.** Current: "Headless; ~2× slower interaction (filter-metadata ties pretable)". Drop the filter-metadata parenthetical — the high-repeat data supports neither a lead nor a meaningful "tie" on that script: the difference is unresolvable within run noise (both adapters are wide-distribution at n>3; the 0.3 ms gap at n=3 was sample noise). +4. 
**Scope a perf-fix investigation for pretable filter-metadata + filter-text.** filter-text is reliably over budget on this dataset (mean − 2σ = 16.17 ms); filter-metadata is over budget at the mean (17.51 ms), though its mean − 2σ (≈ 12.6 ms) still straddles the budget. Likely shared root cause (the wrapped-text filter pipeline); a single profiling pass could surface candidates. Lower priority than narrative cleanup but worth the investigation. + +## Verdict + +Two real findings: pretable's filter-text path is reliably over the single-frame budget (and filter-metadata likely too); the tanstack vs pretable filter-metadata tie at n=3 was sampling noise. + +**Hold this PR for user review** — the recommendations above involve homepage prose and trail-marker label changes that warrant editorial sign-off, plus a potential perf-fix follow-up. diff --git a/status/milestones/2026-05-11-interaction-borderline-high-repeat.json b/status/milestones/2026-05-11-interaction-borderline-high-repeat.json new file mode 100644 index 0000000..31520ab --- /dev/null +++ b/status/milestones/2026-05-11-interaction-borderline-high-repeat.json @@ -0,0 +1,63 @@ +{ + "generatedAt": "2026-05-11T21:13:16.743Z", + "scenarioId": "S2", + "scale": "hypothesis", + "browserName": "chromium", + "plannedRepeats": 20, + "grid": { + "pretable": { + "filter-metadata": { + "n": 20, + "mean": 17.51, + "sd": 2.44, + "min": 15.800000000000011, + "median": 16.74999999999997, + "max": 24.80000000000001 + }, + "filter-text": { + "n": 20, + "mean": 16.79, + "sd": 0.308, + "min": 16.099999999999966, + "median": 16.700000000000045, + "max": 17.5 + } + }, + "tanstack": { + "filter-metadata": { + "n": 8, + "mean": 19.113, + "sd": 11.597, + "min": 8.199999999999989, + "median": 17.250000000000014, + "max": 41.69999999999999 + }, + "filter-text": { + "n": 6, + "mean": 27.967, + "sd": 17.23, + "min": 8.299999999999983, + "median": 29.450000000000003, + "max": 49.60000000000002 + } + } + }, + "slices": { + "pretableFilterTextOverBudget": { + "rule": "noise-within-budget if mean+2σ ≤ 16; real-over-budget if mean−2σ > 16; else borderline-confirmed", + "mean": 16.79, + 
"sd": 0.308, + "meanPlus2": 17.406, + "meanMinus2": 16.174, + "budget": 16, + "verdict": "real-over-budget" + }, + "tanstackVsPretableFilterMetadata": { + "rule": "real if |mean_tanstack − mean_pretable| > 2 × max(σ_tanstack, σ_pretable)", + "meanDiff": 1.603, + "noiseFloor": 23.194, + "tanstackFasterIfReal": false, + "verdict": "noise-tied" + } + } +} From b2474b6531adf95b14f295c5638367a3dc602373 Mon Sep 17 00:00:00 2001 From: Brian Love Date: Mon, 11 May 2026 14:15:09 -0700 Subject: [PATCH 3/3] chore: prettier-format borderline diag artifacts --- ...-interaction-borderline-perf-diagnostic.md | 12 +++---- ...-05-11-interaction-borderline-perf-diag.md | 31 ++++++++++++------- ...interaction-borderline-perf-diag-design.md | 18 +++++------ 3 files changed, 34 insertions(+), 27 deletions(-) diff --git a/docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md b/docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md index f33f581..6f94c12 100644 --- a/docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md +++ b/docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md @@ -28,12 +28,12 @@ Both within ±2 ms of budget at n=3, where p95 is essentially max-of-3. 
PR #124 ## High-repeat data -| (adapter, script) | n | mean (ms) | σ (ms) | min | median | max | -| --- | --- | --- | --- | --- | --- | --- | -| pretable, filter-text | 20 | **16.79** | 0.31 | 16.10 | 16.70 | 17.50 | -| pretable, filter-metadata | 20 | **17.51** | 2.44 | 15.80 | 16.75 | 24.80 | -| tanstack, filter-metadata | 8 | 19.11 | 11.60 | 8.20 | 17.25 | 41.70 | -| tanstack, filter-text | 6 | 27.97 | 17.23 | 8.30 | 29.45 | 49.60 | +| (adapter, script) | n | mean (ms) | σ (ms) | min | median | max | +| ------------------------- | --- | --------- | ------ | ----- | ------ | ----- | +| pretable, filter-text | 20 | **16.79** | 0.31 | 16.10 | 16.70 | 17.50 | +| pretable, filter-metadata | 20 | **17.51** | 2.44 | 15.80 | 16.75 | 24.80 | +| tanstack, filter-metadata | 8 | 19.11 | 11.60 | 8.20 | 17.25 | 41.70 | +| tanstack, filter-text | 6 | 27.97 | 17.23 | 8.30 | 29.45 | 49.60 | Source: `status/milestones/2026-05-11-interaction-borderline-high-repeat.json`. diff --git a/docs/superpowers/plans/2026-05-11-interaction-borderline-perf-diag.md b/docs/superpowers/plans/2026-05-11-interaction-borderline-perf-diag.md index 16c1c32..8b78a36 100644 --- a/docs/superpowers/plans/2026-05-11-interaction-borderline-perf-diag.md +++ b/docs/superpowers/plans/2026-05-11-interaction-borderline-perf-diag.md @@ -43,6 +43,7 @@ No source code, package, or test files modified. ### Task 1 — Run the matrix - [ ] **1.1** Run: + ``` pnpm bench:matrix \ --project=chromium \ @@ -56,6 +57,7 @@ No source code, package, or test files modified. Use `Bash` with `run_in_background: true` if the foreground would block too long; poll sparingly. Expected wall-clock 12–18 min. - [ ] **1.2** Locate the per-run summary files: + ``` ls status/chromium-{pretable,tanstack}-default-s2-hypothesis-{filter-metadata,filter-text}-2026-05-11*.summary.json | wc -l ``` @@ -180,6 +182,7 @@ No source code, package, or test files modified. 
### Task 3 — Read verdicts; decide auto-merge - [ ] **3.1** Read the two slice verdicts: + ``` jq '.slices | {ft: .pretableFilterTextOverBudget.verdict, fm: .tanstackVsPretableFilterMetadata.verdict}' status/milestones/2026-05-11-interaction-borderline-high-repeat.json ``` @@ -227,12 +230,12 @@ No source code, package, or test files modified. ## High-repeat data - | (adapter, script) | n | mean (ms) | σ (ms) | min | median | max | - | --- | --- | --- | --- | --- | --- | --- | - | pretable, filter-text | | | | | | | - | pretable, filter-metadata | | | | | | | - | tanstack, filter-text | | | | | | | - | tanstack, filter-metadata | | | | | | | + | (adapter, script) | n | mean (ms) | σ (ms) | min | median | max | + | ------------------------- | --- | --------- | ------ | ----- | ------ | ----- | + | pretable, filter-text | | | | | | | + | pretable, filter-metadata | | | | | | | + | tanstack, filter-text | | | | | | | + | tanstack, filter-metadata | | | | | | | Source: `status/milestones/2026-05-11-interaction-borderline-high-repeat.json`. @@ -268,6 +271,7 @@ No source code, package, or test files modified. ## Verdict ``` @@ -285,12 +289,15 @@ No source code, package, or test files modified. ## Task 5 — Gates + PR - [ ] **5.1** Repo-wide gates: + ``` pnpm -w typecheck && pnpm -w test && pnpm -w lint && pnpm format ``` + Expected: all pass (no source changes). - [ ] **5.2** Push + open PR: + ``` git push -u origin interaction-borderline-perf-diag gh pr create --title "docs(research): interaction borderline perf diagnostic" --body "..." @@ -304,11 +311,11 @@ No source code, package, or test files modified. 
## Self-review -| Spec section | Plan task | -| --- | --- | -| Phase A matrix | Task 1 | -| Per-slice verdicts | Task 2.1 + Task 3 | -| Memo | Task 4 | -| Auto-merge gate | Task 3.2 + Task 5.3 | +| Spec section | Plan task | +| ------------------ | ------------------- | +| Phase A matrix | Task 1 | +| Per-slice verdicts | Task 2.1 + Task 3 | +| Memo | Task 4 | +| Auto-merge gate | Task 3.2 + Task 5.3 | No placeholders outside the memo template (those are intentional). Type/value consistency: paths consistent; verdict enum values consistent between Task 2.1 compute + Task 3.2 branch + Task 4 memo + Task 5.3 gate. diff --git a/docs/superpowers/specs/2026-05-11-interaction-borderline-perf-diag-design.md b/docs/superpowers/specs/2026-05-11-interaction-borderline-perf-diag-design.md index 7b424c4..9fc0b61 100644 --- a/docs/superpowers/specs/2026-05-11-interaction-borderline-perf-diag-design.md +++ b/docs/superpowers/specs/2026-05-11-interaction-borderline-perf-diag-design.md @@ -19,10 +19,10 @@ Output is a research memo + raw evidence — no code changes. If verdicts are "n PR #131 captured (n=3 medians, Chromium S2/hypothesis): -| Script | pretable `interaction_latency_ms` | tanstack `interaction_latency_ms` | -| --- | --- | --- | -| `filter-text` | **17.7 ms** | 40.2 ms | -| `filter-metadata` | 16.0 ms | **15.7 ms** | +| Script | pretable `interaction_latency_ms` | tanstack `interaction_latency_ms` | +| ----------------- | --------------------------------- | --------------------------------- | +| `filter-text` | **17.7 ms** | 40.2 ms | +| `filter-metadata` | 16.0 ms | **15.7 ms** | `filter-text` 17.7 ms is the only pretable script over the 16 ms single-frame budget in the entire PR #131 runset; the page prose acknowledges this as "fractionally over." `filter-metadata` 15.7 ms is the only place tanstack edges pretable. @@ -40,11 +40,11 @@ Both are within ±2 ms of budget at n=3, where p95 is essentially max-of-3 and a One PR off latest `main`. 
Three sequential phases (mirrors PR #124 + PR #133): -| Phase | Action | Output | -|---|---|---| -| A | n=20 matrix re-run for `pretable + tanstack` × S2/hypothesis × {`filter-metadata`, `filter-text`}. | `status/milestones/2026-05-11-interaction-borderline-high-repeat.json` with mean / σ / min / median / max per (adapter, script). | -| B | (Skipped — no traces. Borderlines don't have a perf cliff to find.) | n/a | -| C | Memo with per-slice verdicts. | `docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md` | +| Phase | Action | Output | +| ----- | -------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- | +| A | n=20 matrix re-run for `pretable + tanstack` × S2/hypothesis × {`filter-metadata`, `filter-text`}. | `status/milestones/2026-05-11-interaction-borderline-high-repeat.json` with mean / σ / min / median / max per (adapter, script). | +| B | (Skipped — no traces. Borderlines don't have a perf cliff to find.) | n/a | +| C | Memo with per-slice verdicts. | `docs/research/2026-05-11-interaction-borderline-perf-diagnostic.md` | ## Method details