From ea353edf003e6f22b3acb0020dd29c00c22d5de1 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Fri, 1 May 2026 15:23:26 -0500
Subject: [PATCH 1/4] Add statistics::compute_percentiles, use it in summaries
 of measure_cold

Percentiles on empty dataset are NaN, not infinity

Add Robust statistics of CPU times to summary

Fixed name for nv/cold/time/gpu/q3, corrected value reported for
nv/cold/time/gpu/ir/relative

Use median and IR to compute location and noise in measure_cold

Also in stdrel_criterion, compute noise as IR / median.
---
 nvbench/detail/measure_cold.cu      | 110 +++++++++++++++++++++++++++-
 nvbench/detail/measure_cpu_only.cxx |  50 +++++++++++++
 nvbench/detail/statistics.cuh       |  53 ++++++++++++++
 nvbench/detail/stdrel_criterion.cxx |  17 +++--
 testing/statistics.cu               |  46 ++++++++++++
 5 files changed, 264 insertions(+), 12 deletions(-)

diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index 76eb9415..da887254 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -207,6 +207,7 @@ void measure_cold_base::generate_summaries()
     summ.set_int64("value", m_total_samples);
   }
 
+  // cpu time statistics
   {
     auto &summ = m_state.add_summary("nv/cold/time/cpu/min");
     summ.set_string("name", "Min CPU Time");
@@ -238,6 +239,7 @@ void measure_cold_base::generate_summaries()
     summ.set_string("description",
                     "Mean isolated kernel execution time "
                     "(measured on host CPU)");
+    summ.set_string("hide", "Hidden by default.");
     summ.set_float64("value", cpu_mean);
   }
 
@@ -248,7 +250,7 @@ void measure_cold_base::generate_summaries()
     auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/absolute");
     summ.set_string("name", "Noise");
     summ.set_string("hint", "duration");
-    summ.set_string("description", "Standard deviation of isolated CPU times");
+    summ.set_string("description", "Standard deviation of isolated kernel execution CPU times");
     summ.set_float64("value", cpu_stdev);
     summ.set_string("hide", "Hidden by default.");
   }
@@ -258,10 +260,60 @@ void measure_cold_base::generate_summaries()
     auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative");
     summ.set_string("name", "Noise");
     summ.set_string("hint", "percentage");
-    summ.set_string("description", "Relative standard deviation of isolated CPU times");
+    summ.set_string("description",
+                    "Relative standard deviation of isolated kernel execution CPU times");
+    summ.set_string("hide", "Hidden by default.");
     summ.set_float64("value", cpu_noise);
   }
 
+  const auto [cpu_time_first_quartile, cpu_time_median, cpu_time_third_quartile] =
+    nvbench::detail::statistics::compute_percentiles(m_cpu_times.cbegin(),
+                                                     m_cpu_times.cend(),
+                                                     {25, 50, 75});
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/cpu/q1");
+    summ.set_string("name", "Q1");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "First quartile of isolated kernel execution CPU times");
+    summ.set_float64("value", cpu_time_first_quartile);
+    summ.set_string("hide", "Hidden by default.");
+  }
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/cpu/median");
+    summ.set_string("name", "CPU Time");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "Median of isolated kernel execution CPU times");
+    summ.set_float64("value", cpu_time_median);
+  }
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/cpu/q3");
+    summ.set_string("name", "Q3");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "Third quartile of isolated kernel execution CPU times");
+    summ.set_string("hide", "Hidden by default.");
+    summ.set_float64("value", cpu_time_third_quartile);
+  }
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/cpu/ir/absolute");
+    summ.set_string("name", "IR");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "Interquartile range of isolated kernel execution CPU times");
+    summ.set_string("hide", "Hidden by default.");
+    const auto cpu_time_ir = cpu_time_third_quartile - cpu_time_first_quartile;
+    summ.set_float64("value", cpu_time_ir);
+  }
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/cpu/ir/relative");
+    summ.set_string("name", "Noise");
+    summ.set_string("hint", "percentage");
+    summ.set_string("description",
+                    "Relative interquartile range of isolated kernel execution CPU times");
+    const auto cpu_time_ir      = cpu_time_third_quartile - cpu_time_first_quartile;
+    const auto cpu_robust_noise = cpu_time_ir / cpu_time_median;
+    summ.set_float64("value", cpu_robust_noise);
+  }
+
+  // gpu time statistics
   {
     auto &summ = m_state.add_summary("nv/cold/time/gpu/min");
     summ.set_string("name", "Min GPU Time");
@@ -292,6 +344,7 @@ void measure_cold_base::generate_summaries()
     summ.set_string("description",
                     "Mean isolated kernel execution time "
                     "(measured with CUDA events)");
+    summ.set_string("hide", "Hidden by default.");
     summ.set_float64("value", cuda_mean);
   }
 
@@ -302,7 +355,7 @@ void measure_cold_base::generate_summaries()
     auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/absolute");
     summ.set_string("name", "Noise");
     summ.set_string("hint", "duration");
-    summ.set_string("description", "Standard deviation of isolated GPU times");
+    summ.set_string("description", "Standard deviation of isolated kernel execution GPU times");
     summ.set_float64("value", cuda_stdev);
     summ.set_string("hide", "Hidden by default.");
   }
@@ -312,10 +365,59 @@ void measure_cold_base::generate_summaries()
     auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative");
     summ.set_string("name", "Noise");
     summ.set_string("hint", "percentage");
-    summ.set_string("description", "Relative standard deviation of isolated GPU times");
+    summ.set_string("description",
+                    "Relative standard deviation of isolated kernel execution GPU times");
+    summ.set_string("hide", "Hidden by default.");
     summ.set_float64("value", cuda_noise);
   }
 
+  const auto [cuda_time_first_quartile, cuda_time_median, cuda_time_third_quartile] =
+    nvbench::detail::statistics::compute_percentiles(m_cuda_times.cbegin(),
+                                                     m_cuda_times.cend(),
+                                                     {25, 50, 75});
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/gpu/q1");
+    summ.set_string("name", "Q1");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "First quartile of isolated kernel execution GPU times");
+    summ.set_float64("value", cuda_time_first_quartile);
+    summ.set_string("hide", "Hidden by default.");
+  }
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/gpu/median");
+    summ.set_string("name", "GPU Time");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "Median of isolated kernel execution GPU times");
+    summ.set_float64("value", cuda_time_median);
+  }
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/gpu/q3");
+    summ.set_string("name", "Q3");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "Third quartile of isolated kernel execution GPU times");
+    summ.set_string("hide", "Hidden by default.");
+    summ.set_float64("value", cuda_time_third_quartile);
+  }
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/gpu/ir/absolute");
+    summ.set_string("name", "IR");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "Interquartile range of isolated kernel execution GPU times");
+    summ.set_string("hide", "Hidden by default.");
+    const auto cuda_time_ir = cuda_time_third_quartile - cuda_time_first_quartile;
+    summ.set_float64("value", cuda_time_ir);
+  }
+  {
+    auto &summ = m_state.add_summary("nv/cold/time/gpu/ir/relative");
+    summ.set_string("name", "Noise");
+    summ.set_string("hint", "percentage");
+    summ.set_string("description",
+                    "Relative interquartile range of isolated kernel execution GPU times");
+    const auto cuda_time_ir      = cuda_time_third_quartile - cuda_time_first_quartile;
+    const auto cuda_robust_noise = cuda_time_ir / cuda_time_median;
+    summ.set_float64("value", cuda_robust_noise);
+  }
+
   if (const auto items = m_state.get_element_count(); items != 0)
   {
     auto &summ = m_state.add_summary("nv/cold/bw/item_rate");
diff --git a/nvbench/detail/measure_cpu_only.cxx b/nvbench/detail/measure_cpu_only.cxx
index bacde6f0..8a3977ff 100644
--- a/nvbench/detail/measure_cpu_only.cxx
+++ b/nvbench/detail/measure_cpu_only.cxx
@@ -173,6 +173,56 @@ void measure_cpu_only_base::generate_summaries()
     summ.set_float64("value", cpu_noise);
   }
 
+  const auto [cpu_first_quartile, cpu_median, cpu_third_quartile] =
+    nvbench::detail::statistics::compute_percentiles(m_cpu_times.cbegin(),
+                                                     m_cpu_times.cend(),
+                                                     {25, 50, 75});
+  {
+    auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/q1");
+    summ.set_string("name", "Q1");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "First quartile of CPU times of isolated kernel executions");
+    summ.set_float64("value", cpu_first_quartile);
+    summ.set_string("hide", "Hidden by default.");
+  }
+  {
+    auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/median");
+    summ.set_string("name", "Median");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "Median of CPU times of isolated kernel executions");
+    summ.set_float64("value", cpu_median);
+    summ.set_string("hide", "Hidden by default.");
+  }
+  {
+    auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/q3");
+    summ.set_string("name", "Q3");
+    summ.set_string("hint", "duration");
+    summ.set_string("description", "Third quartile of CPU times of isolated kernel executions");
+    summ.set_string("hide", "Hidden by default.");
+    summ.set_float64("value", cpu_third_quartile);
+  }
+  {
+    auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/ir/absolute");
+    summ.set_string("name", "IR");
+    summ.set_string("hint", "duration");
+    summ.set_string("description",
+                    "Interquartile range of CPU times of isolated kernel executions");
+    summ.set_string("hide", "Hidden by default.");
+    const auto cpu_ir = cpu_third_quartile - cpu_first_quartile;
+    summ.set_float64("value", cpu_ir);
+  }
+  {
+    auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/ir/relative");
+    summ.set_string("name", "IR");
+    summ.set_string("hint", "percentage");
+    summ.set_string("description",
+                    "Relative interquartile range of CPU times of isolated kernel executions");
+    summ.set_string("hide", "Hidden by default.");
+    const auto cpu_ir           = cpu_third_quartile - cpu_first_quartile;
+    const auto cpu_robust_noise = cpu_ir / cpu_median;
+    summ.set_float64("value", cpu_robust_noise);
+  }
+
   if (const auto items = m_state.get_element_count(); items != 0)
   {
     auto &summ = m_state.add_summary("nv/cpu_only/bw/item_rate");
diff --git a/nvbench/detail/statistics.cuh b/nvbench/detail/statistics.cuh
index 225403bb..93920a82 100644
--- a/nvbench/detail/statistics.cuh
+++ b/nvbench/detail/statistics.cuh
@@ -31,12 +31,15 @@
 #include <nvbench/detail/transform_reduce.cuh>
 #include <nvbench/types.cuh>
 
+#include <algorithm>
+#include <array>
 #include <cmath>
 #include <functional>
 #include <iterator>
 #include <limits>
 #include <numeric>
 #include <type_traits>
+#include <vector>
 
 #ifndef M_PI
 #define M_PI 3.14159265358979323846
@@ -93,6 +96,56 @@ nvbench::float64_t compute_mean(It first, It last)
   return std::accumulate(first, last, 0.0) / static_cast<nvbench::float64_t>(num);
 }
 
+/**
+ * Computes exact percentile values using rank round(p / 100 * (S - 1)).
+ *
+ * The input range is copied before sorting, so const iterators are supported.
+ * If the input is empty, all percentiles are returned as NaN (quiet_NaN of ValueType).
+ */
+template <typename Iter,
+          std::size_t N,
+          typename ValueType = typename std::iterator_traits<Iter>::value_type>
+std::array<ValueType, N> compute_percentiles(Iter first, Iter last, std::array<int, N> percentiles)
+{
+  std::array<ValueType, N> result{};
+
+  const auto num = std::distance(first, last);
+  if (num < 1)
+  {
+    result.fill(std::numeric_limits<ValueType>::quiet_NaN());
+    return result;
+  }
+
+  std::vector<ValueType> sorted(first, last);
+  std::sort(sorted.begin(), sorted.end());
+
+  const auto max_rank = static_cast<nvbench::float64_t>(sorted.size() - 1);
+  for (std::size_t i = 0; i < N; ++i)
+  {
+    const auto clamped_percentile = std::clamp(percentiles[i], 0, 100);
+
+    const auto quantile = static_cast<nvbench::float64_t>(clamped_percentile) / 100.0;
+    const auto rank     = static_cast<std::size_t>(std::round(quantile * max_rank));
+
+    result[i] = sorted[rank];
+  }
+
+  return result;
+}
+
+/**
+ * Overload that supports calls like `compute_percentiles(first, last, {25, 50, 75})`.
+ */
+template <typename Iter,
+          std::size_t N,
+          typename ValueType = typename std::iterator_traits<Iter>::value_type>
+std::array<ValueType, N> compute_percentiles(Iter first, Iter last, const int (&percentiles)[N])
+{
+  std::array<int, N> percentile_array{};
+  std::copy(std::begin(percentiles), std::end(percentiles), percentile_array.begin());
+  return compute_percentiles(first, last, percentile_array);
+}
+
 /**
  * Computes linear regression and returns the slope and intercept
  *
diff --git a/nvbench/detail/stdrel_criterion.cxx b/nvbench/detail/stdrel_criterion.cxx
index 8960818d..0c53c3e1 100644
--- a/nvbench/detail/stdrel_criterion.cxx
+++ b/nvbench/detail/stdrel_criterion.cxx
@@ -42,14 +42,15 @@ void stdrel_criterion::do_add_measurement(nvbench::float64_t measurement)
   m_cuda_times.push_back(measurement);
 
   // Compute convergence statistics using CUDA timings:
-  const auto mean_cuda_time = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
-  const auto cuda_stdev     = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
-                                                                              m_cuda_times.cend(),
-                                                                              mean_cuda_time);
-  const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
-  if (std::isfinite(cuda_rel_stdev))
+  const auto [cuda_first_quartile, cuda_median, cuda_third_quartile] =
+    nvbench::detail::statistics::compute_percentiles(m_cuda_times.cbegin(),
+                                                     m_cuda_times.cend(),
+                                                     {25, 50, 75});
+  const auto cuda_noise = (cuda_third_quartile - cuda_first_quartile) / cuda_median;
+
+  if (std::isfinite(cuda_noise))
   {
-    m_noise_tracker.push_back(cuda_rel_stdev);
+    m_noise_tracker.push_back(cuda_noise);
   }
 }
 
@@ -66,7 +67,7 @@ bool stdrel_criterion::do_is_finished()
     return true;
   }
 
-  // Check if the noise (cuda rel stdev) has converged by inspecting a
+  // Check if the noise has converged by inspecting a
   // trailing window of recorded noise measurements.
   // This helps identify benchmarks that are inherently noisy and would
   // never converge to the target stdev threshold. This check ensures that the
diff --git a/testing/statistics.cu b/testing/statistics.cu
index db4a40db..ed49a5ba 100644
--- a/testing/statistics.cu
+++ b/testing/statistics.cu
@@ -20,6 +20,7 @@
 #include <nvbench/types.cuh>
 
 #include <algorithm>
+#include <array>
 #include <vector>
 
 #include "test_asserts.cuh"
@@ -52,6 +53,50 @@ void test_std()
   ASSERT(std::abs(actual - expected) < 0.001);
 }
 
+void test_percentiles()
+{
+  {
+    const std::vector<nvbench::float64_t> data{40.0, 10.0, 30.0, 20.0};
+    const auto actual = statistics::compute_percentiles(data.cbegin(),
+                                                        data.cend(),
+                                                        std::array<int, 5>{0, 25, 50, 75, 100});
+    const std::array<nvbench::float64_t, 5> expected{10.0, 20.0, 30.0, 30.0, 40.0};
+    ASSERT(actual == expected);
+  }
+
+  {
+    const std::vector<nvbench::float64_t> data{42.0};
+    const auto actual =
+      statistics::compute_percentiles(data.cbegin(), data.cend(), std::array<int, 3>{25, 50, 75});
+    const std::array<nvbench::float64_t, 3> expected{42.0, 42.0, 42.0};
+    ASSERT(actual == expected);
+  }
+
+  {
+    const std::vector<nvbench::float64_t> data{40.0, 10.0, 30.0, 20.0};
+    const auto actual = statistics::compute_percentiles(data.cbegin(), data.cend(), {25, 50, 75});
+    const std::array<nvbench::float64_t, 3> expected{20.0, 30.0, 30.0};
+    ASSERT(actual == expected);
+  }
+
+  {
+    const std::vector<nvbench::float64_t> data{10.0, 20.0, 30.0, 40.0};
+    const auto actual =
+      statistics::compute_percentiles(data.cbegin(), data.cend(), std::array<int, 2>{-25, 125});
+    const std::array<nvbench::float64_t, 2> expected{10.0, 40.0};
+    ASSERT(actual == expected);
+  }
+
+  {
+    const std::vector<nvbench::float64_t> data;
+    const auto actual =
+      statistics::compute_percentiles(data.cbegin(), data.cend(), std::array<int, 3>{25, 50, 75});
+    ASSERT(!std::isfinite(actual[0]));
+    ASSERT(!std::isfinite(actual[1]));
+    ASSERT(!std::isfinite(actual[2]));
+  }
+}
+
 void test_lin_regression()
 {
   {
@@ -126,6 +171,7 @@ int main()
 {
   test_mean();
   test_std();
+  test_percentiles();
   test_lin_regression();
   test_r2();
   test_slope_conversion();

From 976c1125b29fe3a567e8d66f3942d15615f6c7ac Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Mon, 4 May 2026 15:58:42 -0500
Subject: [PATCH 2/4] Tweaks for nvbench_compare

1. For JSON files that contain repeated measurements of run-time
   axis values, make sure that the script compares corresponding
   reference entries.

   Previously, if cmp and ref each had two states with the same
   name, we would compare the measurements for each state in cmp
   against the first matching state in ref.

   This change introduces counters tracking how many times each state
   name has been seen in cmp, and uses them to retrieve the
   corresponding entry in ref.

Previously, I had

```

|  BlockSize  |  NumBlocks  |   Ref Time |   Ref Noise |   Cmp Time |   Cmp Noise |      Diff |   %Diff |  Status  |
|-------------|-------------|------------|-------------|------------|-------------|-----------|---------|----------|
|     2^8     |     64      |   1.776 ms |       0.46% |   1.777 ms |       0.40% |  1.024 us |   0.06% |   SAME   |
|     2^8     |     64      |   1.776 ms |       0.46% |   1.774 ms |       0.52% | -2.048 us |  -0.12% |   SAME   |
|     2^8     |     64      |   1.776 ms |       0.46% |   1.773 ms |       0.52% | -3.072 us |  -0.17% |   SAME   |
|     2^8     |     64      |   1.776 ms |       0.46% |   1.774 ms |       0.58% | -2.048 us |  -0.12% |   SAME   |
|     2^8     |     64      |   1.776 ms |       0.46% |   1.773 ms |       0.58% | -3.072 us |  -0.17% |   SAME   |
```

and now it becomes

```

|  BlockSize  |  NumBlocks  |   Ref Time |   Ref Noise |   Cmp Time |   Cmp Noise |      Diff |   %Diff |  Status  |
|-------------|-------------|------------|-------------|------------|-------------|-----------|---------|----------|
|     2^8     |     64      |   1.776 ms |       0.46% |   1.777 ms |       0.40% |  1.024 us |   0.06% |   SAME   |
|     2^8     |     64      |   1.773 ms |       0.64% |   1.774 ms |       0.52% |  1.024 us |   0.06% |   SAME   |
|     2^8     |     64      |   1.774 ms |       0.46% |   1.773 ms |       0.52% | -1.024 us |  -0.06% |   SAME   |
|     2^8     |     64      |   1.773 ms |       0.46% |   1.774 ms |       0.58% |  1.024 us |   0.06% |   SAME   |
|     2^8     |     64      |   1.774 ms |       0.52% |   1.773 ms |       0.58% | -1.024 us |  -0.06% |   SAME   |
```

This matches the raw data, as expected:

```
(py313) opavlyk@NV-22T4X34:~/repos/nvbench$ jq '. | .benchmarks[] | .states[] | .summaries[] | select(.tag == "nv/cold/time/gpu/median") | .data[] | .value' base.json
"0.0017756160497665405"
"0.0017725440263748169"
"0.001773568034172058"
"0.0017725440263748169"
"0.001773568034172058"

(py313) opavlyk@NV-22T4X34:~/repos/nvbench$ jq '. | .benchmarks[] | .states[] | .summaries[] | select(.tag == "nv/cold/time/gpu/median") | .data[] | .value' test.json
"0.0017766400575637818"
"0.001773568034172058"
"0.0017725440263748169"
"0.001773568034172058"
"0.0017725440263748169"
```

2. nvbench_compare changes from using min_noise = min(ref_noise, cmp_noise) to using max_noise = max(ref_noise, cmp_noise).
   Using the larger of the ref and cmp noise levels as the threshold against which to gauge the timing difference ratio makes more sense.
---
 python/scripts/nvbench_compare.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/python/scripts/nvbench_compare.py b/python/scripts/nvbench_compare.py
index c6370332..209e0d15 100644
--- a/python/scripts/nvbench_compare.py
+++ b/python/scripts/nvbench_compare.py
@@ -5,6 +5,7 @@
 import os
 import sys
 from enum import StrEnum
+from itertools import islice
 
 import jsondiff
 import tabulate
@@ -347,11 +348,18 @@ def compare_benches(
         for cmp_device_id in cmp_device_ids:
             rows = []
             plot_data = {"cmp": {}, "ref": {}, "cmp_noise": {}, "ref_noise": {}}
+            counters = {}
 
             for cmp_state in cmp_states:
                 cmp_state_name = cmp_state["name"]
+                counters[cmp_state_name] = counters.get(cmp_state_name, 0) + 1
                 ref_state = next(
-                    filter(lambda st: st["name"] == cmp_state_name, ref_states), None
+                    islice(
+                        filter(lambda st: st["name"] == cmp_state_name, ref_states),
+                        counters[cmp_state_name] - 1,
+                        None,
+                    ),
+                    None,
                 )
                 if not ref_state:
                     continue
@@ -424,15 +432,15 @@ def extract_value(summary):
                 if ref_noise and cmp_noise:
                     ref_noise = float(ref_noise)
                     cmp_noise = float(cmp_noise)
-                    min_noise = min(ref_noise, cmp_noise)
+                    max_noise = max(ref_noise, cmp_noise)
                 elif ref_noise:
                     ref_noise = float(ref_noise)
-                    min_noise = ref_noise
+                    max_noise = ref_noise
                 elif cmp_noise:
                     cmp_noise = float(cmp_noise)
-                    min_noise = cmp_noise
+                    max_noise = cmp_noise
                 else:
-                    min_noise = None  # Noise is inf
+                    max_noise = None  # Noise is inf
 
                 if plot_along:
                     axis_name = []
@@ -461,11 +469,11 @@ def extract_value(summary):
                 global failure_count
 
                 config_count += 1
-                if not min_noise:
+                if max_noise is None:
                     unknown_count += 1
                     status_label = "????"
                     status = colorize(status_label, Fore.YELLOW, Emoji.YELLOW, no_color)
-                elif abs(frac_diff) <= min_noise:
+                elif abs(frac_diff) <= max_noise:
                     pass_count += 1
                     status_label = "SAME"
                     status = colorize(status_label, Fore.BLUE, Emoji.BLUE, no_color)
@@ -695,9 +703,9 @@ def main():
 
     print("# Summary\n")
     print("- Total Matches: %d" % config_count)
-    print("  - Pass    (diff <= min_noise): %d" % pass_count)
+    print("  - Pass    (diff <= max_noise): %d" % pass_count)
     print("  - Unknown (infinite noise):    %d" % unknown_count)
-    print("  - Failure (diff > min_noise):  %d" % failure_count)
+    print("  - Failure (diff > max_noise):  %d" % failure_count)
     return failure_count
 
 

From 2333a2d82b9b302720317ae6344a8e0708237196 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Mon, 4 May 2026 16:11:22 -0500
Subject: [PATCH 3/4] Use median and IR/relative as cmp_time/ref_time and
 cmp_noise/ref_noise

These measures are less sensitive to outliers
---
 python/scripts/nvbench_compare.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/scripts/nvbench_compare.py b/python/scripts/nvbench_compare.py
index 209e0d15..118e2128 100644
--- a/python/scripts/nvbench_compare.py
+++ b/python/scripts/nvbench_compare.py
@@ -385,16 +385,16 @@ def lookup_summary(summaries, tag):
                     return next(filter(lambda s: s["tag"] == tag, summaries), None)
 
                 cmp_time_summary = lookup_summary(
-                    cmp_summaries, "nv/cold/time/gpu/mean"
+                    cmp_summaries, "nv/cold/time/gpu/median"
                 )
                 ref_time_summary = lookup_summary(
-                    ref_summaries, "nv/cold/time/gpu/mean"
+                    ref_summaries, "nv/cold/time/gpu/median"
                 )
                 cmp_noise_summary = lookup_summary(
-                    cmp_summaries, "nv/cold/time/gpu/stdev/relative"
+                    cmp_summaries, "nv/cold/time/gpu/ir/relative"
                 )
                 ref_noise_summary = lookup_summary(
-                    ref_summaries, "nv/cold/time/gpu/stdev/relative"
+                    ref_summaries, "nv/cold/time/gpu/ir/relative"
                 )
 
                 # TODO: Use other timings, too. Maybe multiple rows, with a

From 6e4507273fc67dd8d02ea219ff633e6af86a2c6b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Tue, 5 May 2026 07:44:25 -0500
Subject: [PATCH 4/4] Require at least 5 samples to begin estimating noise
 level

---
 nvbench/detail/stdrel_criterion.cxx | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/nvbench/detail/stdrel_criterion.cxx b/nvbench/detail/stdrel_criterion.cxx
index 0c53c3e1..4365cd47 100644
--- a/nvbench/detail/stdrel_criterion.cxx
+++ b/nvbench/detail/stdrel_criterion.cxx
@@ -41,16 +41,20 @@ void stdrel_criterion::do_add_measurement(nvbench::float64_t measurement)
   m_total_cuda_time += measurement;
   m_cuda_times.push_back(measurement);
 
-  // Compute convergence statistics using CUDA timings:
-  const auto [cuda_first_quartile, cuda_median, cuda_third_quartile] =
-    nvbench::detail::statistics::compute_percentiles(m_cuda_times.cbegin(),
-                                                     m_cuda_times.cend(),
-                                                     {25, 50, 75});
-  const auto cuda_noise = (cuda_third_quartile - cuda_first_quartile) / cuda_median;
-
-  if (std::isfinite(cuda_noise))
+  // Require at least 5 samples for a meaningful noise estimate.
+  if (m_total_samples > 4)
   {
-    m_noise_tracker.push_back(cuda_noise);
+    // Compute convergence statistics using CUDA timings:
+    const auto [cuda_first_quartile, cuda_median, cuda_third_quartile] =
+      nvbench::detail::statistics::compute_percentiles(m_cuda_times.cbegin(),
+                                                       m_cuda_times.cend(),
+                                                       {25, 50, 75});
+    const auto cuda_noise = (cuda_third_quartile - cuda_first_quartile) / cuda_median;
+
+    if (std::isfinite(cuda_noise))
+    {
+      m_noise_tracker.push_back(cuda_noise);
+    }
   }
 }