diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index 76eb9415..da887254 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -207,6 +207,7 @@ void measure_cold_base::generate_summaries() summ.set_int64("value", m_total_samples); } + // cpu time statistics { auto &summ = m_state.add_summary("nv/cold/time/cpu/min"); summ.set_string("name", "Min CPU Time"); @@ -238,6 +239,7 @@ void measure_cold_base::generate_summaries() summ.set_string("description", "Mean isolated kernel execution time " "(measured on host CPU)"); + summ.set_string("hide", "Hidden by default."); summ.set_float64("value", cpu_mean); } @@ -248,7 +250,7 @@ void measure_cold_base::generate_summaries() auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/absolute"); summ.set_string("name", "Noise"); summ.set_string("hint", "duration"); - summ.set_string("description", "Standard deviation of isolated CPU times"); + summ.set_string("description", "Standard deviation of isolated kernel execution CPU times"); summ.set_float64("value", cpu_stdev); summ.set_string("hide", "Hidden by default."); } @@ -258,10 +260,60 @@ void measure_cold_base::generate_summaries() auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative"); summ.set_string("name", "Noise"); summ.set_string("hint", "percentage"); - summ.set_string("description", "Relative standard deviation of isolated CPU times"); + summ.set_string("description", + "Relative standard deviation of isolated kernel execution CPU times"); + summ.set_string("hide", "Hidden by default."); summ.set_float64("value", cpu_noise); } + const auto [cpu_time_first_quartile, cpu_time_median, cpu_time_third_quartile] = + nvbench::detail::statistics::compute_percentiles(m_cpu_times.cbegin(), + m_cpu_times.cend(), + {25, 50, 75}); + { + auto &summ = m_state.add_summary("nv/cold/time/cpu/q1"); + summ.set_string("name", "Q1"); + summ.set_string("hint", "duration"); + summ.set_string("description", "First 
quartile of isolated kernel execution CPU times"); + summ.set_float64("value", cpu_time_first_quartile); + summ.set_string("hide", "Hidden by default."); + } + { + auto &summ = m_state.add_summary("nv/cold/time/cpu/median"); + summ.set_string("name", "CPU Time"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Median of isolated kernel execution CPU times"); + summ.set_float64("value", cpu_time_median); + } + { + auto &summ = m_state.add_summary("nv/cold/time/cpu/q3"); + summ.set_string("name", "Q3"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Third quartile of isolated kernel execution CPU times"); + summ.set_string("hide", "Hidden by default."); + summ.set_float64("value", cpu_time_third_quartile); + } + { + auto &summ = m_state.add_summary("nv/cold/time/cpu/ir/absolute"); + summ.set_string("name", "IR"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Interquartile range of isolated kernel execution CPU times"); + summ.set_string("hide", "Hidden by default."); + const auto cpu_time_ir = cpu_time_third_quartile - cpu_time_first_quartile; + summ.set_float64("value", cpu_time_ir); + } + { + auto &summ = m_state.add_summary("nv/cold/time/cpu/ir/relative"); + summ.set_string("name", "Noise"); + summ.set_string("hint", "percentage"); + summ.set_string("description", + "Relative interquartile range of isolated kernel execution CPU times"); + const auto cpu_time_ir = cpu_time_third_quartile - cpu_time_first_quartile; + const auto cpu_robust_noise = cpu_time_ir / cpu_time_median; + summ.set_float64("value", cpu_robust_noise); + } + + // gpu time statistics { auto &summ = m_state.add_summary("nv/cold/time/gpu/min"); summ.set_string("name", "Min GPU Time"); @@ -292,6 +344,7 @@ void measure_cold_base::generate_summaries() summ.set_string("description", "Mean isolated kernel execution time " "(measured with CUDA events)"); + summ.set_string("hide", "Hidden by default."); summ.set_float64("value", 
cuda_mean); } @@ -302,7 +355,7 @@ void measure_cold_base::generate_summaries() auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/absolute"); summ.set_string("name", "Noise"); summ.set_string("hint", "duration"); - summ.set_string("description", "Standard deviation of isolated GPU times"); + summ.set_string("description", "Standard deviation of isolated kernel execution GPU times"); summ.set_float64("value", cuda_stdev); summ.set_string("hide", "Hidden by default."); } @@ -312,10 +365,59 @@ void measure_cold_base::generate_summaries() auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative"); summ.set_string("name", "Noise"); summ.set_string("hint", "percentage"); - summ.set_string("description", "Relative standard deviation of isolated GPU times"); + summ.set_string("description", + "Relative standard deviation of isolated kernel execution GPU times"); + summ.set_string("hide", "Hidden by default."); summ.set_float64("value", cuda_noise); } + const auto [cuda_time_first_quartile, cuda_time_median, cuda_time_third_quartile] = + nvbench::detail::statistics::compute_percentiles(m_cuda_times.cbegin(), + m_cuda_times.cend(), + {25, 50, 75}); + { + auto &summ = m_state.add_summary("nv/cold/time/gpu/q1"); + summ.set_string("name", "Q1"); + summ.set_string("hint", "duration"); + summ.set_string("description", "First quartile of isolated kernel execution GPU times"); + summ.set_float64("value", cuda_time_first_quartile); + summ.set_string("hide", "Hidden by default."); + } + { + auto &summ = m_state.add_summary("nv/cold/time/gpu/median"); + summ.set_string("name", "GPU Time"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Median of isolated kernel execution GPU times"); + summ.set_float64("value", cuda_time_median); + } + { + auto &summ = m_state.add_summary("nv/cold/time/gpu/q3"); + summ.set_string("name", "Q3"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Third quartile of isolated kernel execution 
GPU times"); + summ.set_string("hide", "Hidden by default."); + summ.set_float64("value", cuda_time_third_quartile); + } + { + auto &summ = m_state.add_summary("nv/cold/time/gpu/ir/absolute"); + summ.set_string("name", "IR"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Interquartile range of isolated kernel execution GPU times"); + summ.set_string("hide", "Hidden by default."); + const auto cuda_time_ir = cuda_time_third_quartile - cuda_time_first_quartile; + summ.set_float64("value", cuda_time_ir); + } + { + auto &summ = m_state.add_summary("nv/cold/time/gpu/ir/relative"); + summ.set_string("name", "Noise"); + summ.set_string("hint", "percentage"); + summ.set_string("description", + "Relative interquartile range of isolated kernel execution GPU times"); + const auto cuda_time_ir = cuda_time_third_quartile - cuda_time_first_quartile; + const auto cuda_robust_noise = cuda_time_ir / cuda_time_median; + summ.set_float64("value", cuda_robust_noise); + } + if (const auto items = m_state.get_element_count(); items != 0) { auto &summ = m_state.add_summary("nv/cold/bw/item_rate"); diff --git a/nvbench/detail/measure_cpu_only.cxx b/nvbench/detail/measure_cpu_only.cxx index bacde6f0..8a3977ff 100644 --- a/nvbench/detail/measure_cpu_only.cxx +++ b/nvbench/detail/measure_cpu_only.cxx @@ -173,6 +173,56 @@ void measure_cpu_only_base::generate_summaries() summ.set_float64("value", cpu_noise); } + const auto [cpu_first_quartile, cpu_median, cpu_third_quartile] = + nvbench::detail::statistics::compute_percentiles(m_cpu_times.cbegin(), + m_cpu_times.cend(), + {25, 50, 75}); + { + auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/q1"); + summ.set_string("name", "Q1"); + summ.set_string("hint", "duration"); + summ.set_string("description", "First quartile of CPU times of isolated kernel executions"); + summ.set_float64("value", cpu_first_quartile); + summ.set_string("hide", "Hidden by default."); + } + { + auto &summ = 
m_state.add_summary("nv/cpu_only/time/cpu/median"); + summ.set_string("name", "Median"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Median of CPU times of isolated kernel executions"); + summ.set_float64("value", cpu_median); + summ.set_string("hide", "Hidden by default."); + } + { + auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/q3"); + summ.set_string("name", "Q3"); + summ.set_string("hint", "duration"); + summ.set_string("description", "Third quartile of CPU times of isolated kernel executions"); + summ.set_string("hide", "Hidden by default."); + summ.set_float64("value", cpu_third_quartile); + } + { + auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/ir/absolute"); + summ.set_string("name", "IR"); + summ.set_string("hint", "duration"); + summ.set_string("description", + "Interquartile range of CPU times of isolated kernel executions"); + summ.set_string("hide", "Hidden by default."); + const auto cpu_ir = cpu_third_quartile - cpu_first_quartile; + summ.set_float64("value", cpu_ir); + } + { + auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/ir/relative"); + summ.set_string("name", "IR"); + summ.set_string("hint", "percentage"); + summ.set_string("description", + "Relative interquartile range of CPU times of isolated kernel executions"); + summ.set_string("hide", "Hidden by default."); + const auto cpu_ir = cpu_third_quartile - cpu_first_quartile; + const auto cpu_robust_noise = cpu_ir / cpu_median; + summ.set_float64("value", cpu_robust_noise); + } + if (const auto items = m_state.get_element_count(); items != 0) { auto &summ = m_state.add_summary("nv/cpu_only/bw/item_rate"); diff --git a/nvbench/detail/statistics.cuh b/nvbench/detail/statistics.cuh index 225403bb..93920a82 100644 --- a/nvbench/detail/statistics.cuh +++ b/nvbench/detail/statistics.cuh @@ -31,12 +31,15 @@ #include #include +#include +#include #include #include #include #include #include #include +#include #ifndef M_PI #define M_PI 
3.14159265358979323846 @@ -93,6 +96,56 @@ nvbench::float64_t compute_mean(It first, It last) return std::accumulate(first, last, 0.0) / static_cast(num); } +/** + * Computes exact percentile values using rank round(p / 100 * (S - 1)). + * + * The input range is copied before sorting, so const iterators are supported. + * If the input range is empty, all percentiles are returned as quiet NaN. + */ +template ::value_type> +std::array compute_percentiles(Iter first, Iter last, std::array percentiles) +{ + std::array result{}; + + const auto num = std::distance(first, last); + if (num < 1) + { + result.fill(std::numeric_limits::quiet_NaN()); + return result; + } + + std::vector sorted(first, last); + std::sort(sorted.begin(), sorted.end()); + + const auto max_rank = static_cast(sorted.size() - 1); + for (std::size_t i = 0; i < N; ++i) + { + const auto clamped_percentile = std::clamp(percentiles[i], 0, 100); + + const auto quantile = static_cast(clamped_percentile) / 100.0; + const auto rank = static_cast(std::round(quantile * max_rank)); + + result[i] = sorted[rank]; + } + + return result; +} + +/** + * Overload that supports calls like `compute_percentiles(first, last, {25, 50, 75})`. 
+ */ +template ::value_type> +std::array compute_percentiles(Iter first, Iter last, const int (&percentiles)[N]) +{ + std::array percentile_array{}; + std::copy(std::begin(percentiles), std::end(percentiles), percentile_array.begin()); + return compute_percentiles(first, last, percentile_array); +} + /** * Computes linear regression and returns the slope and intercept * diff --git a/nvbench/detail/stdrel_criterion.cxx b/nvbench/detail/stdrel_criterion.cxx index 8960818d..4365cd47 100644 --- a/nvbench/detail/stdrel_criterion.cxx +++ b/nvbench/detail/stdrel_criterion.cxx @@ -41,15 +41,20 @@ void stdrel_criterion::do_add_measurement(nvbench::float64_t measurement) m_total_cuda_time += measurement; m_cuda_times.push_back(measurement); - // Compute convergence statistics using CUDA timings: - const auto mean_cuda_time = m_total_cuda_time / static_cast(m_total_samples); - const auto cuda_stdev = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(), - m_cuda_times.cend(), - mean_cuda_time); - const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time; - if (std::isfinite(cuda_rel_stdev)) + // require at least 5 samples for meaningful noise estimate + if (m_total_samples > 4) { - m_noise_tracker.push_back(cuda_rel_stdev); + // Compute convergence statistics using CUDA timings: + const auto [cuda_first_quartile, cuda_median, cuda_third_quartile] = + nvbench::detail::statistics::compute_percentiles(m_cuda_times.cbegin(), + m_cuda_times.cend(), + {25, 50, 75}); + const auto cuda_noise = (cuda_third_quartile - cuda_first_quartile) / cuda_median; + + if (std::isfinite(cuda_noise)) + { + m_noise_tracker.push_back(cuda_noise); + } } } @@ -66,7 +71,7 @@ bool stdrel_criterion::do_is_finished() return true; } - // Check if the noise (cuda rel stdev) has converged by inspecting a + // Check if the noise has converged by inspecting a // trailing window of recorded noise measurements. 
// This helps identify benchmarks that are inherently noisy and would // never converge to the target stdev threshold. This check ensures that the diff --git a/python/scripts/nvbench_compare.py b/python/scripts/nvbench_compare.py index c6370332..118e2128 100644 --- a/python/scripts/nvbench_compare.py +++ b/python/scripts/nvbench_compare.py @@ -5,6 +5,7 @@ import os import sys from enum import StrEnum +from itertools import islice import jsondiff import tabulate @@ -347,11 +348,18 @@ def compare_benches( for cmp_device_id in cmp_device_ids: rows = [] plot_data = {"cmp": {}, "ref": {}, "cmp_noise": {}, "ref_noise": {}} + counters = {} for cmp_state in cmp_states: cmp_state_name = cmp_state["name"] + counters[cmp_state_name] = counters.get(cmp_state_name, 0) + 1 ref_state = next( - filter(lambda st: st["name"] == cmp_state_name, ref_states), None + islice( + filter(lambda st: st["name"] == cmp_state_name, ref_states), + counters[cmp_state_name] - 1, + None, + ), + None, ) if not ref_state: continue @@ -377,16 +385,16 @@ def lookup_summary(summaries, tag): return next(filter(lambda s: s["tag"] == tag, summaries), None) cmp_time_summary = lookup_summary( - cmp_summaries, "nv/cold/time/gpu/mean" + cmp_summaries, "nv/cold/time/gpu/median" ) ref_time_summary = lookup_summary( - ref_summaries, "nv/cold/time/gpu/mean" + ref_summaries, "nv/cold/time/gpu/median" ) cmp_noise_summary = lookup_summary( - cmp_summaries, "nv/cold/time/gpu/stdev/relative" + cmp_summaries, "nv/cold/time/gpu/ir/relative" ) ref_noise_summary = lookup_summary( - ref_summaries, "nv/cold/time/gpu/stdev/relative" + ref_summaries, "nv/cold/time/gpu/ir/relative" ) # TODO: Use other timings, too. 
Maybe multiple rows, with a @@ -424,15 +432,15 @@ def extract_value(summary): if ref_noise and cmp_noise: ref_noise = float(ref_noise) cmp_noise = float(cmp_noise) - min_noise = min(ref_noise, cmp_noise) + max_noise = max(ref_noise, cmp_noise) elif ref_noise: ref_noise = float(ref_noise) - min_noise = ref_noise + max_noise = ref_noise elif cmp_noise: cmp_noise = float(cmp_noise) - min_noise = cmp_noise + max_noise = cmp_noise else: - min_noise = None # Noise is inf + max_noise = None # Noise is inf if plot_along: axis_name = [] @@ -461,11 +469,11 @@ def extract_value(summary): global failure_count config_count += 1 - if not min_noise: + if max_noise is None: unknown_count += 1 status_label = "????" status = colorize(status_label, Fore.YELLOW, Emoji.YELLOW, no_color) - elif abs(frac_diff) <= min_noise: + elif abs(frac_diff) <= max_noise: pass_count += 1 status_label = "SAME" status = colorize(status_label, Fore.BLUE, Emoji.BLUE, no_color) @@ -695,9 +703,9 @@ def main(): print("# Summary\n") print("- Total Matches: %d" % config_count) - print(" - Pass (diff <= min_noise): %d" % pass_count) + print(" - Pass (diff <= max_noise): %d" % pass_count) print(" - Unknown (infinite noise): %d" % unknown_count) - print(" - Failure (diff > min_noise): %d" % failure_count) + print(" - Failure (diff > max_noise): %d" % failure_count) return failure_count diff --git a/testing/statistics.cu b/testing/statistics.cu index db4a40db..ed49a5ba 100644 --- a/testing/statistics.cu +++ b/testing/statistics.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include "test_asserts.cuh" @@ -52,6 +53,50 @@ void test_std() ASSERT(std::abs(actual - expected) < 0.001); } +void test_percentiles() +{ + { + const std::vector data{40.0, 10.0, 30.0, 20.0}; + const auto actual = statistics::compute_percentiles(data.cbegin(), + data.cend(), + std::array{0, 25, 50, 75, 100}); + const std::array expected{10.0, 20.0, 30.0, 30.0, 40.0}; + ASSERT(actual == expected); + } + + { + const std::vector 
data{42.0}; + const auto actual = + statistics::compute_percentiles(data.cbegin(), data.cend(), std::array{25, 50, 75}); + const std::array expected{42.0, 42.0, 42.0}; + ASSERT(actual == expected); + } + + { + const std::vector data{40.0, 10.0, 30.0, 20.0}; + const auto actual = statistics::compute_percentiles(data.cbegin(), data.cend(), {25, 50, 75}); + const std::array expected{20.0, 30.0, 30.0}; + ASSERT(actual == expected); + } + + { + const std::vector data{10.0, 20.0, 30.0, 40.0}; + const auto actual = + statistics::compute_percentiles(data.cbegin(), data.cend(), std::array{-25, 125}); + const std::array expected{10.0, 40.0}; + ASSERT(actual == expected); + } + + { + const std::vector data; + const auto actual = + statistics::compute_percentiles(data.cbegin(), data.cend(), std::array{25, 50, 75}); + ASSERT(!std::isfinite(actual[0])); + ASSERT(!std::isfinite(actual[1])); + ASSERT(!std::isfinite(actual[2])); + } +} + void test_lin_regression() { { @@ -126,6 +171,7 @@ int main() { test_mean(); test_std(); + test_percentiles(); test_lin_regression(); test_r2(); test_slope_conversion();