Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 106 additions & 4 deletions nvbench/detail/measure_cold.cu
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ void measure_cold_base::generate_summaries()
summ.set_int64("value", m_total_samples);
}

// cpu time statistics
{
auto &summ = m_state.add_summary("nv/cold/time/cpu/min");
summ.set_string("name", "Min CPU Time");
Expand Down Expand Up @@ -238,6 +239,7 @@ void measure_cold_base::generate_summaries()
summ.set_string("description",
"Mean isolated kernel execution time "
"(measured on host CPU)");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cpu_mean);
}

Expand All @@ -248,7 +250,7 @@ void measure_cold_base::generate_summaries()
auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/absolute");
summ.set_string("name", "Noise");
summ.set_string("hint", "duration");
summ.set_string("description", "Standard deviation of isolated CPU times");
summ.set_string("description", "Standard deviation of isolated kernel execution CPU times");
summ.set_float64("value", cpu_stdev);
summ.set_string("hide", "Hidden by default.");
}
Expand All @@ -258,10 +260,60 @@ void measure_cold_base::generate_summaries()
auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative");
summ.set_string("name", "Noise");
summ.set_string("hint", "percentage");
summ.set_string("description", "Relative standard deviation of isolated CPU times");
summ.set_string("description",
"Relative standard deviation of isolated kernel execution CPU times");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cpu_noise);
}

const auto [cpu_time_first_quartile, cpu_time_median, cpu_time_third_quartile] =
nvbench::detail::statistics::compute_percentiles(m_cpu_times.cbegin(),
m_cpu_times.cend(),
{25, 50, 75});
{
auto &summ = m_state.add_summary("nv/cold/time/cpu/q1");
summ.set_string("name", "Q1");
summ.set_string("hint", "duration");
summ.set_string("description", "First quartile of isolated kernel execution CPU times");
summ.set_float64("value", cpu_time_first_quartile);
summ.set_string("hide", "Hidden by default.");
}
{
auto &summ = m_state.add_summary("nv/cold/time/cpu/median");
summ.set_string("name", "CPU Time");
summ.set_string("hint", "duration");
summ.set_string("description", "Median of isolated kernel execution CPU times");
summ.set_float64("value", cpu_time_median);
}
{
auto &summ = m_state.add_summary("nv/cold/time/cpu/q3");
summ.set_string("name", "Q3");
summ.set_string("hint", "duration");
summ.set_string("description", "Third quartile of isolated kernel execution CPU times");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cpu_time_third_quartile);
}
{
auto &summ = m_state.add_summary("nv/cold/time/cpu/ir/absolute");
summ.set_string("name", "IR");
summ.set_string("hint", "duration");
summ.set_string("description", "Interquartile range of isolated kernel execution CPU times");
summ.set_string("hide", "Hidden by default.");
const auto cpu_time_ir = cpu_time_third_quartile - cpu_time_first_quartile;
summ.set_float64("value", cpu_time_ir);
}
{
auto &summ = m_state.add_summary("nv/cold/time/cpu/ir/relative");
summ.set_string("name", "Noise");
summ.set_string("hint", "percentage");
summ.set_string("description",
"Relative interquartile range of isolated kernel execution CPU times");
const auto cpu_time_ir = cpu_time_third_quartile - cpu_time_first_quartile;
const auto cpu_robust_noise = cpu_time_ir / cpu_time_median;
summ.set_float64("value", cpu_robust_noise);
}
Comment on lines +306 to +314
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

important: Relative IR is computed as IQR / median at Line 312 and Line 417 with no zero/finite guard. For very short or quantized timings, median can be zero and produce inf/nan summaries. Guard both calculations and publish only finite values.

As per coding guidelines, "nvbench/**/*: Focus on benchmark correctness, ... measurement semantics, statistical summaries, and test coverage."

Also applies to: 411-419


// gpu time statistics
{
auto &summ = m_state.add_summary("nv/cold/time/gpu/min");
summ.set_string("name", "Min GPU Time");
Expand Down Expand Up @@ -292,6 +344,7 @@ void measure_cold_base::generate_summaries()
summ.set_string("description",
"Mean isolated kernel execution time "
"(measured with CUDA events)");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cuda_mean);
}

Expand All @@ -302,7 +355,7 @@ void measure_cold_base::generate_summaries()
auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/absolute");
summ.set_string("name", "Noise");
summ.set_string("hint", "duration");
summ.set_string("description", "Standard deviation of isolated GPU times");
summ.set_string("description", "Standard deviation of isolated kernel execution GPU times");
summ.set_float64("value", cuda_stdev);
summ.set_string("hide", "Hidden by default.");
}
Expand All @@ -312,10 +365,59 @@ void measure_cold_base::generate_summaries()
auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative");
summ.set_string("name", "Noise");
summ.set_string("hint", "percentage");
summ.set_string("description", "Relative standard deviation of isolated GPU times");
summ.set_string("description",
"Relative standard deviation of isolated kernel execution GPU times");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cuda_noise);
}

const auto [cuda_time_first_quartile, cuda_time_median, cuda_time_third_quartile] =
nvbench::detail::statistics::compute_percentiles(m_cuda_times.cbegin(),
m_cuda_times.cend(),
{25, 50, 75});
{
auto &summ = m_state.add_summary("nv/cold/time/gpu/q1");
summ.set_string("name", "Q1");
summ.set_string("hint", "duration");
summ.set_string("description", "First quartile of isolated kernel execution GPU times");
summ.set_float64("value", cuda_time_first_quartile);
summ.set_string("hide", "Hidden by default.");
}
{
auto &summ = m_state.add_summary("nv/cold/time/gpu/median");
summ.set_string("name", "GPU Time");
summ.set_string("hint", "duration");
summ.set_string("description", "Median of isolated kernel execution GPU times");
summ.set_float64("value", cuda_time_median);
}
{
auto &summ = m_state.add_summary("nv/cold/time/gpu/q3");
summ.set_string("name", "Q3");
summ.set_string("hint", "duration");
summ.set_string("description", "Third quartile of isolated kernel execution GPU times");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cuda_time_third_quartile);
}
{
auto &summ = m_state.add_summary("nv/cold/time/gpu/ir/absolute");
summ.set_string("name", "IR");
summ.set_string("hint", "duration");
summ.set_string("description", "Interquartile range of isolated kernel execution GPU times");
summ.set_string("hide", "Hidden by default.");
const auto cuda_time_ir = cuda_time_third_quartile - cuda_time_first_quartile;
summ.set_float64("value", cuda_time_ir);
}
{
auto &summ = m_state.add_summary("nv/cold/time/gpu/ir/relative");
summ.set_string("name", "Noise");
summ.set_string("hint", "percentage");
summ.set_string("description",
"Relative interquartile range of isolated kernel execution GPU times");
const auto cuda_time_ir = cuda_time_third_quartile - cuda_time_first_quartile;
const auto cuda_robust_noise = cuda_time_ir / cuda_time_median;
summ.set_float64("value", cuda_robust_noise);
}

if (const auto items = m_state.get_element_count(); items != 0)
{
auto &summ = m_state.add_summary("nv/cold/bw/item_rate");
Expand Down
50 changes: 50 additions & 0 deletions nvbench/detail/measure_cpu_only.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,56 @@ void measure_cpu_only_base::generate_summaries()
summ.set_float64("value", cpu_noise);
}

const auto [cpu_first_quartile, cpu_median, cpu_third_quartile] =
nvbench::detail::statistics::compute_percentiles(m_cpu_times.cbegin(),
m_cpu_times.cend(),
{25, 50, 75});
{
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/q1");
summ.set_string("name", "Q1");
summ.set_string("hint", "duration");
summ.set_string("description", "First quartile of CPU times of isolated kernel executions");
summ.set_float64("value", cpu_first_quartile);
summ.set_string("hide", "Hidden by default.");
}
{
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/median");
summ.set_string("name", "Median");
summ.set_string("hint", "duration");
summ.set_string("description", "Median of CPU times of isolated kernel executions");
summ.set_float64("value", cpu_median);
summ.set_string("hide", "Hidden by default.");
}
{
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/q3");
summ.set_string("name", "Q3");
summ.set_string("hint", "duration");
summ.set_string("description", "Third quartile of CPU times of isolated kernel executions");
summ.set_string("hide", "Hidden by default.");
summ.set_float64("value", cpu_third_quartile);
}
{
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/ir/absolute");
summ.set_string("name", "IR");
summ.set_string("hint", "duration");
summ.set_string("description",
"Interquartile range of CPU times of isolated kernel executions");
summ.set_string("hide", "Hidden by default.");
const auto cpu_ir = cpu_third_quartile - cpu_first_quartile;
summ.set_float64("value", cpu_ir);
}
{
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/ir/relative");
summ.set_string("name", "IR");
summ.set_string("hint", "percentage");
summ.set_string("description",
"Relative interquartile range of CPU times of isolated kernel executions");
summ.set_string("hide", "Hidden by default.");
const auto cpu_ir = cpu_third_quartile - cpu_first_quartile;
const auto cpu_robust_noise = cpu_ir / cpu_median;
summ.set_float64("value", cpu_robust_noise);
Comment on lines +215 to +223
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

important: The relative IR metric at Line 222 divides by cpu_median without a zero/finite guard. This can emit inf/nan in nv/cpu_only/time/cpu/ir/relative and break downstream consumers that expect finite numeric summaries. Guard cpu_median and only publish finite values.

As per coding guidelines, "nvbench/**/*: Focus on benchmark correctness, ... measurement semantics, statistical summaries, and test coverage."

}

if (const auto items = m_state.get_element_count(); items != 0)
{
auto &summ = m_state.add_summary("nv/cpu_only/bw/item_rate");
Expand Down
53 changes: 53 additions & 0 deletions nvbench/detail/statistics.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,15 @@
#include <nvbench/detail/transform_reduce.cuh>
#include <nvbench/types.cuh>

#include <algorithm>
#include <array>
#include <cmath>
#include <functional>
#include <iterator>
#include <limits>
#include <numeric>
#include <type_traits>
#include <vector>

#ifndef M_PI
#define M_PI 3.14159265358979323846
Expand Down Expand Up @@ -93,6 +96,56 @@ nvbench::float64_t compute_mean(It first, It last)
return std::accumulate(first, last, 0.0) / static_cast<nvbench::float64_t>(num);
}

/**
* Computes exact percentile values using rank round(p / 100 * (S - 1)).
*
* The input range is copied before sorting, so const iterators are supported.
* If the input is empty, all percentiles are returned as quiet NaN.
*/
Comment on lines +99 to +104
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

suggestion: Line 103 says empty input returns infinity, but Lines 115-116 return quiet_NaN(). Update the doc comment to match the implementation contract.

Also applies to: 115-116

/**
 * Computes nearest-rank percentiles of the samples in `[first, last)`.
 *
 * For each requested percentile `p` (clamped to [0, 100]) the result is the
 * element at index `round(p / 100 * (S - 1))` of the sorted samples, where `S`
 * is the number of samples. The rank is evaluated in exact integer arithmetic,
 * so a half-way rank always rounds up even when `p / 100` has no exact binary
 * floating-point representation (e.g. p = 35 with S = 91 yields rank 32).
 *
 * The input range is copied before sorting, so const iterators are supported
 * and the caller's data is not reordered.
 *
 * @param first       Iterator to the first sample.
 * @param last        Iterator one past the last sample.
 * @param percentiles Requested percentiles; values outside [0, 100] are clamped.
 * @return One value per requested percentile, in request order. If the range
 *         is empty, every entry is `std::numeric_limits<ValueType>::quiet_NaN()`
 *         (only a real NaN when `ValueType` is a floating-point type).
 */
template <typename Iter,
          std::size_t N,
          typename ValueType = typename std::iterator_traits<Iter>::value_type>
std::array<ValueType, N> compute_percentiles(Iter first, Iter last, std::array<int, N> percentiles)
{
  std::array<ValueType, N> result{};

  const auto num = std::distance(first, last);
  if (num < 1)
  {
    result.fill(std::numeric_limits<ValueType>::quiet_NaN());
    return result;
  }

  // Sort a private copy; the caller's range stays untouched.
  std::vector<ValueType> sorted(first, last);
  std::sort(sorted.begin(), sorted.end());

  const std::size_t max_rank = sorted.size() - 1;
  for (std::size_t i = 0; i < N; ++i)
  {
    const auto percentile = static_cast<std::size_t>(std::clamp(percentiles[i], 0, 100));

    // rank = floor(percentile * max_rank / 100 + 1/2), i.e. round-half-up of
    // the exact rational rank. max_rank is split into multiples of 100 plus a
    // remainder so the intermediate product cannot overflow.
    const std::size_t rank =
      (max_rank / 100) * percentile + ((max_rank % 100) * percentile + 50) / 100;

    result[i] = sorted[rank];
  }

  return result;
}

/**
 * Convenience overload accepting the percentiles as a built-in array, which
 * enables calls such as `compute_percentiles(first, last, {25, 50, 75})`.
 *
 * The values are copied into a `std::array<int, N>` and forwarded to the
 * `std::array` overload.
 */
template <typename Iter,
          std::size_t N,
          typename ValueType = typename std::iterator_traits<Iter>::value_type>
std::array<ValueType, N> compute_percentiles(Iter first, Iter last, const int (&percentiles)[N])
{
  std::array<int, N> requested{};
  for (std::size_t idx = 0; idx < N; ++idx)
  {
    requested[idx] = percentiles[idx];
  }
  return compute_percentiles(first, last, requested);
}

/**
* Computes linear regression and returns the slope and intercept
*
Expand Down
23 changes: 14 additions & 9 deletions nvbench/detail/stdrel_criterion.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,20 @@ void stdrel_criterion::do_add_measurement(nvbench::float64_t measurement)
m_total_cuda_time += measurement;
m_cuda_times.push_back(measurement);

// Compute convergence statistics using CUDA timings:
const auto mean_cuda_time = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
const auto cuda_stdev = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
m_cuda_times.cend(),
mean_cuda_time);
const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
if (std::isfinite(cuda_rel_stdev))
// require at least 5 samples for meaningful noise estimate
if (m_total_samples > 4)
{
m_noise_tracker.push_back(cuda_rel_stdev);
// Compute convergence statistics using CUDA timings:
const auto [cuda_first_quartile, cuda_median, cuda_third_quartile] =
nvbench::detail::statistics::compute_percentiles(m_cuda_times.cbegin(),
m_cuda_times.cend(),
{25, 50, 75});
const auto cuda_noise = (cuda_third_quartile - cuda_first_quartile) / cuda_median;

if (std::isfinite(cuda_noise))
{
m_noise_tracker.push_back(cuda_noise);
}
Comment on lines +44 to +57
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical | ⚡ Quick win

critical: With the new gate at Line 45, m_noise_tracker can stay empty while do_is_finished() still reaches m_noise_tracker.back() at Line 69 (e.g., min-time reached before 5 samples). This is undefined behavior and can crash. Add an explicit empty check before any back() access.

As per coding guidelines, "nvbench/**/*: Focus on benchmark correctness, CUDA stream/event ordering, synchronization behavior, error handling, ... measurement semantics, statistical summaries, and test coverage."

}
}

Expand All @@ -66,7 +71,7 @@ bool stdrel_criterion::do_is_finished()
return true;
}

// Check if the noise (cuda rel stdev) has converged by inspecting a
// Check if the noise has converged by inspecting a
// trailing window of recorded noise measurements.
// This helps identify benchmarks that are inherently noisy and would
// never converge to the target stdev threshold. This check ensures that the
Expand Down
Loading
Loading