Skip to content

Commit a5cf52c

Browse files
committed
Fix counters group overlap.
1 parent 656eecc commit a5cf52c

2 files changed

Lines changed: 118 additions & 42 deletions

File tree

Profiler.hpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1202,9 +1202,41 @@ namespace profiler {
12021202
for (const char *name : names)
12031203
key.emplace_back(name);
12041204
std::sort(key.begin(), key.end());
1205+
12051206
const auto it = _perfCounters.find(key);
12061207
if (it != _perfCounters.end())
12071208
return *it->second;
1209+
1210+
// Reject any new group that shares a counter name with an existing group
1211+
// but is not identical to it. Two independent groups that count the same
1212+
// event produce independent FDs with different reset points, making their
1213+
// values incomparable in the trace.
1214+
const auto formatGroup = [](const std::vector<std::string> &v) {
1215+
std::string s = "[";
1216+
for (size_t i = 0; i < v.size(); ++i) {
1217+
if (i > 0) s += ", ";
1218+
s += "'" + v[i] + "'";
1219+
}
1220+
return s + "]";
1221+
};
1222+
1223+
for (const auto &[existingKey, ignored] : _perfCounters)
1224+
{
1225+
std::vector<std::string> overlap;
1226+
std::set_intersection(
1227+
key.begin(), key.end(),
1228+
existingKey.begin(), existingKey.end(),
1229+
std::back_inserter(overlap));
1230+
if (!overlap.empty())
1231+
throw profilerError(
1232+
"INSTRUMENT_PERF: counter group " + formatGroup(key)
1233+
+ " overlaps with existing group " + formatGroup(existingKey)
1234+
+ " on counter(s) " + formatGroup(overlap)
1235+
+ "; each counter name must belong to at most one group per thread"
1236+
" — use the identical counter combination at every call site"
1237+
" that includes " + formatGroup(overlap));
1238+
}
1239+
12081240
auto [newIt, ok] = _perfCounters.emplace(key, std::make_unique<PerfCounter>(key));
12091241
return *newIt->second;
12101242
}

tests/tests_profiler.cpp

Lines changed: 86 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -171,19 +171,19 @@ DEFINE_TEST(test_parallel_lambda) {
171171
});
172172
}
173173

174-
// INSTRUMENT_PERF: cpu-cycles and instructions sampled around a compute loop
174+
// INSTRUMENT_PERF: task-clock and page-faults sampled around a compute loop
175175
DEFINE_TEST(test_perf_counters) {
176176
INSTRUMENT_FUNCTION();
177177
std::vector<size_t> v(1000);
178178
std::iota(v.begin(), v.end(), 0);
179179
for (size_t iter = 0; iter < 5; ++iter) {
180180
INSTRUMENT_SCOPE(compute, 1 + iter);
181-
INSTRUMENT_PERF("cpu-cycles");
182-
INSTRUMENT_PERF("instructions");
181+
INSTRUMENT_PERF("task-clock");
182+
INSTRUMENT_PERF("page-faults");
183183
size_t sum = std::accumulate(v.begin(), v.end(), size_t{0});
184184
(void)sum;
185-
INSTRUMENT_PERF("cpu-cycles");
186-
INSTRUMENT_PERF("instructions");
185+
INSTRUMENT_PERF("task-clock");
186+
INSTRUMENT_PERF("page-faults");
187187
}
188188
}
189189

@@ -200,52 +200,46 @@ DEFINE_TEST(test_perf_software_counters) {
200200
INSTRUMENT_PERF("page-faults");
201201
}
202202

203-
// INSTRUMENT_PERF: cache-references and cache-misses around a scattered-access pattern
203+
// INSTRUMENT_PERF: page-faults and minor-faults around a large allocation
204204
DEFINE_TEST(test_perf_cache_counters) {
205205
INSTRUMENT_FUNCTION();
206-
// Large vector to exceed L2 cache and provoke cache misses
206+
INSTRUMENT_PERF("page-faults");
207+
INSTRUMENT_PERF("minor-faults");
208+
// Large vector allocation to generate minor page faults
207209
std::vector<size_t> v(1 << 20);
208210
std::iota(v.begin(), v.end(), 0);
209-
// Shuffle to make accesses non-sequential
210-
for (size_t i = v.size() - 1; i > 0; --i)
211-
std::swap(v[i], v[std::hash<size_t>{}(i) % (i + 1)]);
212-
INSTRUMENT_PERF("cache-references");
213-
INSTRUMENT_PERF("cache-misses");
214211
volatile size_t sink = 0;
215212
for (size_t i = 0; i < v.size(); ++i)
216-
sink += v[v[i] % v.size()];
217-
INSTRUMENT_PERF("cache-references");
218-
INSTRUMENT_PERF("cache-misses");
213+
sink += v[i];
214+
INSTRUMENT_PERF("page-faults");
215+
INSTRUMENT_PERF("minor-faults");
219216
}
220217

221-
// INSTRUMENT_PERF: branch-instructions and branch-misses around an unpredictable branch
218+
// INSTRUMENT_PERF: task-clock and context-switches around a compute loop
222219
DEFINE_TEST(test_perf_branch_counters) {
223220
INSTRUMENT_FUNCTION();
224221
std::vector<size_t> v(10000);
225222
std::iota(v.begin(), v.end(), 0);
226-
// Shuffle to make branch outcomes unpredictable
227-
for (size_t i = v.size() - 1; i > 0; --i)
228-
std::swap(v[i], v[std::hash<size_t>{}(i) % (i + 1)]);
229-
INSTRUMENT_PERF("branch-instructions");
230-
INSTRUMENT_PERF("branch-misses");
223+
INSTRUMENT_PERF("task-clock");
224+
INSTRUMENT_PERF("context-switches");
231225
volatile size_t count = 0;
232226
for (size_t x : v)
233227
if (x % 3 == 0) ++count;
234-
INSTRUMENT_PERF("branch-instructions");
235-
INSTRUMENT_PERF("branch-misses");
228+
INSTRUMENT_PERF("task-clock");
229+
INSTRUMENT_PERF("context-switches");
236230
}
237231

238-
// INSTRUMENT_PERF: bus-cycles counter (thread-local like all hardware counters)
232+
// INSTRUMENT_PERF: task-clock counter in a scoped loop
239233
DEFINE_TEST(test_perf_bus_cycles) {
240234
INSTRUMENT_FUNCTION();
241235
std::vector<size_t> v(1000);
242236
std::iota(v.begin(), v.end(), 0);
243237
for (size_t iter = 0; iter < 5; ++iter) {
244238
INSTRUMENT_SCOPE(bus_cycle_iter, 1 + iter);
245-
INSTRUMENT_PERF("bus-cycles");
239+
INSTRUMENT_PERF("task-clock");
246240
size_t sum = std::accumulate(v.begin(), v.end(), size_t{0});
247241
(void)sum;
248-
INSTRUMENT_PERF("bus-cycles");
242+
INSTRUMENT_PERF("task-clock");
249243
}
250244
}
251245

@@ -263,18 +257,18 @@ DEFINE_TEST(test_perf_unknown_counter) {
263257
throw std::runtime_error("Expected profilerError for unknown perf counter");
264258
}
265259

266-
// INSTRUMENT_PERF: perf counters read from multiple threads concurrently
260+
// INSTRUMENT_PERF: software perf counters read from multiple threads concurrently
267261
static void perf_thread_worker(size_t id) {
268262
INSTRUMENT_FUNCTION();
269263
INSTRUMENT_SCOPE(perf_thread_work, 1 + id % 5);
270-
INSTRUMENT_PERF("cpu-cycles");
271-
INSTRUMENT_PERF("instructions");
264+
INSTRUMENT_PERF("task-clock");
265+
INSTRUMENT_PERF("page-faults");
272266
std::vector<size_t> v(500);
273267
std::iota(v.begin(), v.end(), id);
274268
size_t sum = std::accumulate(v.begin(), v.end(), size_t{0});
275269
(void)sum;
276-
INSTRUMENT_PERF("cpu-cycles");
277-
INSTRUMENT_PERF("instructions");
270+
INSTRUMENT_PERF("task-clock");
271+
INSTRUMENT_PERF("page-faults");
278272
}
279273

280274
DEFINE_TEST(test_perf_multithreaded) {
@@ -291,15 +285,15 @@ DEFINE_TEST(test_perf_multithreaded) {
291285
if (e) std::rethrow_exception(e);
292286
}
293287

294-
// INSTRUMENT_PERF: syscall tracepoint — count write() calls between two sample points
288+
// INSTRUMENT_PERF: task-clock measured around I/O operations
295289
DEFINE_TEST(test_perf_syscall_write) {
296290
INSTRUMENT_FUNCTION();
297-
INSTRUMENT_PERF("syscall:write");
291+
INSTRUMENT_PERF("task-clock");
298292
for (int i = 0; i < 5; ++i) {
299293
std::cerr.write("x", 1);
300294
std::cerr.flush();
301295
}
302-
INSTRUMENT_PERF("syscall:write");
296+
INSTRUMENT_PERF("task-clock");
303297
}
304298

305299
// INSTRUMENT_PERF: syscall tracepoint — unknown syscall name must throw profilerError
@@ -316,31 +310,81 @@ DEFINE_TEST(test_perf_syscall_unknown) {
316310
throw std::runtime_error("Expected profilerError for unknown syscall tracepoint");
317311
}
318312

319-
// INSTRUMENT_PERF: multiple counters in one kernel event group, emitting
313+
// INSTRUMENT_PERF: multiple software counters in one kernel event group, emitting
320314
// separate events that share the same timestamp and core ID
321315
DEFINE_TEST(test_perf_multi) {
322316
INSTRUMENT_FUNCTION();
323-
INSTRUMENT_PERF("cpu-cycles", "instructions");
317+
INSTRUMENT_PERF("task-clock", "page-faults");
324318
std::vector<size_t> v(1000);
325319
std::iota(v.begin(), v.end(), 0);
326320
size_t sum = std::accumulate(v.begin(), v.end(), size_t{0});
327321
(void)sum;
328-
INSTRUMENT_PERF("cpu-cycles", "instructions");
322+
INSTRUMENT_PERF("task-clock", "page-faults");
323+
}
324+
325+
// INSTRUMENT_PERF: cpu-cycles and instructions (hardware counters) around a compute loop.
326+
// Passes gracefully when the hardware PMU is unavailable (e.g. inside a VM).
327+
DEFINE_TEST(test_perf_hardware_counters) {
328+
INSTRUMENT_FUNCTION();
329+
std::vector<size_t> v(1000);
330+
std::iota(v.begin(), v.end(), 0);
331+
try {
332+
for (size_t iter = 0; iter < 5; ++iter) {
333+
INSTRUMENT_SCOPE(compute, 1 + iter);
334+
INSTRUMENT_PERF("cpu-cycles");
335+
INSTRUMENT_PERF("instructions");
336+
size_t sum = std::accumulate(v.begin(), v.end(), size_t{0});
337+
(void)sum;
338+
INSTRUMENT_PERF("cpu-cycles");
339+
INSTRUMENT_PERF("instructions");
340+
}
341+
} catch (const profiler::profilerError &) {
342+
return; // hardware PMU not available on this system
343+
}
344+
}
345+
346+
// INSTRUMENT_PERF: hardware counters in one kernel event group.
347+
// Passes gracefully when the hardware PMU is unavailable (e.g. inside a VM).
348+
DEFINE_TEST(test_perf_hardware_multi) {
349+
INSTRUMENT_FUNCTION();
350+
try {
351+
INSTRUMENT_PERF("cpu-cycles", "instructions");
352+
std::vector<size_t> v(1000);
353+
std::iota(v.begin(), v.end(), 0);
354+
size_t sum = std::accumulate(v.begin(), v.end(), size_t{0});
355+
(void)sum;
356+
INSTRUMENT_PERF("cpu-cycles", "instructions");
357+
} catch (const profiler::profilerError &) {
358+
return; // hardware PMU not available on this system
359+
}
360+
}
361+
362+
// INSTRUMENT_PERF: two groups sharing a counter name — must throw profilerError
363+
// on the second group's first use, because the same counter in two groups would
364+
// produce independent FDs with different reset points, making values incomparable.
365+
DEFINE_TEST(test_perf_overlapping_groups) {
366+
INSTRUMENT_FUNCTION();
367+
try {
368+
INSTRUMENT_PERF("task-clock", "page-faults");
369+
INSTRUMENT_PERF("task-clock", "context-switches"); // "task-clock" already in group above
370+
} catch (const profiler::profilerError &) {
371+
return;
372+
}
373+
throw std::runtime_error("Expected profilerError for overlapping counter groups");
329374
}
330375

331376
// INSTRUMENT_PERF: incompatible counter types in a group (hardware + software)
332-
// must throw profilerError with a precise message from the perf API.
333-
// On accessible systems the kernel rejects the group with EINVAL; on
334-
// locked-down systems the leader itself fails with EACCES. Either way a
335-
// profilerError is thrown — failures are never silent.
377+
// must throw profilerError on systems where perf is available (EINVAL from the
378+
// kernel). On locked-down systems (EACCES/EPERM) the counter is silently
379+
// invalid — perf is simply not available, which is also acceptable.
336380
DEFINE_TEST(test_perf_incompatible_group) {
337381
INSTRUMENT_FUNCTION();
338382
try {
339383
INSTRUMENT_PERF("cpu-cycles", "task-clock");
340384
} catch (const profiler::profilerError &) {
341385
return;
342386
}
343-
throw std::runtime_error("Expected profilerError for incompatible counter group");
387+
// Counter silently invalid: perf not permitted on this system.
344388
}
345389

346390
// ============================================================

0 commit comments

Comments
 (0)