Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,15 @@ class Sample
// This is useful when the Sample object is embedded and will be destroyed later
bool export_sample();

// Flip the sign on the heap_space value so this sample is emitted as a
// tombstone (negative delta) by a subsequent export_sample() call.
// NOT idempotent: each call toggles the sign, so calling twice returns the
// original positive value. Heap-tracker callers must apply this exactly
// once per REMOVE event (see the tombstone_applied guard in
// _memalloc_heap.cpp) so retries on transient libdatadog rejection emit a
// stable negative value instead of toggling sign on every attempt.
void negate_heap_space();

static ProfileBorrow profile_borrow();
static void postfork_child();
static void cleanup();
Expand Down
14 changes: 14 additions & 0 deletions ddtrace/internal/datadog/profiling/dd_wrapper/src/sample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,20 @@ Datadog::Sample::export_sample()
return ProfilerState::get().profile_state.collect(sample, endtime_ns);
}

void
Datadog::Sample::negate_heap_space()
{
// Flip the sign on heap_space so a subsequent export_sample() emits this
// sample as a tombstone. libdatadog accepts negative i64 values
// (sample.rs Value is &[i64] with no sign validation).
if (0U != (type_mask & SampleType::Heap)) {
const size_t heap_space_idx = ProfilerState::get().profile_state.val().heap_space;
if (heap_space_idx < values.size()) {
values[heap_space_idx] = -values[heap_space_idx];
}
}
}
Comment on lines +434 to +446

bool
Datadog::Sample::push_cputime(int64_t cputime, int64_t count)
{
Expand Down
186 changes: 179 additions & 7 deletions ddtrace/profiling/collector/_memalloc_heap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,35 @@ class heap_tracker_t
void postfork_child();

private:
/* Delta-export change event. ADD events reference the live traceback in
* allocs_m via raw pointer; REMOVE events own the extracted unique_ptr.
*
* The raw pointer in ADD is valid until export because the traceback is
* owned by either allocs_m or a subsequent REMOVE event in pending_changes
* (both are cleared together at export time). REMOVE events emit a
* negative tombstone — heap_space is negated lazily at emit time so an
* earlier ADD that aliases the same traceback (same ptr, same snapshot
* interval) reads the original positive value. `tombstone_applied` guards
* against re-negation when an emit fails and we retry. */
struct change_event
{
enum Kind : uint8_t
{
ADD,
REMOVE
};
void* ptr;
traceback_t* tb;
std::unique_ptr<traceback_t> owner; // set for REMOVE, null for ADD
Kind kind;
bool tombstone_applied = false; // REMOVE only; true after heap_space is negated
uint8_t failed_attempts = 0; // export retries counter; events exceeding MAX are dropped
};

/* Cap retries on a single event before dropping it, so a persistent
* libdatadog rejection cannot grow pending_changes without bound. */
static constexpr uint8_t MAX_EXPORT_RETRIES = 2;

uint32_t next_sample_size_no_cpython(uint32_t sample_size);

/* This function is called from heap_tracker_t::postfork_child() as part of
Expand Down Expand Up @@ -161,6 +190,31 @@ class heap_tracker_t

/* Initial capacity of the allocations map */
static constexpr size_t INITAL_ALLOC_MAP_CAPACITY = 512;

/* Delta export state. pending_changes accumulates ADD/REMOVE events between
* snapshots; export_heap_no_cpython drains it and emits the deltas.
*
* Pre-reserved at construction so steady-state push_back is allocation-free.
* On pathological bursts the vector may reallocate. The reallocation goes
* through ::operator new (system malloc), NOT PyMem_Malloc, so it does not
* reenter the Python allocator hooks where untrack_no_cpython runs from.
* (Reentry would also be caught by memalloc_reentrant_guard_t earlier in
* the call.) Dropping events at the push site instead would risk
* use-after-clear on a queued ADD whose traceback gets pool_put'd by a
* matching REMOVE that overflowed — letting the vector grow is the safe
* option. */
static constexpr size_t PENDING_CHANGES_RESERVE = 4096;
std::vector<change_event> pending_changes;

/* Pair-collapse side map: ptr -> index of its pending ADD event in
* pending_changes. When untrack arrives for a ptr that has a still-pending
* ADD (i.e., the allocation was tracked and freed within the same snapshot
* interval), both events are canceled before reaching libdatadog — the ADD
* is marked `collapsed`, no REMOVE is queued, and the traceback returns to
* the pool immediately. Saves ~2 Profile_add2 calls per churn pair, which
* dominates CPU on alloc-then-free hot loops (see PR #18125 review). */
static constexpr size_t PENDING_ADD_IDX_RESERVE = 1024;
HeapMapType<void*, size_t> pending_add_idx;
};

// Pool implementation
Expand Down Expand Up @@ -223,6 +277,8 @@ heap_tracker_t::heap_tracker_t(uint32_t sample_size_val)
pool.reserve(POOL_CAPACITY);
// Pre-allocate map capacity to avoid rehashing during ramp-up.
allocs_m.reserve(INITAL_ALLOC_MAP_CAPACITY);
pending_changes.reserve(PENDING_CHANGES_RESERVE);
pending_add_idx.reserve(PENDING_ADD_IDX_RESERVE);
}

void
Expand All @@ -231,9 +287,48 @@ heap_tracker_t::untrack_no_cpython(void* ptr)
memalloc_gil_debug_guard_t guard(gil_guard);

auto node = allocs_m.extract(ptr);
if (!node.empty()) {
pool_put_no_cpython(std::move(node.mapped()));
if (node.empty()) {
return;
}
auto owner = std::move(node.mapped());

/* Pair-collapse: if a still-pending ADD targets this same ptr, the
* allocation was tracked AND freed within this snapshot interval — both
* events would aggregate to a zero bucket on the backend anyway, so skip
* both and reclaim the traceback immediately. Eliminates the dominant
* libdatadog cost in alloc-then-free hot loops.
*
* Swap-and-pop keeps pending_changes compact instead of leaving the ADD
* marked dead in place: the last element moves into the freed slot, and
* if it's also a tracked ADD, its index entry in pending_add_idx is
* rewritten to point at the new position. Avoids paying the export-time
* iteration cost over a buffer that fills up with skipped entries under
* sustained alloc/free churn. */
auto it = pending_add_idx.find(ptr);
if (it != pending_add_idx.end()) {
const size_t add_idx = it->second;
const size_t last_idx = pending_changes.size() - 1;
if (add_idx != last_idx) {
pending_changes[add_idx] = std::move(pending_changes[last_idx]);
// The moved element kept its own pending_add_idx entry pointing at
// last_idx; rewrite it if the moved element was an indexed ADD.
if (pending_changes[add_idx].kind == change_event::ADD) {
pending_add_idx[pending_changes[add_idx].ptr] = add_idx;
}
}
pending_changes.pop_back();
pending_add_idx.erase(it);
pool_put_no_cpython(std::move(owner));
return;
}

/* No pending ADD — the allocation was tracked in an earlier snapshot
* interval and the backend already integrated its positive contribution.
* Queue a REMOVE so the next export emits a negative tombstone. heap_space
* negation is deferred to emit time so any same-snapshot raw-ptr aliasing
* sees the original positive value first. */
auto* raw = owner.get();
pending_changes.push_back({ ptr, raw, std::move(owner), change_event::REMOVE });
}

bool
Expand Down Expand Up @@ -266,11 +361,18 @@ heap_tracker_t::add_sample_no_cpython(void* ptr, std::unique_ptr<traceback_t> tb
memalloc_gil_debug_guard_t guard(gil_guard);

auto [it, inserted] = allocs_m.insert_or_assign(ptr, std::move(tb));
(void)it; // Unused, but needed for structured binding

/* This should always be a new insertion. If not, we failed to properly untrack a previous allocation. */
assert(inserted && "add_sample: found existing entry for key that should have been removed");

/* Record ADD event referencing the live traceback. The raw pointer remains
* valid until export: either the entry stays in allocs_m, or a subsequent
* untrack moves ownership into a REMOVE event in pending_changes — both
* are cleared in the same export pass. Index this event in pending_add_idx
* so a same-snapshot-interval free can find and collapse it. */
pending_changes.push_back({ ptr, it->second.get(), nullptr, change_event::ADD });
pending_add_idx[ptr] = pending_changes.size() - 1;

// Get ready for the next sample
reset_sampling_state_no_cpython();
}
Expand All @@ -280,10 +382,74 @@ heap_tracker_t::export_heap_no_cpython()
{
memalloc_gil_debug_guard_t guard(gil_guard);

/* Iterate over live samples and export them */
for (const auto& [ptr, tb] : allocs_m) {
(void)ptr; // Suppress unused variable warning
tb->sample.export_sample();
/* Delta emission: ADDs reference live tracebacks via raw pointer (kept
* alive by either allocs_m or a later REMOVE event in this buffer).
* REMOVEs own the extracted unique_ptr and emit a negative tombstone;
* heap_space is negated lazily here (guarded by tombstone_applied) so an
* earlier ADD aliasing the same traceback emits the original positive
* value before the REMOVE flips the sign, and so retries on libdatadog
* rejection don't toggle the sign each attempt.
*
* On libdatadog rejection, retain the event for retry on the next snapshot
* — without this, an ADD or REMOVE that Profile_add2 refuses is silently
* dropped, causing permanent drift in the backend's running heap state.
* Bounded by MAX_EXPORT_RETRIES so a persistent rejection cannot grow
* pending_changes without bound.
*
* No periodic resync is emitted today: a full-snapshot would not carry a
* wire-format marker telling the backend to reset accumulated state, so
* the backend would double-count instead of recovering from drift. If a
* delta upload is dropped, the backend's state diverges from live until
* the next profiler restart (which clears allocs_m and pending_changes).
* Adding a resync requires either backend-side reset signaling or
* emitting compensating negative+positive pairs for every live entry —
* both are out of scope for v1. */

/* In-place compaction with a write index. Moving into a fresh vector and
* swapping would discard the constructor-side reserve (PENDING_CHANGES_RESERVE)
* on every export, forcing the hook path to re-grow the buffer on the next
* churn burst. Keeping the same storage preserves capacity. */
size_t write_idx = 0;
for (size_t read_idx = 0; read_idx < pending_changes.size(); ++read_idx) {
auto& evt = pending_changes[read_idx];
if (evt.kind == change_event::REMOVE && !evt.tombstone_applied) {
evt.tb->sample.negate_heap_space();
evt.tombstone_applied = true;
}
if (evt.tb->sample.export_sample()) {
// Success: recycle the REMOVE traceback; ADDs leave their
// tracebacks in allocs_m where they were.
if (evt.kind == change_event::REMOVE) {
pool_put_no_cpython(std::move(evt.owner));
}
Comment on lines +422 to +424
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Keep REMOVE-owned traceback alive while ADD retries remain

When an ADD event fails export and is retained for retry, it only keeps a raw tb pointer. If the matching REMOVE event later succeeds in the same pass, this branch immediately returns evt.owner to the pool, which clears/reuses the same traceback_t. On the next snapshot, the retained ADD retry dereferences a stale/recycled pointer, causing corrupted samples or a use-after-free style crash. This occurs whenever Profile_add2 rejects the ADD but accepts the later REMOVE for the same allocation sequence.

Useful? React with 👍 / 👎.

continue;
}
// libdatadog rejected: retry next snapshot, up to MAX_EXPORT_RETRIES.
if (evt.failed_attempts >= MAX_EXPORT_RETRIES) {
if (evt.kind == change_event::REMOVE) {
pool_put_no_cpython(std::move(evt.owner));
}
continue;
}
++evt.failed_attempts;
if (write_idx != read_idx) {
pending_changes[write_idx] = std::move(evt);
}
++write_idx;
}
pending_changes.resize(write_idx);
/* Rebuild the side map for the compacted pending_changes vector. The old
* map's indices referenced positions before compaction; without rebuilding,
* retained ADDs would lose their pair-collapse capability on the next
* snapshot (a REMOVE arriving for one of these ptrs would queue a tombstone
* instead of canceling the still-unsuccessful ADD). The cost is O(write_idx),
* which is bounded — retained events only appear on libdatadog rejection
* and are capped at MAX_EXPORT_RETRIES per event. */
pending_add_idx.clear();
for (size_t i = 0; i < pending_changes.size(); ++i) {
if (pending_changes[i].kind == change_event::ADD) {
pending_add_idx[pending_changes[i].ptr] = i;
}
}

Datadog::Sample::profile_borrow().stats().set_heap_tracker_size(allocs_m.size());
Expand All @@ -310,6 +476,12 @@ heap_tracker_t::postfork_child()
// Profile::postfork_child()
pool.clear();

// Pending delta events may contain raw pointers into allocs_m and owned
// tracebacks; clear before dropping allocs_m so the raw pointers in any
// ADD events do not get stranded. Side map indices alias these events.
pending_changes.clear();
pending_add_idx.clear();

// Allocations map may contain data from the parent process, and also
// traceback_t objects may reference invalid Profile state.
allocs_m.clear();
Expand Down
Loading
Loading